arrow/cpp/src/plasma/store.cc at master · OpenSourceJavaProject/arrow

History

1353 lines (1224 loc) · 52.9 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

// Licensed to the Apache Software Foundation (ASF) under one

// or more contributor license agreements. See the NOTICE file

// distributed with this work for additional information

// regarding copyright ownership. The ASF licenses this file

// to you under the Apache License, Version 2.0 (the

// "License"); you may not use this file except in compliance

// with the License. You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing,

// software distributed under the License is distributed on an

// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

// KIND, either express or implied. See the License for the

// specific language governing permissions and limitations

// under the License.

// PLASMA STORE: This is a simple object store server process

// It accepts incoming client connections on a unix domain socket

// (name passed in via the -s option of the executable) and uses a

// single thread to serve the clients. Each client establishes a

// connection and can create objects, wait for objects and seal

// objects through that connection.

// It keeps a hash table that maps object_ids (which are 20 byte long,

// just enough to store a SHA1 hash) to memory mapped files.

#include "plasma/store.h"

#include <assert.h>

#include <fcntl.h>

#include <getopt.h>

#include <limits.h>

#include <signal.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <sys/ioctl.h>

#include <sys/socket.h>

#include <sys/statvfs.h>

#include <sys/types.h>

#include <sys/un.h>

#include <unistd.h>

#include <ctime>

#include <deque>

#include <iostream>

#include <memory>

#include <sstream>

#include <string>

#include <unordered_map>

#include <unordered_set>

#include <utility>

#include <vector>

#include <gflags/gflags.h>

#include "arrow/status.h"

#include "arrow/util/config.h"

#include "plasma/common.h"

#include "plasma/common_generated.h"

#include "plasma/fling.h"

#include "plasma/io.h"

#include "plasma/malloc.h"

#include "plasma/plasma_allocator.h"

#include "plasma/protocol.h"

#ifdef PLASMA_CUDA

#include "arrow/gpu/cuda_api.h"

using arrow::cuda::CudaBuffer;

using arrow::cuda::CudaContext;

using arrow::cuda::CudaDeviceManager;

#endif

using arrow::util::ArrowLog;

using arrow::util::ArrowLogLevel;

namespace fb = plasma::flatbuf;

namespace plasma {

// Forward declaration only; the definition is not in the visible part of this
// file.
void SetMallocGranularity(int value);

/// Bookkeeping for one outstanding "get" call issued by a client. The store
/// allocates one of these per call, registers it for every object that is not
/// available yet, and deletes it once the reply has been sent.
struct GetRequest {
  GetRequest(Client* client, const std::vector<ObjectID>& object_ids);
  /// The client that called get.
  Client* client;
  /// The ID of the timer that will time out and cause this wait to return to
  /// the client if it hasn't already returned. The constructor sets this to
  /// -1, which the store treats as "no timer registered".
  int64_t timer;
  /// The object IDs involved in this request, in the order the client passed
  /// them (duplicates are kept). This is used in the reply.
  std::vector<ObjectID> object_ids;
  /// The object information for the objects in this request, keyed by object
  /// ID. This is used in the reply. Entries for objects that are not present
  /// carry a data_size of -1.
  std::unordered_map<ObjectID, PlasmaObject> objects;
  /// The minimum number of objects to wait for in this request. The
  /// constructor sets it to the number of distinct IDs in object_ids.
  int64_t num_objects_to_wait_for;
  /// The number of object requests in this wait request that are already
  /// satisfied.
  int64_t num_satisfied;
};

// Build a get request for `client` covering `object_ids`. No timer is attached
// yet (-1) and nothing is satisfied; the request waits for every distinct ID.
GetRequest::GetRequest(Client* client, const std::vector<ObjectID>& object_ids)
    : client(client),
      timer(-1),
      object_ids(object_ids),
      objects(object_ids.size()),
      num_satisfied(0) {
  // The caller may repeat an ID; each distinct object only has to arrive once.
  const std::unordered_set<ObjectID> distinct_ids(object_ids.begin(), object_ids.end());
  num_objects_to_wait_for = distinct_ids.size();
}

// A freshly connected client owns the socket `fd`; it has no notification
// socket until one is assigned, which is represented by -1.
Client::Client(int fd)
    : fd(fd),
      notification_fd(-1) {}

// Construct a store that serves clients from `loop`.
//
// \param loop Event loop the store registers its file events and timers on.
// \param directory Recorded in the store info as the backing directory.
// \param hugepages_enabled Recorded in the store info.
// \param socket_name Not used by this constructor.
// \param external_store Optional external store used when evicting objects;
//        may be null.
//
// `directory` and `external_store` are taken by value, so they are moved into
// place rather than copied a second time.
PlasmaStore::PlasmaStore(EventLoop* loop, std::string directory, bool hugepages_enabled,
                         const std::string& socket_name,
                         std::shared_ptr<ExternalStore> external_store)
    : loop_(loop),
      eviction_policy_(&store_info_, PlasmaAllocator::GetFootprintLimit()),
      external_store_(std::move(external_store)) {
  store_info_.directory = std::move(directory);
  store_info_.hugepages_enabled = hugepages_enabled;
}

// TODO(pcm): Get rid of this destructor by using RAII to clean up data.
// Nothing is released explicitly here; members clean up after themselves.
PlasmaStore::~PlasmaStore() = default;

// Expose a read-only pointer to the store's bookkeeping structure. The pointer
// refers to a member of this store.
const PlasmaStoreInfo* PlasmaStore::GetPlasmaStoreInfo() {
  const PlasmaStoreInfo* info = &store_info_;
  return info;
}

// Record that `client` is using `object_id`. Does nothing when the client is
// already registered as a user of the object; otherwise bumps the entry's
// reference count and remembers the ID on the client.
void PlasmaStore::AddToClientObjectIds(const ObjectID& object_id, ObjectTableEntry* entry,
                                       Client* client) {
  // Already tracked for this client: nothing to do.
  if (client->object_ids.count(object_id) != 0) {
    return;
  }
  // The first user of an object pins it, so the eviction policy has to be
  // told that the object is now being accessed.
  const bool first_user = (entry->ref_count == 0);
  if (first_user) {
    eviction_policy_.BeginObjectAccess(object_id);
  }
  entry->ref_count += 1;
  client->object_ids.insert(object_id);
}

// Allocate `size` bytes from the Plasma allocator, optionally evicting objects
// to make room.
//
// \param size Number of bytes to allocate.
// \param evict_if_full If true, enforce the per-client quota first and keep
//        evicting objects until the allocation succeeds or no more space can
//        be freed. If false, a single allocation attempt is made.
// \param fd, map_size, offset Out-parameters filled from GetMallocMapinfo when
//        the allocation succeeds; left untouched on failure.
// \param client The client on whose behalf the memory is allocated.
// \param is_create Forwarded to the per-client quota check.
// \return The allocated pointer, or nullptr if the quota check failed or not
//         enough memory could be obtained.
uint8_t* PlasmaStore::AllocateMemory(size_t size, bool evict_if_full, int* fd,
                                     int64_t* map_size, ptrdiff_t* offset, Client* client,
                                     bool is_create) {
  // First free up space from the client's LRU queue if quota enforcement is on.
  if (evict_if_full) {
    std::vector<ObjectID> client_objects_to_evict;
    bool quota_ok = eviction_policy_.EnforcePerClientQuota(client, size, is_create,
                                                           &client_objects_to_evict);
    if (!quota_ok) {
      return nullptr;
    }
    EvictObjects(client_objects_to_evict);
  }

  // Try to evict objects until there is enough space.
  uint8_t* pointer = nullptr;
  while (true) {
    // Allocate space for the new object. We use memalign instead of malloc
    // in order to align the allocated region to a 64-byte boundary. This is not
    // strictly necessary, but it is an optimization that could speed up the
    // computation of a hash of the data (see compute_object_hash_parallel in
    // plasma_client.cc). Note that even though this pointer is 64-byte aligned,
    // it is not guaranteed that the corresponding pointer in the client will be
    // 64-byte aligned, but in practice it often will be.
    pointer = reinterpret_cast<uint8_t*>(PlasmaAllocator::Memalign(kBlockSize, size));
    if (pointer || !evict_if_full) {
      // If we manage to allocate the memory, return the pointer. If we cannot
      // allocate the space, but we are also not allowed to evict anything to
      // make more space, return an error to the client.
      break;
    }
    // Tell the eviction policy how much space we need to create this object.
    std::vector<ObjectID> objects_to_evict;
    bool success = eviction_policy_.RequireSpace(size, &objects_to_evict);
    EvictObjects(objects_to_evict);
    // Return an error to the client if not enough space could be freed to
    // create the object.
    if (!success) {
      break;
    }
  }

  // Only a successful allocation has mapping information to report.
  if (pointer != nullptr) {
    GetMallocMapinfo(pointer, fd, map_size, offset);
    ARROW_CHECK(*fd != -1);
  }
  return pointer;
}

#ifdef PLASMA_CUDA

// Look up the CUDA context for a Plasma device number. Plasma device numbers
// are offset by one from CUDA device indices (the CPU path uses device_num 0),
// so `device_num` must be non-zero here.
arrow::Result<std::shared_ptr<CudaContext>> PlasmaStore::GetCudaContext(int device_num) {
  DCHECK_NE(device_num, 0);
  ARROW_ASSIGN_OR_RAISE(auto device_manager, CudaDeviceManager::Instance());
  const int cuda_device_index = device_num - 1;
  return device_manager->GetContext(cuda_device_index);
}

// Allocate `size` bytes on the CUDA device identified by `device_num`.
//
// On success, `*out_pointer` receives the device address of the buffer and
// `*out_ipc_handle` receives the handle exported for sharing with clients.
// Any failure from the context lookup, the allocation or the export is
// returned as a non-OK Status.
Status PlasmaStore::AllocateCudaMemory(
    int device_num, int64_t size, uint8_t** out_pointer,
    std::shared_ptr<CudaIpcMemHandle>* out_ipc_handle) {
  ARROW_ASSIGN_OR_RAISE(auto cuda_context, GetCudaContext(device_num));
  ARROW_ASSIGN_OR_RAISE(auto device_buffer, cuda_context->Allocate(size));
  *out_pointer = reinterpret_cast<uint8_t*>(device_buffer->address());
  // The IPC handle will keep the buffer memory alive
  auto exported_handle = device_buffer->ExportForIpc();
  return exported_handle.Value(out_ipc_handle);
}

// Release `size` bytes at `pointer` on the CUDA device identified by
// `device_num`. Returns the error from the context lookup or from the free
// call, or OK when both succeed.
Status PlasmaStore::FreeCudaMemory(int device_num, int64_t size, uint8_t* pointer) {
  ARROW_ASSIGN_OR_RAISE(auto cuda_context, GetCudaContext(device_num));
  const Status free_status = cuda_context->Free(pointer, size);
  if (!free_status.ok()) {
    return free_status;
  }
  return Status::OK();
}

#endif

// Create a new, unsealed object in the object table and register `client` as
// its first user.
//
// \param object_id ID of the object to create.
// \param evict_if_full Whether other objects may be evicted to make room
//        (host memory only).
// \param data_size, metadata_size Sizes of the two regions; they are allocated
//        contiguously, metadata right after data.
// \param device_num 0 for host memory, non-zero for a CUDA device.
// \param client The creating client.
// \param result Filled with the location of the new object on success.
// \return ObjectExists if the ID is already in the table, OutOfMemory if the
//         allocation failed (or a device was requested without CUDA support),
//         OK otherwise.
PlasmaError PlasmaStore::CreateObject(const ObjectID& object_id, bool evict_if_full,
                                      int64_t data_size, int64_t metadata_size,
                                      int device_num, Client* client,
                                      PlasmaObject* result) {
  ARROW_LOG(DEBUG) << "creating object " << object_id.hex();

  // An object with the same ID is already in the store: ignore this request.
  if (GetObjectTableEntry(&store_info_, object_id) != nullptr) {
    return PlasmaError::ObjectExists;
  }

  int fd = -1;
  int64_t map_size = 0;
  ptrdiff_t offset = 0;
  uint8_t* pointer = nullptr;
  const auto total_size = data_size + metadata_size;

  if (device_num != 0) {
#ifdef PLASMA_CUDA
    /// IPC GPU handle to share with clients.
    std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle;
    auto st = AllocateCudaMemory(device_num, total_size, &pointer, &ipc_handle);
    if (!st.ok()) {
      ARROW_LOG(ERROR) << "Failed to allocate CUDA memory: " << st.ToString();
      return PlasmaError::OutOfMemory;
    }
    result->ipc_handle = ipc_handle;
#else
    ARROW_LOG(ERROR) << "device_num != 0 but CUDA not enabled";
    return PlasmaError::OutOfMemory;
#endif
  } else {
    pointer =
        AllocateMemory(total_size, evict_if_full, &fd, &map_size, &offset, client, true);
    if (pointer == nullptr) {
      ARROW_LOG(ERROR) << "Not enough memory to create the object " << object_id.hex()
                       << ", data_size=" << data_size
                       << ", metadata_size=" << metadata_size
                       << ", will send a reply of PlasmaError::OutOfMemory";
      return PlasmaError::OutOfMemory;
    }
  }

  // Insert a fresh table entry and describe the allocation in it.
  auto inserted =
      store_info_.objects.emplace(object_id, std::make_unique<ObjectTableEntry>());
  auto* new_entry = inserted.first->second.get();
  new_entry->data_size = data_size;
  new_entry->metadata_size = metadata_size;
  new_entry->pointer = pointer;
  // TODO(pcm): Set the other fields.
  new_entry->fd = fd;
  new_entry->map_size = map_size;
  new_entry->offset = offset;
  new_entry->state = ObjectState::PLASMA_CREATED;
  new_entry->device_num = device_num;
  new_entry->create_time = std::time(nullptr);
  new_entry->construct_duration = -1;
#ifdef PLASMA_CUDA
  new_entry->ipc_handle = result->ipc_handle;
#endif

  // Tell the caller where the object lives.
  result->store_fd = fd;
  result->data_offset = offset;
  result->metadata_offset = offset + data_size;
  result->data_size = data_size;
  result->metadata_size = metadata_size;
  result->device_num = device_num;

  // Notify the eviction policy that this object was created. This must be done
  // immediately before the call to AddToClientObjectIds so that the
  // eviction policy does not have an opportunity to evict the object.
  eviction_policy_.ObjectCreated(object_id, client, true);
  // Record that this client is using this object.
  AddToClientObjectIds(object_id, new_entry, client);
  return PlasmaError::OK;
}

// Fill `object` with the location and sizes of the sealed table entry `entry`.
// Both pointers must be non-null and the entry must be sealed (checked with
// DCHECK).
void PlasmaObject_init(PlasmaObject* object, ObjectTableEntry* entry) {
  DCHECK(object != nullptr);
  DCHECK(entry != nullptr);
  DCHECK(entry->state == ObjectState::PLASMA_SEALED);
#ifdef PLASMA_CUDA
  // Only device-resident objects carry an IPC handle.
  if (entry->device_num != 0) {
    object->ipc_handle = entry->ipc_handle;
  }
#endif
  const auto base_offset = entry->offset;
  object->store_fd = entry->fd;
  object->device_num = entry->device_num;
  object->data_size = entry->data_size;
  object->metadata_size = entry->metadata_size;
  // Metadata is stored directly behind the data.
  object->data_offset = base_offset;
  object->metadata_offset = base_offset + entry->data_size;
}

// Unregister `get_request` from every per-object waiting list, cancel its
// timer if it has one, and delete it. The pointer must not be used afterwards.
void PlasmaStore::RemoveGetRequest(GetRequest* get_request) {
  // Remove the get request from each of the relevant object_get_requests hash
  // tables if it is present there. It should only be present there if the get
  // request timed out or if it was issued by a client that has disconnected.
  for (const ObjectID& object_id : get_request->object_ids) {
    auto object_request_iter = object_get_requests_.find(object_id);
    if (object_request_iter != object_get_requests_.end()) {
      auto& get_requests = object_request_iter->second;
      // Erase get_req from the vector.
      auto it = std::find(get_requests.begin(), get_requests.end(), get_request);
      if (it != get_requests.end()) {
        get_requests.erase(it);
        // If the vector is empty, remove the object ID from the map.
        if (get_requests.empty()) {
          object_get_requests_.erase(object_request_iter);
        }
      }
    }
  }
  // Remove the get request. A timer value of -1 means no timer was registered.
  if (get_request->timer != -1) {
    ARROW_CHECK(loop_->RemoveTimer(get_request->timer) == kEventLoopOk);
  }
  delete get_request;
}

// Remove every pending get request that was issued by `client`.
void PlasmaStore::RemoveGetRequestsForClient(Client* client) {
  // Collect first: RemoveGetRequest mutates object_get_requests_, so it must
  // not be called while that map is being iterated. A set is used because the
  // same request is listed under each object it waits for.
  std::unordered_set<GetRequest*> get_requests_to_remove;
  for (auto const& pair : object_get_requests_) {
    for (GetRequest* get_request : pair.second) {
      if (get_request->client == client) {
        get_requests_to_remove.insert(get_request);
      }
    }
  }

  // It shouldn't be possible for a given client to be in the middle of multiple get
  // requests.
  ARROW_CHECK(get_requests_to_remove.size() <= 1);
  for (GetRequest* get_request : get_requests_to_remove) {
    RemoveGetRequest(get_request);
  }
}

// Send the reply for `get_req` to its client, followed by any file descriptors
// the client has not received yet, then remove and delete the request.
void PlasmaStore::ReturnFromGet(GetRequest* get_req) {
  // Figure out how many file descriptors we need to send. Each distinct fd of
  // a present object (data_size != -1, fd != -1) is listed once.
  std::unordered_set<int> fds_to_send;
  std::vector<int> store_fds;
  std::vector<int64_t> mmap_sizes;
  for (const auto& object_id : get_req->object_ids) {
    PlasmaObject& object = get_req->objects[object_id];
    int fd = object.store_fd;
    if (object.data_size != -1 && fds_to_send.count(fd) == 0 && fd != -1) {
      fds_to_send.insert(fd);
      store_fds.push_back(fd);
      mmap_sizes.push_back(GetMmapSize(fd));
    }
  }

  // Send the get reply to the client. data() is used instead of &ids[0] so
  // that an empty ID list does not index into an empty vector.
  Status s = SendGetReply(get_req->client->fd, get_req->object_ids.data(),
                          get_req->objects, get_req->object_ids.size(), store_fds,
                          mmap_sizes);
  WarnIfSigpipe(s.ok() ? 0 : -1, get_req->client->fd);
  // If we successfully sent the get reply message to the client, then also send
  // the file descriptors.
  if (s.ok()) {
    // Send all of the file descriptors for the present objects.
    for (int store_fd : store_fds) {
      // Only send the file descriptor if it hasn't been sent (see analogous
      // logic in GetStoreFd in client.cc).
      if (get_req->client->used_fds.find(store_fd) == get_req->client->used_fds.end()) {
        WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd);
        get_req->client->used_fds.insert(store_fd);
      }
    }
  }

  // Remove the get request from each of the relevant object_get_requests hash
  // tables if it is present there. It should only be present there if the get
  // request timed out.
  RemoveGetRequest(get_req);
}

// Mark `object_id` as satisfied in every get request that is waiting for it
// and reply to the requests that became complete. The object must be present
// in the object table.
void PlasmaStore::UpdateObjectGetRequests(const ObjectID& object_id) {
  auto it = object_get_requests_.find(object_id);
  // If there are no get requests involving this object, then return.
  if (it == object_get_requests_.end()) {
    return;
  }

  auto& get_requests = it->second;

  // After finishing the loop below, get_requests and it will have been
  // invalidated by the removal of object_id from object_get_requests_.
  size_t index = 0;
  size_t num_requests = get_requests.size();
  for (size_t i = 0; i < num_requests; ++i) {
    auto get_req = get_requests[index];
    auto entry = GetObjectTableEntry(&store_info_, object_id);
    ARROW_CHECK(entry != nullptr);

    PlasmaObject_init(&get_req->objects[object_id], entry);
    get_req->num_satisfied += 1;
    // Record the fact that this client will be using this object and will
    // be responsible for releasing this object.
    AddToClientObjectIds(object_id, entry, get_req->client);

    // If this get request is done, reply to the client.
    if (get_req->num_satisfied == get_req->num_objects_to_wait_for) {
      ReturnFromGet(get_req);
    } else {
      // The call to ReturnFromGet will remove the current element in the
      // array, so we only increment the counter in the else branch.
      index += 1;
    }
  }

  // No get requests should be waiting for this object anymore. The object ID
  // may have been removed from the object_get_requests_ by ReturnFromGet, but
  // if the get request has not returned yet, then remove the object ID from the
  // map here.
  it = object_get_requests_.find(object_id);
  if (it != object_get_requests_.end()) {
    object_get_requests_.erase(it);
  }
}

// Handle a client's get call for `object_ids`.
//
// Sealed objects are recorded immediately. Evicted objects get memory
// allocated again and are fetched from the external store. Everything else is
// marked as missing and the request is queued until the object is sealed.
//
// \param client The requesting client.
// \param object_ids The objects requested; duplicates are allowed.
// \param timeout_ms 0 replies immediately with whatever is available, -1 waits
//        without a timer, any other value registers a timer after which the
//        reply is sent.
void PlasmaStore::ProcessGetRequest(Client* client,
                                    const std::vector<ObjectID>& object_ids,
                                    int64_t timeout_ms) {
  // Create a get request for this object.
  auto get_req = new GetRequest(client, object_ids);
  std::vector<ObjectID> evicted_ids;
  std::vector<ObjectTableEntry*> evicted_entries;
  for (const auto& object_id : object_ids) {
    // Check if this object is already present locally. If so, record that the
    // object is being used and mark it as accounted for.
    auto entry = GetObjectTableEntry(&store_info_, object_id);
    if (entry && entry->state == ObjectState::PLASMA_SEALED) {
      // Update the get request to take into account the present object.
      PlasmaObject_init(&get_req->objects[object_id], entry);
      get_req->num_satisfied += 1;
      // If necessary, record that this client is using this object. In the case
      // where entry == NULL, this will be called from SealObject.
      AddToClientObjectIds(object_id, entry, client);
    } else if (entry && entry->state == ObjectState::PLASMA_EVICTED) {
      // Make sure the object pointer is not already allocated
      ARROW_CHECK(!entry->pointer);

      entry->pointer =
          AllocateMemory(entry->data_size + entry->metadata_size, /*evict=*/true,
                         &entry->fd, &entry->map_size, &entry->offset, client, false);
      if (entry->pointer) {
        entry->state = ObjectState::PLASMA_CREATED;
        entry->create_time = std::time(nullptr);
        eviction_policy_.ObjectCreated(object_id, client, false);
        AddToClientObjectIds(object_id, store_info_.objects[object_id].get(), client);
        evicted_ids.push_back(object_id);
        evicted_entries.push_back(entry);
      } else {
        // We are out of memory and cannot allocate memory for this object.
        // Change the state of the object back to PLASMA_EVICTED so some
        // other request can try again.
        entry->state = ObjectState::PLASMA_EVICTED;
      }
    } else {
      // Add a placeholder plasma object to the get request to indicate that the
      // object is not present. This will be parsed by the client. We set the
      // data size to -1 to indicate that the object is not present.
      get_req->objects[object_id].data_size = -1;
      // Add the get request to the relevant data structures.
      object_get_requests_[object_id].push_back(get_req);
    }
  }

  if (!evicted_ids.empty()) {
    // The restored objects are given an all-zero digest.
    unsigned char digest[kDigestSize] = {};
    std::vector<std::shared_ptr<Buffer>> buffers;
    for (size_t i = 0; i < evicted_ids.size(); ++i) {
      ARROW_CHECK(evicted_entries[i]->pointer != nullptr);
      buffers.emplace_back(new arrow::MutableBuffer(evicted_entries[i]->pointer,
                                                    evicted_entries[i]->data_size));
    }
    if (external_store_->Get(evicted_ids, buffers).ok()) {
      for (size_t i = 0; i < evicted_ids.size(); ++i) {
        evicted_entries[i]->state = ObjectState::PLASMA_SEALED;
        std::memcpy(&evicted_entries[i]->digest[0], &digest[0], kDigestSize);
        evicted_entries[i]->construct_duration =
            std::time(nullptr) - evicted_entries[i]->create_time;
        PlasmaObject_init(&get_req->objects[evicted_ids[i]], evicted_entries[i]);
        get_req->num_satisfied += 1;
      }
    } else {
      // We tried to get the objects from the external store, but could not get them.
      // Set the state of these objects back to PLASMA_EVICTED so some other request
      // can try again.
      // NOTE(review): only the state is reset here; the memory allocated above
      // stays attached to the entry and the client reference added above is
      // kept. Confirm whether they should be released on this path.
      for (size_t i = 0; i < evicted_ids.size(); ++i) {
        evicted_entries[i]->state = ObjectState::PLASMA_EVICTED;
      }
    }
  }

  // If all of the objects are present already or if the timeout is 0, return to
  // the client.
  if (get_req->num_satisfied == get_req->num_objects_to_wait_for || timeout_ms == 0) {
    ReturnFromGet(get_req);
  } else if (timeout_ms != -1) {
    // Set a timer that will cause the get request to return to the client. Note
    // that a timeout of -1 is used to indicate that no timer should be set.
    get_req->timer = loop_->AddTimer(timeout_ms, [this, get_req](int64_t timer_id) {
      ReturnFromGet(get_req);
      return kEventLoopTimerDone;
    });
  }
}

// Stop tracking `object_id` as in use by `client` and drop one reference from
// `entry`.
//
// When the last reference goes away, the object is either handed back to the
// eviction policy or, if a deletion was requested while it was in use, evicted
// right away.
//
// \return 1 if the client was using the object and has been removed, 0 if the
//         client was not using it.
int PlasmaStore::RemoveFromClientObjectIds(const ObjectID& object_id,
                                           ObjectTableEntry* entry, Client* client) {
  auto it = client->object_ids.find(object_id);
  if (it != client->object_ids.end()) {
    client->object_ids.erase(it);
    // Decrease reference count.
    entry->ref_count--;

    // If no more clients are using this object, notify the eviction policy
    // that the object is no longer being used.
    if (entry->ref_count == 0) {
      if (deletion_cache_.count(object_id) == 0) {
        // Tell the eviction policy that this object is no longer being used.
        eviction_policy_.EndObjectAccess(object_id);
      } else {
        // Above code does not really delete an object. Instead, it just put an
        // object to LRU cache which will be cleaned when the memory is not enough.
        deletion_cache_.erase(object_id);
        EvictObjects({object_id});
      }
    }
    // Return 1 to indicate that the client was removed.
    return 1;
  } else {
    // Return 0 to indicate that the client was not removed.
    return 0;
  }
}

void PlasmaStore::EraseFromObjectTable(const ObjectID& object_id) {

auto& object = store_info_.objects[object_id];

auto buff_size = object->data_size + object->metadata_size;

if (object->device_num == 0) {

PlasmaAllocator::Free(object->pointer, buff_size);

} else {

#ifdef PLASMA_CUDA

ARROW_CHECK_OK(FreeCudaMemory(object->device_num, buff_size, object->pointer));

#endif

}

store_info_.objects.erase(object_id);

}

// Drop `client`'s reference to `object_id`. The object must exist in the
// object table and the client must currently be using it; both conditions are
// enforced with ARROW_CHECK.
void PlasmaStore::ReleaseObject(const ObjectID& object_id, Client* client) {
  auto table_entry = GetObjectTableEntry(&store_info_, object_id);
  ARROW_CHECK(table_entry != nullptr);
  // Remove the client from the object's array of clients.
  const int removed = RemoveFromClientObjectIds(object_id, table_entry, client);
  ARROW_CHECK(removed == 1);
}

// Report whether `object_id` is known to the store as a sealed or evicted
// object. Objects that are absent from the table, or in any other state, are
// reported as not found.
ObjectStatus PlasmaStore::ContainsObject(const ObjectID& object_id) {
  auto table_entry = GetObjectTableEntry(&store_info_, object_id);
  if (!table_entry) {
    return ObjectStatus::OBJECT_NOT_FOUND;
  }
  const bool sealed = table_entry->state == ObjectState::PLASMA_SEALED;
  const bool evicted = table_entry->state == ObjectState::PLASMA_EVICTED;
  if (sealed || evicted) {
    return ObjectStatus::OBJECT_FOUND;
  }
  return ObjectStatus::OBJECT_NOT_FOUND;
}

void PlasmaStore::SealObjects(const std::vector<ObjectID>& object_ids,

const std::vector<std::string>& digests) {

std::vector<ObjectInfoT> infos;

ARROW_LOG(DEBUG) << "sealing " << object_ids.size() << " objects";

for (size_t i = 0; i < object_ids.size(); ++i) {

ObjectInfoT object_info;

auto entry = GetObjectTableEntry(&store_info_, object_ids[i]);

ARROW_CHECK(entry != nullptr);

ARROW_CHECK(entry->state == ObjectState::PLASMA_CREATED);

// Set the state of object to SEALED.

entry->state = ObjectState::PLASMA_SEALED;

// Set the object digest.

std::memcpy(&entry->digest[0], digests[i].c_str(), kDigestSize);

// Set object construction duration.

entry->construct_duration = std::time(nullptr) - entry->create_time;

object_info.object_id = object_ids[i].binary();

object_info.data_size = entry->data_size;

object_info.metadata_size = entry->metadata_size;

object_info.digest = digests[i];

infos.push_back(object_info);

}

PushNotifications(infos);

for (size_t i = 0; i < object_ids.size(); ++i) {

UpdateObjectGetRequests(object_ids[i]);

}

// Abort the creation of an unsealed object.
//
// The object must be in the object table and must not be sealed (both enforced
// with ARROW_CHECK).
//
// \return 1 if `client` was using the object and it has been freed, 0 if the
//         client was not using it and nothing was done.
int PlasmaStore::AbortObject(const ObjectID& object_id, Client* client) {
  auto entry = GetObjectTableEntry(&store_info_, object_id);
  ARROW_CHECK(entry != nullptr) << "To abort an object it must be in the object table.";
  ARROW_CHECK(entry->state != ObjectState::PLASMA_SEALED)
      << "To abort an object it must not have been sealed.";
  auto it = client->object_ids.find(object_id);
  if (it == client->object_ids.end()) {
    // If the client requesting the abort is not the creator, do not
    // perform the abort.
    return 0;
  } else {
    // The client requesting the abort is the creator. Free the object.
    EraseFromObjectTable(object_id);
    client->object_ids.erase(it);
    return 1;
  }
}

// Delete a sealed, unused object from the store and announce the deletion to
// subscribers.
//
// \return ObjectNotFound if the ID is not in the table; ObjectNotSealed or
//         ObjectInUse if the object cannot be deleted yet (in both cases the
//         ID is added to the deletion cache so it is removed later); OK once
//         the object has been erased.
PlasmaError PlasmaStore::DeleteObject(ObjectID& object_id) {
  auto table_entry = GetObjectTableEntry(&store_info_, object_id);
  // TODO(rkn): This should probably not fail, but should instead throw an
  // error. Maybe we should also support deleting objects that have been
  // created but not sealed.
  if (!table_entry) {
    // Only objects that are in the object table can be deleted.
    return PlasmaError::ObjectNotFound;
  }

  const bool is_sealed = (table_entry->state == ObjectState::PLASMA_SEALED);
  if (!is_sealed) {
    // Unsealed objects cannot be deleted now; remember the request so the
    // object is deleted later.
    deletion_cache_.emplace(object_id);
    return PlasmaError::ObjectNotSealed;
  }

  const bool in_use = (table_entry->ref_count != 0);
  if (in_use) {
    // Some client still uses the object; remember the request so the object
    // is deleted later.
    deletion_cache_.emplace(object_id);
    return PlasmaError::ObjectInUse;
  }

  eviction_policy_.RemoveObject(object_id);
  EraseFromObjectTable(object_id);

  // Inform all subscribers that the object has been deleted.
  fb::ObjectInfoT deletion_notice;
  deletion_notice.object_id = object_id.binary();
  deletion_notice.is_deletion = true;
  PushNotification(&deletion_notice);

  return PlasmaError::OK;
}

// Evict the given objects from the store's memory.
//
// Every object must be in the table, sealed, and unused (enforced with
// ARROW_CHECK). With an external store configured, the object data is written
// there and the table entry is kept as a PLASMA_EVICTED placeholder. Without
// one, the entry is erased and a deletion notification is pushed.
void PlasmaStore::EvictObjects(const std::vector<ObjectID>& object_ids) {
  if (object_ids.size() == 0) {
    return;
  }

  std::vector<std::shared_ptr<arrow::Buffer>> evicted_object_data;
  std::vector<ObjectTableEntry*> evicted_entries;
  for (const auto& object_id : object_ids) {
    ARROW_LOG(DEBUG) << "evicting object " << object_id.hex();
    auto entry = GetObjectTableEntry(&store_info_, object_id);
    // TODO(rkn): This should probably not fail, but should instead throw an
    // error. Maybe we should also support deleting objects that have been
    // created but not sealed.
    ARROW_CHECK(entry != nullptr) << "To evict an object it must be in the object table.";
    ARROW_CHECK(entry->state == ObjectState::PLASMA_SEALED)
        << "To evict an object it must have been sealed.";
    ARROW_CHECK(entry->ref_count == 0)
        << "To evict an object, there must be no clients currently using it.";

    // If there is a backing external store, then mark object for eviction to
    // external store, free the object data pointer and keep a placeholder
    // entry in ObjectTable
    if (external_store_) {
      evicted_object_data.push_back(std::make_shared<arrow::Buffer>(
          entry->pointer, entry->data_size + entry->metadata_size));
      evicted_entries.push_back(entry);
    } else {
      // If there is no backing external store, just erase the object entry
      // and send a deletion notification.
      EraseFromObjectTable(object_id);
      // Inform all subscribers that the object has been deleted.
      fb::ObjectInfoT notification;
      notification.object_id = object_id.binary();
      notification.is_deletion = true;
      PushNotification(&notification);
    }
  }

  // object_ids is known to be non-empty here because of the early return.
  if (external_store_) {
    ARROW_CHECK_OK(external_store_->Put(object_ids, evicted_object_data));
    // The data has been handed to the external store; release the local copy
    // and keep only the placeholder entry.
    for (auto entry : evicted_entries) {
      PlasmaAllocator::Free(entry->pointer, entry->data_size + entry->metadata_size);
      entry->pointer = nullptr;
      entry->state = ObjectState::PLASMA_EVICTED;
    }
  }
}

void PlasmaStore::ConnectClient(int listener_sock) {

int client_fd = AcceptClient(listener_sock);

Client* client = new Client(client_fd);

connected_clients_[client_fd] = std::unique_ptr<Client>(client);

// Add a callback to handle events on this socket.

// TODO(pcm): Check return value.

loop_->AddFileEvent(client_fd, kEventLoopRead, [this, client](int events) {

Status s = ProcessMessage(client);

if (!s.ok()) {

ARROW_LOG(FATAL) << "Failed to process file event: " << s;

}

});

ARROW_LOG(DEBUG) << "New connection with fd " << client_fd;

}

// Tear down the connection on `client_fd`: close the socket, release or abort
// every object the client was using, drop its pending get requests and its
// notification subscription, and destroy the client state.
//
// `client_fd` must be positive and belong to a connected client (enforced with
// ARROW_CHECK).
void PlasmaStore::DisconnectClient(int client_fd) {
  ARROW_CHECK(client_fd > 0);
  auto it = connected_clients_.find(client_fd);
  ARROW_CHECK(it != connected_clients_.end());
  loop_->RemoveFileEvent(client_fd);
  // Close the socket.
  close(client_fd);
  ARROW_LOG(INFO) << "Disconnecting client on fd " << client_fd;
  // Release all the objects that the client was using.
  auto client = it->second.get();
  eviction_policy_.ClientDisconnected(client);
  std::unordered_map<ObjectID, ObjectTableEntry*> sealed_objects;
  for (const auto& object_id : client->object_ids) {
    // Named differently from the client iterator above, which is still needed
    // at the end of the function.
    auto object_it = store_info_.objects.find(object_id);
    if (object_it == store_info_.objects.end()) {
      continue;
    }

    if (object_it->second->state == ObjectState::PLASMA_SEALED) {
      // Add sealed objects to a temporary list of object IDs. Do not perform
      // the remove here, since it potentially modifies the object_ids table.
      sealed_objects[object_it->first] = object_it->second.get();
    } else {
      // Abort unsealed object.
      // Don't call AbortObject() because client->object_ids would be modified.
      EraseFromObjectTable(object_id);
    }
  }

  /// Remove all of the client's GetRequests.
  RemoveGetRequestsForClient(client);

  for (const auto& entry : sealed_objects) {
    RemoveFromClientObjectIds(entry.first, entry.second, client);
  }

  if (client->notification_fd > 0) {
    // This client has subscribed for notifications.
    auto notify_fd = client->notification_fd;
    loop_->RemoveFileEvent(notify_fd);
    // Close socket.
    close(notify_fd);
    // Remove notification queue for this fd from global map.
    pending_notifications_.erase(notify_fd);
    // Reset fd.
    client->notification_fd = -1;
  }

  // Erasing the map entry destroys the client state.
  connected_clients_.erase(it);
}

/// Send notifications about sealed objects to the subscribers. This is called
/// in SealObject. If the socket's send buffer is full, the notification will
/// be buffered, and this will be called again when the send buffer has room.
/// Since we call erase on pending_notifications_, all iterators get
/// invalidated, which is why we return a valid iterator to the next client to
/// be used in PushNotification.
///
/// If the subscriber's pipe is broken (EPIPE), its fd is removed from the
/// event loop and closed, and its entry is erased from pending_notifications_.
///
/// \param it Iterator that points to the client to send the notification to.
///        Must be dereferenceable (not end()).
/// \return Iterator pointing to the next client.
PlasmaStore::NotificationMap::iterator PlasmaStore::SendNotifications(
    PlasmaStore::NotificationMap::iterator it) {
  int client_fd = it->first;
  auto& notifications = it->second.object_notifications;

  int num_processed = 0;
  bool closed = false;
  // Loop over the array of pending notifications and send as many of them as
  // possible.
  for (size_t i = 0; i < notifications.size(); ++i) {
    auto& notification = notifications.at(i);
    // Decode the length, which is the first bytes of the message.
    int64_t size = *(reinterpret_cast<int64_t*>(notification.get()));

    // Attempt to send a notification about this object ID.
    ssize_t nbytes = send(client_fd, notification.get(), sizeof(int64_t) + size, 0);
    // Snapshot errno right away: the logging calls below may perform I/O and
    // overwrite it before we get to inspect it.
    const int send_errno = errno;
    if (nbytes >= 0) {
      ARROW_CHECK(nbytes == static_cast<ssize_t>(sizeof(int64_t)) + size);
    } else if (nbytes == -1 && (send_errno == EAGAIN || send_errno == EWOULDBLOCK ||
                                send_errno == EINTR)) {
      ARROW_LOG(DEBUG) << "The socket's send buffer is full, so we are caching this "
                          "notification and will send it later.";
      // Add a callback to the event loop to send queued notifications whenever
      // there is room in the socket's send buffer. Callbacks can be added
      // more than once here and will be overwritten. The callback is removed
      // at the end of the method.
      // TODO(pcm): Introduce status codes and check in case the file descriptor
      // is added twice.
      loop_->AddFileEvent(client_fd, kEventLoopWrite, [this, client_fd](int events) {
        auto pending = pending_notifications_.find(client_fd);
        if (pending == pending_notifications_.end()) {
          // The subscriber's queue no longer exists, so there is nothing to
          // send; stop watching the fd instead of dereferencing end().
          loop_->RemoveFileEvent(client_fd);
          return;
        }
        SendNotifications(pending);
      });
      break;
    } else {
      ARROW_LOG(WARNING) << "Failed to send notification to client on fd " << client_fd;
      if (send_errno == EPIPE) {
        closed = true;
        break;
      }
    }
    num_processed += 1;
  }
  // Remove the sent notifications from the array.
  notifications.erase(notifications.begin(), notifications.begin() + num_processed);

  // If we have sent all notifications, or the subscriber is gone, remove the
  // fd from the event loop so that no callback stays registered for it.
  if (notifications.empty() || closed) {
    loop_->RemoveFileEvent(client_fd);
  }

  // Stop sending notifications if the pipe was broken.
  if (closed) {
    close(client_fd);
    return pending_notifications_.erase(it);
  }
  return ++it;
}

/// Queue a notification about a single object for every subscriber and try to
/// send it right away.
///
/// \param object_info The object info to notify the subscribers about; it is
///        copied, not retained.
void PlasmaStore::PushNotification(fb::ObjectInfoT* object_info) {
  if (pending_notifications_.empty()) {
    return;
  }

  // The one-element payload is the same for every subscriber, so build it once
  // instead of copying *object_info on every iteration.
  std::vector<fb::ObjectInfoT> info;
  info.push_back(*object_info);

  auto it = pending_notifications_.begin();
  while (it != pending_notifications_.end()) {
    // Each subscriber needs its own buffer because the buffer is moved into
    // that subscriber's queue.
    auto notification = CreatePlasmaNotificationBuffer(info);
    it->second.object_notifications.emplace_back(std::move(notification));
    // SendNotifications may erase the current entry; continue from the
    // iterator it hands back.
    it = SendNotifications(it);
  }
}

/// Queue one notification covering a batch of objects for every subscriber
/// and try to send it right away.
///
/// \param object_info The object infos to pack into the notification.
void PlasmaStore::PushNotifications(std::vector<fb::ObjectInfoT>& object_info) {
  // No increment clause: SendNotifications returns the iterator to continue
  // from, since it may erase the current entry.
  for (auto subscriber = pending_notifications_.begin();
       subscriber != pending_notifications_.end();) {
    auto buffer = CreatePlasmaNotificationBuffer(object_info);
    auto& queue = subscriber->second.object_notifications;
    queue.emplace_back(std::move(buffer));
    subscriber = SendNotifications(subscriber);
  }
}

/// Queue a notification about a single object for one subscriber and try to
/// send it right away. Does nothing if the fd has no notification queue.
///
/// \param object_info The object info to notify the subscriber about; it is
///        copied, not retained.
/// \param client_fd The notification fd identifying the subscriber.
void PlasmaStore::PushNotification(fb::ObjectInfoT* object_info, int client_fd) {
  auto subscriber = pending_notifications_.find(client_fd);
  if (subscriber == pending_notifications_.end()) {
    return;
  }

  std::vector<fb::ObjectInfoT> payload;
  payload.push_back(*object_info);
  auto buffer = CreatePlasmaNotificationBuffer(payload);
  subscriber->second.object_notifications.emplace_back(std::move(buffer));
  SendNotifications(subscriber);
}

// Subscribe to notifications about sealed objects.
//
// Receives the notification file descriptor from the client over its socket,
// registers a notification queue for it, and pushes one notification for each
// object that is already sealed. A client that has already subscribed is left
// untouched.
void PlasmaStore::SubscribeToUpdates(Client* client) {
  ARROW_LOG(DEBUG) << "subscribing to updates on fd " << client->fd;
  if (client->notification_fd > 0) {
    // This client has already subscribed. Return.
    return;
  }

  // TODO(rkn): The store could block here if the client doesn't send a file
  // descriptor.
  int fd = recv_fd(client->fd);
  if (fd < 0) {
    // This may mean that the client died before sending the file descriptor.
    ARROW_LOG(WARNING) << "Failed to receive file descriptor from client on fd "
                       << client->fd << ".";
    return;
  }

  // Add this fd to global map, which is needed for this client to receive
  // notifications. operator[] default-constructs the (empty) queue.
  pending_notifications_[fd];
  client->notification_fd = fd;

  // Push notifications to the new subscriber about existing sealed objects.
  for (const auto& entry : store_info_.objects) {
    const auto& object_entry = entry.second;
    if (object_entry->state != ObjectState::PLASMA_SEALED) {
      continue;
    }

    ObjectInfoT info;
    info.object_id = entry.first.binary();
    info.data_size = object_entry->data_size;
    info.metadata_size = object_entry->metadata_size;
    info.digest =
        std::string(reinterpret_cast<char*>(&object_entry->digest[0]), kDigestSize);
    PushNotification(&info, fd);
  }
}

Status PlasmaStore::ProcessMessage(Client* client) {

fb::MessageType type;

Status s = ReadMessage(client->fd, &type, &input_buffer_);

ARROW_CHECK(s.ok() || s.IsIOError());

uint8_t* input = input_buffer_.data();

size_t input_size = input_buffer_.size();

ObjectID object_id;

PlasmaObject object = {};

// Process the different types of requests.

switch (type) {

case fb::MessageType::PlasmaCreateRequest: {

bool evict_if_full;

int64_t data_size;

int64_t metadata_size;

int device_num;

RETURN_NOT_OK(ReadCreateRequest(input, input_size, &object_id, &evict_if_full,

&data_size, &metadata_size, &device_num));

PlasmaError error_code = CreateObject(object_id, evict_if_full, data_size,

metadata_size, device_num, client, &object);

int64_t mmap_size = 0;

if (error_code == PlasmaError::OK && device_num == 0) {

mmap_size = GetMmapSize(object.store_fd);

}

HANDLE_SIGPIPE(

SendCreateReply(client->fd, object_id, &object, error_code, mmap_size),

client->fd);

// Only send the file descriptor if it hasn't been sent (see analogous

// logic in GetStoreFd in client.cc). Similar in ReturnFromGet.

if (error_code == PlasmaError::OK && device_num == 0 &&

client->used_fds.find(object.store_fd) == client->used_fds.end()) {

WarnIfSigpipe(send_fd(client->fd, object.store_fd), client->fd);

client->used_fds.insert(object.store_fd);

}

} break;

case fb::MessageType::PlasmaCreateAndSealRequest: {

bool evict_if_full;

std::string data;

std::string metadata;

std::string digest;

digest.reserve(kDigestSize);

RETURN_NOT_OK(ReadCreateAndSealRequest(input, input_size, &object_id,

&evict_if_full, &data, &metadata, &digest));

// CreateAndSeal currently only supports device_num = 0, which corresponds

// to the host.

int device_num = 0;

PlasmaError error_code = CreateObject(object_id, evict_if_full, data.size(),

metadata.size(), device_num, client, &object);

// If the object was successfully created, fill out the object data and seal it.

if (error_code == PlasmaError::OK) {

auto entry = GetObjectTableEntry(&store_info_, object_id);

ARROW_CHECK(entry != nullptr);

// Write the inlined data and metadata into the allocated object.

std::memcpy(entry->pointer, data.data(), data.size());

std::memcpy(entry->pointer + data.size(), metadata.data(), metadata.size());

SealObjects({object_id}, {digest});

// Remove the client from the object's array of clients because the

// object is not being used by any client. The client was added to the

// object's array of clients in CreateObject. This is analogous to the

// Release call that happens in the client's Seal method.

ARROW_CHECK(RemoveFromClientObjectIds(object_id, entry, client) == 1);

}

// Reply to the client.

HANDLE_SIGPIPE(SendCreateAndSealReply(client->fd, error_code), client->fd);

} break;

case fb::MessageType::PlasmaCreateAndSealBatchRequest: {

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

store.cc

Latest commit

History

store.cc

File metadata and controls