Skip to content

Commit 2ec4e99

Browse files
committed
ARROW-15300: [C++] Update Skyhook for async dataset interfaces
After the synchronous dataset interfaces were removed in ARROW-13554, we need to update Skyhook to use the async interfaces. For now, we just use the same synchronous path as before, but on an I/O thread. Closes apache#12123 from lidavidm/arrow-15300 Authored-by: David Li <li.davidm96@gmail.com> Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent d1fb8e7 commit 2ec4e99

5 files changed

Lines changed: 57 additions & 74 deletions

File tree

cpp/src/arrow/dataset/dataset.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,6 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
7878

7979
virtual ~Fragment() = default;
8080

81-
/// \brief Decide whether to apply filters and projections to this Fragment.
82-
bool apply_compute = true;
83-
8481
protected:
8582
Fragment() = default;
8683
explicit Fragment(compute::Expression partition_expression,

cpp/src/skyhook/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ add_dependencies(cls_skyhook ${ARROW_SKYHOOK_CLS_LIBRARIES})
6363

6464
# define the test builds
6565
if(ARROW_TEST_LINKAGE STREQUAL "static")
66-
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_static ${ARROW_TEST_STATIC_LINK_LIBS})
66+
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_static arrow_dataset_static
67+
${ARROW_TEST_STATIC_LINK_LIBS})
6768
else()
68-
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_dataset_shared ${ARROW_TEST_SHARED_LINK_LIBS})
69+
set(ARROW_SKYHOOK_TEST_LINK_LIBS arrow_skyhook_client_shared arrow_dataset_shared
70+
${ARROW_TEST_SHARED_LINK_LIBS})
6971
endif()
70-
list(APPEND ARROW_SKYHOOK_TEST_LINK_LIBS ${ARROW_SKYHOOK_CLIENT_LIBRARIES})
7172

7273
# build the cls and protocol tests
7374
add_arrow_test(cls_test

cpp/src/skyhook/client/file_skyhook.cc

Lines changed: 47 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -22,63 +22,11 @@
2222
#include "arrow/dataset/file_base.h"
2323
#include "arrow/dataset/file_ipc.h"
2424
#include "arrow/dataset/file_parquet.h"
25+
#include "arrow/util/checked_cast.h"
2526
#include "arrow/util/compression.h"
2627

2728
namespace skyhook {
2829

29-
/// A ScanTask to scan a file fragment in Skyhook format.
30-
class SkyhookScanTask : public arrow::dataset::ScanTask {
31-
public:
32-
SkyhookScanTask(std::shared_ptr<arrow::dataset::ScanOptions> options,
33-
std::shared_ptr<arrow::dataset::Fragment> fragment,
34-
arrow::dataset::FileSource source,
35-
std::shared_ptr<skyhook::SkyhookDirectObjectAccess> doa,
36-
skyhook::SkyhookFileType::type file_format,
37-
arrow::compute::Expression partition_expression)
38-
: ScanTask(std::move(options), std::move(fragment)),
39-
source_(std::move(source)),
40-
doa_(std::move(doa)),
41-
file_format_(file_format),
42-
partition_expression_(std::move(partition_expression)) {}
43-
44-
arrow::Result<arrow::RecordBatchIterator> Execute() override {
45-
/// Retrieve the size of the file using POSIX `stat`.
46-
struct stat st {};
47-
RETURN_NOT_OK(doa_->Stat(source_.path(), st));
48-
49-
/// Create a ScanRequest instance.
50-
skyhook::ScanRequest req;
51-
req.filter_expression = options_->filter;
52-
req.partition_expression = partition_expression_;
53-
req.projection_schema = options_->projected_schema;
54-
req.dataset_schema = options_->dataset_schema;
55-
req.file_size = st.st_size;
56-
req.file_format = file_format_;
57-
58-
/// Serialize the ScanRequest into a ceph bufferlist.
59-
ceph::bufferlist request;
60-
RETURN_NOT_OK(skyhook::SerializeScanRequest(req, &request));
61-
62-
/// Execute the Ceph object class method `scan_op`.
63-
ceph::bufferlist result;
64-
RETURN_NOT_OK(doa_->Exec(st.st_ino, "scan_op", request, result));
65-
66-
/// Read RecordBatches from the result bufferlist. Since, this step might use
67-
/// threads for decompressing compressed batches, to avoid running into
68-
/// [ARROW-12597], we switch off threaded decompression to avoid nested threading
69-
/// scenarios when scan tasks are executed in parallel by the CpuThreadPool.
70-
arrow::RecordBatchVector batches;
71-
RETURN_NOT_OK(skyhook::DeserializeTable(result, !options_->use_threads, &batches));
72-
return arrow::MakeVectorIterator(std::move(batches));
73-
}
74-
75-
protected:
76-
arrow::dataset::FileSource source_;
77-
std::shared_ptr<skyhook::SkyhookDirectObjectAccess> doa_;
78-
skyhook::SkyhookFileType::type file_format_;
79-
arrow::compute::Expression partition_expression_;
80-
};
81-
8230
class SkyhookFileFormat::Impl {
8331
public:
8432
Impl(std::shared_ptr<RadosConnCtx> ctx, std::string file_format)
@@ -95,12 +43,10 @@ class SkyhookFileFormat::Impl {
9543
return arrow::Status::OK();
9644
}
9745

98-
arrow::Result<arrow::dataset::ScanTaskIterator> ScanFile(
46+
arrow::Result<arrow::RecordBatchGenerator> ScanBatchesAsync(
47+
const std::shared_ptr<const SkyhookFileFormat>& format,
9948
const std::shared_ptr<arrow::dataset::ScanOptions>& options,
10049
const std::shared_ptr<arrow::dataset::FileFragment>& file) const {
101-
/// Make sure client-side filtering and projection is turned off.
102-
file->apply_compute = false;
103-
10450
/// Convert string file format name to Enum.
10551
skyhook::SkyhookFileType::type file_format;
10652
if (file_format_ == "parquet") {
@@ -111,9 +57,46 @@ class SkyhookFileFormat::Impl {
11157
return arrow::Status::Invalid("Unsupported file format ", file_format_);
11258
}
11359

114-
arrow::dataset::ScanTaskVector v{std::make_shared<SkyhookScanTask>(
115-
options, file, file->source(), doa_, file_format, file->partition_expression())};
116-
return arrow::MakeVectorIterator(v);
60+
auto fut = arrow::DeferNotOk(options->io_context.executor()->Submit(
61+
[file, file_format, format,
62+
options]() -> arrow::Result<arrow::RecordBatchGenerator> {
63+
auto self = format->impl_.get();
64+
65+
/// Retrieve the size of the file using POSIX `stat`.
66+
struct stat st {};
67+
RETURN_NOT_OK(self->doa_->Stat(file->source().path(), st));
68+
69+
/// Create a ScanRequest instance.
70+
skyhook::ScanRequest req;
71+
req.filter_expression = options->filter;
72+
req.partition_expression = file->partition_expression();
73+
req.projection_schema = options->projected_schema;
74+
req.dataset_schema = options->dataset_schema;
75+
req.file_size = st.st_size;
76+
req.file_format = file_format;
77+
78+
/// Serialize the ScanRequest into a ceph bufferlist.
79+
ceph::bufferlist request;
80+
RETURN_NOT_OK(skyhook::SerializeScanRequest(req, &request));
81+
82+
/// Execute the Ceph object class method `scan_op`.
83+
ceph::bufferlist result;
84+
RETURN_NOT_OK(self->doa_->Exec(st.st_ino, "scan_op", request, result));
85+
86+
/// Read RecordBatches from the result bufferlist. Since, this step might use
87+
/// threads for decompressing compressed batches, to avoid running into
88+
/// [ARROW-12597], we switch off threaded decompression to avoid nested
89+
/// threading scenarios when scan tasks are executed in parallel by the
90+
/// CpuThreadPool.
91+
arrow::RecordBatchVector batches;
92+
RETURN_NOT_OK(
93+
skyhook::DeserializeTable(result, !options->use_threads, &batches));
94+
auto gen = arrow::MakeVectorGenerator(std::move(batches));
95+
// Keep Ceph client alive
96+
arrow::RecordBatchGenerator gen_with_client = [format, gen]() { return gen(); };
97+
return gen_with_client;
98+
}));
99+
return arrow::MakeFromFuture(std::move(fut));
117100
}
118101

119102
arrow::Result<std::shared_ptr<arrow::Schema>> Inspect(
@@ -160,10 +143,12 @@ arrow::Result<std::shared_ptr<arrow::Schema>> SkyhookFileFormat::Inspect(
160143
return impl_->Inspect(source);
161144
}
162145

163-
arrow::Result<arrow::dataset::ScanTaskIterator> SkyhookFileFormat::ScanFile(
146+
arrow::Result<arrow::RecordBatchGenerator> SkyhookFileFormat::ScanBatchesAsync(
164147
const std::shared_ptr<arrow::dataset::ScanOptions>& options,
165148
const std::shared_ptr<arrow::dataset::FileFragment>& file) const {
166-
return impl_->ScanFile(options, file);
149+
return impl_->ScanBatchesAsync(
150+
arrow::internal::checked_pointer_cast<const SkyhookFileFormat>(shared_from_this()),
151+
options, file);
167152
}
168153

169154
std::shared_ptr<arrow::dataset::FileWriteOptions>

cpp/src/skyhook/client/file_skyhook.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@
1616
// under the License.
1717
#pragma once
1818

19-
#include "arrow/api.h"
19+
#include <memory>
20+
#include <string>
21+
2022
#include "arrow/dataset/file_parquet.h"
2123
#include "arrow/dataset/scanner.h"
2224
#include "arrow/dataset/type_fwd.h"
2325
#include "arrow/dataset/visibility.h"
26+
#include "arrow/type_fwd.h"
2427

2528
namespace skyhook {
2629

@@ -77,11 +80,7 @@ class SkyhookFileFormat : public arrow::dataset::FileFormat {
7780
arrow::Result<std::shared_ptr<arrow::Schema>> Inspect(
7881
const arrow::dataset::FileSource& source) const override;
7982

80-
/// \brief Scan a file fragment.
81-
/// \param[in] options The ScanOptions to use.
82-
/// \param[in] file The file fragment to scan.
83-
/// \return An iterator of ScanTasks.
84-
arrow::Result<arrow::dataset::ScanTaskIterator> ScanFile(
83+
arrow::Result<arrow::RecordBatchGenerator> ScanBatchesAsync(
8584
const std::shared_ptr<arrow::dataset::ScanOptions>& options,
8685
const std::shared_ptr<arrow::dataset::FileFragment>& file) const override;
8786

cpp/src/skyhook/cls/cls_skyhook_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "arrow/dataset/file_base.h"
2222
#include "arrow/filesystem/api.h"
2323
#include "arrow/io/api.h"
24+
#include "arrow/table.h"
2425
#include "arrow/testing/gtest_util.h"
2526
#include "arrow/util/checked_cast.h"
2627
#include "arrow/util/iterator.h"

0 commit comments

Comments
 (0)