2222#include " arrow/dataset/file_base.h"
2323#include " arrow/dataset/file_ipc.h"
2424#include " arrow/dataset/file_parquet.h"
25+ #include " arrow/util/checked_cast.h"
2526#include " arrow/util/compression.h"
2627
2728namespace skyhook {
2829
29- // / A ScanTask to scan a file fragment in Skyhook format.
30- class SkyhookScanTask : public arrow ::dataset::ScanTask {
31- public:
32- SkyhookScanTask (std::shared_ptr<arrow::dataset::ScanOptions> options,
33- std::shared_ptr<arrow::dataset::Fragment> fragment,
34- arrow::dataset::FileSource source,
35- std::shared_ptr<skyhook::SkyhookDirectObjectAccess> doa,
36- skyhook::SkyhookFileType::type file_format,
37- arrow::compute::Expression partition_expression)
38- : ScanTask(std::move(options), std::move(fragment)),
39- source_ (std::move(source)),
40- doa_(std::move(doa)),
41- file_format_(file_format),
42- partition_expression_(std::move(partition_expression)) {}
43-
44- arrow::Result<arrow::RecordBatchIterator> Execute () override {
45- // / Retrieve the size of the file using POSIX `stat`.
46- struct stat st {};
47- RETURN_NOT_OK (doa_->Stat (source_.path (), st));
48-
49- // / Create a ScanRequest instance.
50- skyhook::ScanRequest req;
51- req.filter_expression = options_->filter ;
52- req.partition_expression = partition_expression_;
53- req.projection_schema = options_->projected_schema ;
54- req.dataset_schema = options_->dataset_schema ;
55- req.file_size = st.st_size ;
56- req.file_format = file_format_;
57-
58- // / Serialize the ScanRequest into a ceph bufferlist.
59- ceph::bufferlist request;
60- RETURN_NOT_OK (skyhook::SerializeScanRequest (req, &request));
61-
62- // / Execute the Ceph object class method `scan_op`.
63- ceph::bufferlist result;
64- RETURN_NOT_OK (doa_->Exec (st.st_ino , " scan_op" , request, result));
65-
66- // / Read RecordBatches from the result bufferlist. Since, this step might use
67- // / threads for decompressing compressed batches, to avoid running into
68- // / [ARROW-12597], we switch off threaded decompression to avoid nested threading
69- // / scenarios when scan tasks are executed in parallel by the CpuThreadPool.
70- arrow::RecordBatchVector batches;
71- RETURN_NOT_OK (skyhook::DeserializeTable (result, !options_->use_threads , &batches));
72- return arrow::MakeVectorIterator (std::move (batches));
73- }
74-
75- protected:
76- arrow::dataset::FileSource source_;
77- std::shared_ptr<skyhook::SkyhookDirectObjectAccess> doa_;
78- skyhook::SkyhookFileType::type file_format_;
79- arrow::compute::Expression partition_expression_;
80- };
81-
8230class SkyhookFileFormat ::Impl {
8331 public:
8432 Impl (std::shared_ptr<RadosConnCtx> ctx, std::string file_format)
@@ -95,12 +43,10 @@ class SkyhookFileFormat::Impl {
9543 return arrow::Status::OK ();
9644 }
9745
98- arrow::Result<arrow::dataset::ScanTaskIterator> ScanFile (
46+ arrow::Result<arrow::RecordBatchGenerator> ScanBatchesAsync (
47+ const std::shared_ptr<const SkyhookFileFormat>& format,
9948 const std::shared_ptr<arrow::dataset::ScanOptions>& options,
10049 const std::shared_ptr<arrow::dataset::FileFragment>& file) const {
101- // / Make sure client-side filtering and projection is turned off.
102- file->apply_compute = false ;
103-
10450 // / Convert string file format name to Enum.
10551 skyhook::SkyhookFileType::type file_format;
10652 if (file_format_ == " parquet" ) {
@@ -111,9 +57,46 @@ class SkyhookFileFormat::Impl {
11157 return arrow::Status::Invalid (" Unsupported file format " , file_format_);
11258 }
11359
114- arrow::dataset::ScanTaskVector v{std::make_shared<SkyhookScanTask>(
115- options, file, file->source (), doa_, file_format, file->partition_expression ())};
116- return arrow::MakeVectorIterator (v);
60+ auto fut = arrow::DeferNotOk (options->io_context .executor ()->Submit (
61+ [file, file_format, format,
62+ options]() -> arrow::Result<arrow::RecordBatchGenerator> {
63+ auto self = format->impl_ .get ();
64+
65+ // / Retrieve the size of the file using POSIX `stat`.
66+ struct stat st {};
67+ RETURN_NOT_OK (self->doa_ ->Stat (file->source ().path (), st));
68+
69+ // / Create a ScanRequest instance.
70+ skyhook::ScanRequest req;
71+ req.filter_expression = options->filter ;
72+ req.partition_expression = file->partition_expression ();
73+ req.projection_schema = options->projected_schema ;
74+ req.dataset_schema = options->dataset_schema ;
75+ req.file_size = st.st_size ;
76+ req.file_format = file_format;
77+
78+ // / Serialize the ScanRequest into a ceph bufferlist.
79+ ceph::bufferlist request;
80+ RETURN_NOT_OK (skyhook::SerializeScanRequest (req, &request));
81+
82+ // / Execute the Ceph object class method `scan_op`.
83+ ceph::bufferlist result;
84+ RETURN_NOT_OK (self->doa_ ->Exec (st.st_ino , " scan_op" , request, result));
85+
86+ // / Read RecordBatches from the result bufferlist. Since, this step might use
87+ // / threads for decompressing compressed batches, to avoid running into
88+ // / [ARROW-12597], we switch off threaded decompression to avoid nested
89+ // / threading scenarios when scan tasks are executed in parallel by the
90+ // / CpuThreadPool.
91+ arrow::RecordBatchVector batches;
92+ RETURN_NOT_OK (
93+ skyhook::DeserializeTable (result, !options->use_threads , &batches));
94+ auto gen = arrow::MakeVectorGenerator (std::move (batches));
95+ // Keep Ceph client alive
96+ arrow::RecordBatchGenerator gen_with_client = [format, gen]() { return gen (); };
97+ return gen_with_client;
98+ }));
99+ return arrow::MakeFromFuture (std::move (fut));
117100 }
118101
119102 arrow::Result<std::shared_ptr<arrow::Schema>> Inspect (
@@ -160,10 +143,12 @@ arrow::Result<std::shared_ptr<arrow::Schema>> SkyhookFileFormat::Inspect(
160143 return impl_->Inspect (source);
161144}
162145
163- arrow::Result<arrow::dataset::ScanTaskIterator > SkyhookFileFormat::ScanFile (
146+ arrow::Result<arrow::RecordBatchGenerator > SkyhookFileFormat::ScanBatchesAsync (
164147 const std::shared_ptr<arrow::dataset::ScanOptions>& options,
165148 const std::shared_ptr<arrow::dataset::FileFragment>& file) const {
166- return impl_->ScanFile (options, file);
149+ return impl_->ScanBatchesAsync (
150+ arrow::internal::checked_pointer_cast<const SkyhookFileFormat>(shared_from_this ()),
151+ options, file);
167152}
168153
169154std::shared_ptr<arrow::dataset::FileWriteOptions>
0 commit comments