Skip to content

Commit 2b2eeeb

Browse files
westonpace authored and lidavidm committed
ARROW-12289: [C++] Create basic AsyncScanner implementation
Adds a naive implementation of `AsyncScanner`, which differs from `SyncScanner` in a few ways:

* It does not use `ScanTask` and instead relies on `Fragment::ScanBatchesAsync`, which returns a `RecordBatchGenerator`.
* It does an unordered scan by default (i.e. batches from file N may arrive before all batches from file N-1 have arrived) and can produce an ordered scan if asked to.
* It uses the unordered scan for `ToTable`.

It is "naive" because this PR does not add a complete implementation for `FileFragment::ScanBatchesAsync`. This method relies on `FileFormat::ScanBatchesAsync` (in the same way that `FileFragment::Scan` relies on `FileFormat::ScanFile`). This method (`FileFormat::ScanBatchesAsync`) _should_ be overridden in each of the formats (to rely on an async reader), but it is not (yet). As a result, the performance of `AsyncScanner` is poor, since it does not do any "per-file" parallelism, nor does it do any "per-batch" parallelism.

Follow-up tasks are ARROW-12355 (CSV), ARROW-11772 (IPC), and ARROW-11843 (Parquet).

In addition, this PR is built on top of ARROW-12287, so that will need to be merged first. It will also need to rebase changes from ARROW-12161 and ARROW-11797.

Closes apache#10008 from westonpace/feature/arrow-12289

Authored-by: Weston Pace <weston.pace@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent 1dc8f94 commit 2b2eeeb

16 files changed

Lines changed: 650 additions & 138 deletions

cpp/src/arrow/dataset/dataset.cc

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,55 @@ Result<ScanTaskIterator> InMemoryFragment::Scan(std::shared_ptr<ScanOptions> opt
9595
return MakeMapIterator(fn, std::move(batches_it));
9696
}
9797

98+
/// \brief Asynchronous counterpart of Scan for a fragment backed by in-memory batches.
///
/// Returns a generator that walks record_batches_ in order and yields each batch
/// in slices requested with a length of options->batch_size rows. Every future
/// handed out is created already finished; the generator ends with
/// AsyncGeneratorEnd once all batches have been visited.
///
/// \param options scan options; only batch_size is read in this function
/// \return a RecordBatchGenerator that shares ownership of this fragment
///
/// NOTE(review): the cursor only advances by batch_size, so a non-positive
/// batch_size would never exhaust a non-empty batch -- confirm that callers
/// guarantee batch_size > 0.
Result<RecordBatchGenerator> InMemoryFragment::ScanBatchesAsync(
99+
const std::shared_ptr<ScanOptions>& options) {
100+
// Cursor over fragment->record_batches_: which batch is current (batch_index)
// and how many of its rows have already been handed out (offset).
struct State {
101+
State(std::shared_ptr<InMemoryFragment> fragment, int64_t batch_size)
102+
: fragment(std::move(fragment)),
103+
batch_index(0),
104+
offset(0),
105+
batch_size(batch_size) {}
106+
107+
// Returns the next slice of the current batch, or nullptr when the current
// batch has no rows left. In the nullptr case the cursor has been moved to the
// start of the following batch, so the caller is expected to call again.
// Must not be called once Finished() is true (record_batches_ is indexed
// without a bounds check).
std::shared_ptr<RecordBatch> Next() {
108+
const auto& next_parent = fragment->record_batches_[batch_index];
109+
if (offset < next_parent->num_rows()) {
110+
// Presumably Slice clamps the length to the rows remaining after offset, so
// the final slice of a batch may be shorter than batch_size -- TODO confirm.
auto next = next_parent->Slice(offset, batch_size);
111+
offset += batch_size;
112+
return next;
113+
}
114+
// Current batch exhausted (or empty): advance to the next one.
batch_index++;
115+
offset = 0;
116+
return nullptr;
117+
}
118+
119+
// True once every batch in the fragment has been stepped past.
bool Finished() { return batch_index >= fragment->record_batches_.size(); }
120+
121+
// Shared ownership of the fragment for as long as this state exists.
std::shared_ptr<InMemoryFragment> fragment;
122+
std::size_t batch_index;
123+
int64_t offset;
124+
int64_t batch_size;
125+
};
126+
127+
// Callable returned as the RecordBatchGenerator. The cursor lives behind a
// shared_ptr, so copies of the Generator advance the same State.
struct Generator {
128+
Generator(std::shared_ptr<InMemoryFragment> fragment, int64_t batch_size)
129+
: state(std::make_shared<State>(std::move(fragment), batch_size)) {}
130+
131+
// Produces the next slice as an already-finished future, skipping over the
// nullptr results Next() returns at batch boundaries; signals the end of the
// stream once the state reports Finished().
Future<std::shared_ptr<RecordBatch>> operator()() {
132+
while (!state->Finished()) {
133+
auto next = state->Next();
134+
if (next) {
135+
return Future<std::shared_ptr<RecordBatch>>::MakeFinished(std::move(next));
136+
}
137+
}
138+
return AsyncGeneratorEnd<std::shared_ptr<RecordBatch>>();
139+
}
140+
141+
std::shared_ptr<State> state;
142+
};
143+
// shared_from_this() is downcast to InMemoryFragment so the generator can reach
// record_batches_ and holds a shared_ptr to this fragment.
return Generator(internal::checked_pointer_cast<InMemoryFragment>(shared_from_this()),
144+
options->batch_size);
145+
}
146+
98147
/// \brief Construct a Dataset from a schema and a partition expression.
///
/// Both arguments are taken by value and moved into the corresponding members.
Dataset::Dataset(std::shared_ptr<Schema> schema, Expression partition_expression)
99148
: schema_(std::move(schema)),
100149
partition_expression_(std::move(partition_expression)) {}
@@ -189,11 +238,11 @@ Result<FragmentIterator> InMemoryDataset::GetFragmentsImpl(Expression) {
189238
" which did not match InMemorySource's: ", *schema);
190239
}
191240

192-
RecordBatchVector batches{batch};
193-
return std::make_shared<InMemoryFragment>(std::move(batches));
241+
return std::make_shared<InMemoryFragment>(RecordBatchVector{std::move(batch)});
194242
};
195243

196-
return MakeMaybeMapIterator(std::move(create_fragment), get_batches_->Get());
244+
auto batches_it = get_batches_->Get();
245+
return MakeMaybeMapIterator(std::move(create_fragment), std::move(batches_it));
197246
}
198247

199248
Result<std::shared_ptr<UnionDataset>> UnionDataset::Make(std::shared_ptr<Schema> schema,

cpp/src/arrow/dataset/dataset.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
namespace arrow {
3535
namespace dataset {
3636

37+
using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
38+
3739
/// \brief A granular piece of a Dataset, such as an individual file.
3840
///
3941
/// A Fragment can be read/scanned separately from other fragments. It yields a
@@ -64,6 +66,10 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
6466
/// To receive a record batch stream which is fully filtered and projected, use Scanner.
6567
virtual Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) = 0;
6668

69+
/// An asynchronous version of Scan
70+
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
71+
const std::shared_ptr<ScanOptions>& options) = 0;
72+
6773
virtual std::string type_name() const = 0;
6874
virtual std::string ToString() const { return type_name(); }
6975

@@ -113,6 +119,8 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
113119
explicit InMemoryFragment(RecordBatchVector record_batches, Expression = literal(true));
114120

115121
Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) override;
122+
Result<RecordBatchGenerator> ScanBatchesAsync(
123+
const std::shared_ptr<ScanOptions>& options) override;
116124

117125
std::string type_name() const override { return "in-memory"; }
118126

cpp/src/arrow/dataset/file_base.cc

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,70 @@ Result<std::shared_ptr<FileFragment>> FileFormat::MakeFragment(
102102
std::move(partition_expression), std::move(physical_schema)));
103103
}
104104

105+
// TODO(ARROW-12355[CSV], ARROW-11772[IPC], ARROW-11843[Parquet]) The following
106+
// implementation of ScanBatchesAsync is both ugly and terribly inefficient. Each of the
107+
// formats should provide their own efficient implementation.
108+
/// \brief Default ScanBatchesAsync, implemented on top of the synchronous ScanFile.
///
/// Calls ScanFile to obtain a ScanTaskIterator, then returns a generator that
/// executes the scan tasks one at a time and drains each task's record batch
/// iterator before moving on to the next task. All of that work happens inline
/// inside the generator call; batches are returned as already-finished futures.
///
/// \param scan_options options forwarded to ScanFile
/// \param file the fragment to scan
/// \return the generator, or the error raised by ScanFile
Result<RecordBatchGenerator> FileFormat::ScanBatchesAsync(
109+
const std::shared_ptr<ScanOptions>& scan_options,
110+
const std::shared_ptr<FileFragment>& file) {
111+
ARROW_ASSIGN_OR_RAISE(auto scan_task_it, ScanFile(scan_options, file));
112+
// Progress of the scan: the remaining tasks, the batch iterator of the task
// currently being drained, and whether the task iterator has run out.
struct State {
113+
State(std::shared_ptr<ScanOptions> scan_options, ScanTaskIterator scan_task_it)
114+
: scan_options(std::move(scan_options)),
115+
scan_task_it(std::move(scan_task_it)),
116+
current_rb_it(),
117+
finished(false) {}
118+
119+
// Not read by the generator below; presumably held only to keep the options
// alive for the duration of the scan -- TODO confirm.
std::shared_ptr<ScanOptions> scan_options;
120+
ScanTaskIterator scan_task_it;
121+
// Default-constructed (tests false) whenever no task is currently being drained.
RecordBatchIterator current_rb_it;
122+
bool finished;
123+
};
124+
// Callable returned as the RecordBatchGenerator; its State is held through a
// shared_ptr, so copies of the Generator advance the same scan.
struct Generator {
125+
// Returns the next batch as a finished future, or the end-of-stream marker
// once every scan task has been consumed. Errors from the iterators or from
// executing a task are returned from this function by the RETURN_NOT_OK /
// ARROW_ASSIGN_OR_RAISE macros (presumably converted into a failed future --
// TODO confirm).
Future<std::shared_ptr<RecordBatch>> operator()() {
126+
while (!state->finished) {
127+
// No task in progress: start the next one, or stop if none are left.
if (!state->current_rb_it) {
128+
RETURN_NOT_OK(PumpScanTask());
129+
if (state->finished) {
130+
return AsyncGeneratorEnd<std::shared_ptr<RecordBatch>>();
131+
}
132+
}
133+
ARROW_ASSIGN_OR_RAISE(auto next_batch, state->current_rb_it.Next());
134+
if (IsIterationEnd(next_batch)) {
135+
// Current task is drained; reset so the next loop iteration pumps a new task.
state->current_rb_it = RecordBatchIterator();
136+
} else {
137+
return Future<std::shared_ptr<RecordBatch>>::MakeFinished(next_batch);
138+
}
139+
}
140+
return AsyncGeneratorEnd<std::shared_ptr<RecordBatch>>();
141+
}
142+
// Pulls the next scan task. If the task iterator is exhausted, marks the
// state finished; otherwise executes the task and stores its batch iterator
// in current_rb_it.
Status PumpScanTask() {
143+
ARROW_ASSIGN_OR_RAISE(auto next_task, state->scan_task_it.Next());
144+
if (IsIterationEnd(next_task)) {
145+
state->finished = true;
146+
} else {
147+
ARROW_ASSIGN_OR_RAISE(state->current_rb_it, next_task->Execute());
148+
}
149+
return Status::OK();
150+
}
151+
std::shared_ptr<State> state;
152+
};
153+
return Generator{std::make_shared<State>(scan_options, std::move(scan_task_it))};
154+
}
155+
105156
/// \brief Obtain this fragment's physical schema by asking the file format to
/// inspect the fragment's source.
Result<std::shared_ptr<Schema>> FileFragment::ReadPhysicalSchemaImpl() {
106157
return format_->Inspect(source_);
107158
}
108159

109160
Result<ScanTaskIterator> FileFragment::Scan(std::shared_ptr<ScanOptions> options) {
110161
auto self = std::dynamic_pointer_cast<FileFragment>(shared_from_this());
111-
return format_->ScanFile(std::move(options), self);
162+
return format_->ScanFile(options, self);
163+
}
164+
165+
/// \brief Asynchronous scan of this file fragment.
///
/// Delegates to the fragment's format, passing the options and a shared_ptr to
/// this fragment.
Result<RecordBatchGenerator> FileFragment::ScanBatchesAsync(
166+
const std::shared_ptr<ScanOptions>& options) {
167+
// Re-type the shared_ptr from shared_from_this() as FileFragment, which is the
// parameter type of FileFormat::ScanBatchesAsync.
auto self = std::dynamic_pointer_cast<FileFragment>(shared_from_this());
168+
return format_->ScanBatchesAsync(options, self);
112169
}
113170

114171
struct FileSystemDataset::FragmentSubtrees {

cpp/src/arrow/dataset/file_base.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,13 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileForma
149149
/// \brief Open a FileFragment for scanning.
150150
/// May populate lazy properties of the FileFragment.
151151
virtual Result<ScanTaskIterator> ScanFile(
152-
std::shared_ptr<ScanOptions> options,
152+
const std::shared_ptr<ScanOptions>& options,
153153
const std::shared_ptr<FileFragment>& file) const = 0;
154154

155+
virtual Result<RecordBatchGenerator> ScanBatchesAsync(
156+
const std::shared_ptr<ScanOptions>& options,
157+
const std::shared_ptr<FileFragment>& file);
158+
155159
/// \brief Open a fragment
156160
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
157161
FileSource source, Expression partition_expression,
@@ -178,6 +182,8 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileForma
178182
class ARROW_DS_EXPORT FileFragment : public Fragment {
179183
public:
180184
Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) override;
185+
Result<RecordBatchGenerator> ScanBatchesAsync(
186+
const std::shared_ptr<ScanOptions>& options) override;
181187

182188
std::string type_name() const override { return format_->type_name(); }
183189
std::string ToString() const override { return source_.path(); };

cpp/src/arrow/dataset/file_csv.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,11 +191,10 @@ Result<std::shared_ptr<Schema>> CsvFileFormat::Inspect(const FileSource& source)
191191
}
192192

193193
Result<ScanTaskIterator> CsvFileFormat::ScanFile(
194-
std::shared_ptr<ScanOptions> options,
194+
const std::shared_ptr<ScanOptions>& options,
195195
const std::shared_ptr<FileFragment>& fragment) const {
196196
auto this_ = checked_pointer_cast<const CsvFileFormat>(shared_from_this());
197-
auto task =
198-
std::make_shared<CsvScanTask>(std::move(this_), std::move(options), fragment);
197+
auto task = std::make_shared<CsvScanTask>(std::move(this_), options, fragment);
199198

200199
return MakeVectorIterator<std::shared_ptr<ScanTask>>({std::move(task)});
201200
}

cpp/src/arrow/dataset/file_csv.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
5454

5555
/// \brief Open a file for scanning
5656
Result<ScanTaskIterator> ScanFile(
57-
std::shared_ptr<ScanOptions> options,
57+
const std::shared_ptr<ScanOptions>& options,
5858
const std::shared_ptr<FileFragment>& fragment) const override;
5959

6060
Result<std::shared_ptr<FileWriter>> MakeWriter(

cpp/src/arrow/dataset/file_ipc.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,9 @@ Result<std::shared_ptr<Schema>> IpcFileFormat::Inspect(const FileSource& source)
168168
}
169169

170170
Result<ScanTaskIterator> IpcFileFormat::ScanFile(
171-
std::shared_ptr<ScanOptions> options,
171+
const std::shared_ptr<ScanOptions>& options,
172172
const std::shared_ptr<FileFragment>& fragment) const {
173-
return IpcScanTaskIterator::Make(std::move(options), std::move(fragment));
173+
return IpcScanTaskIterator::Make(options, fragment);
174174
}
175175

176176
//

cpp/src/arrow/dataset/file_ipc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
5353

5454
/// \brief Open a file for scanning
5555
Result<ScanTaskIterator> ScanFile(
56-
std::shared_ptr<ScanOptions> options,
56+
const std::shared_ptr<ScanOptions>& options,
5757
const std::shared_ptr<FileFragment>& fragment) const override;
5858

5959
Result<std::shared_ptr<FileWriter>> MakeWriter(

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
326326
}
327327

328328
Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
329-
std::shared_ptr<ScanOptions> options,
329+
const std::shared_ptr<ScanOptions>& options,
330330
const std::shared_ptr<FileFragment>& fragment) const {
331331
auto* parquet_fragment = checked_cast<ParquetFileFragment*>(fragment.get());
332332
std::vector<int> row_groups;

cpp/src/arrow/dataset/file_parquet.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
9696

9797
/// \brief Open a file for scanning
9898
Result<ScanTaskIterator> ScanFile(
99-
std::shared_ptr<ScanOptions> options,
99+
const std::shared_ptr<ScanOptions>& options,
100100
const std::shared_ptr<FileFragment>& file) const override;
101101

102102
using FileFormat::MakeFragment;

0 commit comments

Comments
 (0)