Skip to content

Commit c287cfe

Browse files
authored
apacheGH-15264: [C++] Add scanner tests for disabling readahead and fix relevant bugs (apache#29185)
* Closes: apache#15264 Authored-by: Weston Pace <weston.pace@gmail.com> Signed-off-by: Weston Pace <weston.pace@gmail.com>
1 parent 52def4e commit c287cfe

8 files changed

Lines changed: 44 additions & 6 deletions

File tree

cpp/src/arrow/dataset/file_csv_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,7 @@ class TestCsvFileFormatScan : public FileFormatScanMixin<CsvFormatHelper> {};
491491

492492
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReader) { TestScan(); }
493493
TEST_P(TestCsvFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
494+
TEST_P(TestCsvFileFormatScan, ScanNoReadhead) { TestScanNoReadahead(); }
494495
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
495496
// NOTE(ARROW-14658): TestScanProjectedNested is ignored since CSV
496497
// doesn't have any nested types for us to work with

cpp/src/arrow/dataset/file_ipc.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ Result<RecordBatchGenerator> IpcFileFormat::ScanBatchesAsync(
171171
}
172172
WRAP_ASYNC_GENERATOR_WITH_CHILD_SPAN(
173173
generator, "arrow::dataset::IpcFileFormat::ScanBatchesAsync::Next");
174+
if (readahead_level == 0) {
175+
return MakeChunkedBatchGenerator(std::move(generator), options->batch_size);
176+
}
174177
auto batch_generator = MakeReadaheadGenerator(std::move(generator), readahead_level);
175178
return MakeChunkedBatchGenerator(std::move(batch_generator), options->batch_size);
176179
};

cpp/src/arrow/dataset/file_ipc_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ class TestIpcFileFormatScan : public FileFormatScanMixin<IpcFormatHelper> {};
135135

136136
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReader) { TestScan(); }
137137
TEST_P(TestIpcFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
138+
TEST_P(TestIpcFileFormatScan, ScanNoReadahead) { TestScanNoReadahead(); }
138139
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
139140
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderProjectedNested) {
140141
TestScanProjectedNested();

cpp/src/arrow/dataset/file_orc_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ class TestOrcFileFormatScan : public FileFormatScanMixin<OrcFormatHelper> {};
7070

7171
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReader) { TestScan(); }
7272
TEST_P(TestOrcFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
73+
TEST_P(TestOrcFileFormatScan, ScanNoReadahead) { TestScanNoReadahead(); }
7374
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
7475
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjectedNested) {
7576
TestScanProjectedNested();

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
476476
::arrow::internal::GetCpuThreadPool(), rows_to_readahead));
477477
RecordBatchGenerator sliced =
478478
SlicingGenerator(std::move(generator), options->batch_size);
479+
if (batch_readahead == 0) {
480+
return sliced;
481+
}
479482
RecordBatchGenerator sliced_readahead =
480483
MakeSerialReadaheadGenerator(std::move(sliced), batch_readahead);
481484
return sliced_readahead;

cpp/src/arrow/dataset/file_parquet_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ class TestParquetFileFormatScan : public FileFormatScanMixin<ParquetFormatHelper
389389

390390
TEST_P(TestParquetFileFormatScan, ScanRecordBatchReader) { TestScan(); }
391391
TEST_P(TestParquetFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
392+
TEST_P(TestParquetFileFormatScan, ScanNoReadahead) { TestScanNoReadahead(); }
392393
TEST_P(TestParquetFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
393394
TEST_P(TestParquetFileFormatScan, ScanRecordBatchReaderProjectedNested) {
394395
// TODO(ARROW-1888): enable fine-grained column projection.

cpp/src/arrow/dataset/scanner.cc

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -988,16 +988,25 @@ Result<compute::ExecNode*> MakeScanNode(compute::ExecPlan* plan,
988988

989989
AsyncGenerator<EnumeratedRecordBatch> merged_batch_gen;
990990
if (require_sequenced_output) {
991-
ARROW_ASSIGN_OR_RAISE(merged_batch_gen,
992-
MakeSequencedMergedGenerator(std::move(batch_gen_gen),
993-
scan_options->fragment_readahead));
991+
if (scan_options->fragment_readahead > 1) {
992+
ARROW_ASSIGN_OR_RAISE(merged_batch_gen, MakeSequencedMergedGenerator(
993+
std::move(batch_gen_gen),
994+
scan_options->fragment_readahead));
995+
} else {
996+
merged_batch_gen = MakeConcatenatedGenerator(std::move(batch_gen_gen));
997+
}
994998
} else {
995999
merged_batch_gen =
9961000
MakeMergedGenerator(std::move(batch_gen_gen), scan_options->fragment_readahead);
9971001
}
9981002

999-
auto batch_gen = MakeReadaheadGenerator(std::move(merged_batch_gen),
1000-
scan_options->fragment_readahead);
1003+
AsyncGenerator<EnumeratedRecordBatch> batch_gen;
1004+
if (scan_options->fragment_readahead > 1) {
1005+
batch_gen = MakeReadaheadGenerator(std::move(merged_batch_gen),
1006+
scan_options->fragment_readahead);
1007+
} else {
1008+
batch_gen = std::move(merged_batch_gen);
1009+
}
10011010

10021011
auto gen = MakeMappedGenerator(
10031012
std::move(batch_gen),

cpp/src/arrow/dataset/test_util.h

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -604,11 +604,16 @@ class FileFormatScanMixin : public FileFormatFixtureMixin<FormatHelper>,
604604
}
605605

606606
// Scan the fragment through the scanner.
607-
RecordBatchIterator Batches(std::shared_ptr<Fragment> fragment) {
607+
RecordBatchIterator Batches(std::shared_ptr<Fragment> fragment,
608+
bool use_readahead = true) {
608609
auto dataset = std::make_shared<FragmentDataset>(opts_->dataset_schema,
609610
FragmentVector{fragment});
610611
ScannerBuilder builder(dataset, opts_);
611612
ARROW_EXPECT_OK(builder.UseThreads(GetParam().use_threads));
613+
if (!use_readahead) {
614+
ARROW_EXPECT_OK(builder.FragmentReadahead(0));
615+
ARROW_EXPECT_OK(builder.BatchReadahead(0));
616+
}
612617
EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish());
613618
EXPECT_OK_AND_ASSIGN(auto batch_it, scanner->ScanBatches());
614619
return MakeMapIterator([](TaggedRecordBatch tagged) { return tagged.record_batch; },
@@ -656,6 +661,20 @@ class FileFormatScanMixin : public FileFormatFixtureMixin<FormatHelper>,
656661
}
657662
ASSERT_EQ(row_count, GetParam().expected_rows());
658663
}
664+
void TestScanNoReadahead() {
665+
auto reader = GetRecordBatchReader(schema({field("f64", float64())}));
666+
auto source = this->GetFileSource(reader.get());
667+
668+
this->SetSchema(reader->schema()->fields());
669+
auto fragment = this->MakeFragment(*source);
670+
671+
int64_t row_count = 0;
672+
for (auto maybe_batch : Batches(fragment, /*use_readahead=*/false)) {
673+
ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
674+
row_count += batch->num_rows();
675+
}
676+
ASSERT_EQ(row_count, GetParam().expected_rows());
677+
}
659678
// Ensure file formats only return columns needed to fulfill filter/projection
660679
void TestScanProjected() {
661680
auto f32 = field("f32", float32());

0 commit comments

Comments
 (0)