Skip to content

Commit 5fb2243

Browse files
committed
ARROW-14658: [C++] Add basic support for nested field refs in scanning
This implements the following:

- Being able to project and filter on nested fields in the scanner/query engine. Parquet, ORC, and Feather are supported/tested. For ORC and Feather, we will read the entire top-level column. (CSV does not support reading any nested types, though if it does in the future, it should behave the same as Feather/ORC.) For Parquet, we could materialize only the leaf nodes necessary for the projection, but without ARROW-1888 this will fail later on in the scanning pipeline, so we behave the same as Feather/ORC.

The following are not implemented:

- Normally, the scanner can fill in a column of nulls if a requested column does not exist in a file. This is not supported for nested field refs because we need ARROW-1888 to be implemented.
- A nested field ref cannot be used as a key/target of an aggregation or join. However, you can first project the nested fields into their own fields, then aggregate/join on them as usual. This limitation is because the aggregate/join nodes currently compute a FieldPath to resolve a FieldRef, but then throw away the path, keeping only the first index. To implement this, we would need to store the FieldPath and use the struct_field kernel to resolve the actual array; however, this will have more overhead and we should be careful about regressions here, especially in the common case of no nested field refs.
- Only FieldRefs consisting of field names are supported. For FieldRefs consisting of a FieldPath (= a sequence of indices), the semantics are unclear. So far, the scanner is robust to individual files having fields in a different order than the overall dataset, but this won't work for FieldPath, so either we must require that the schema is consistent across files, or come up with some way to map file schemas onto the dataset schema so that indices have a consistent meaning.

Closes apache#11704 from lidavidm/arrow-14658

Authored-by: David Li <li.davidm96@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent 30ddc2f commit 5fb2243

18 files changed

Lines changed: 694 additions & 124 deletions

cpp/src/arrow/compute/exec/plan_test.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,73 @@ TEST(ExecPlanExecution, SourceGroupedSum) {
793793
}
794794
}
795795

796+
// Verifies that a filter node can evaluate a predicate on a nested field
// reference: rows are kept when the "i32" child of the "struct" column is >= 5.
TEST(ExecPlanExecution, NestedSourceFilter) {
797+
// Run the plan once with parallel == false and once with parallel == true.
for (bool parallel : {false, true}) {
798+
SCOPED_TRACE(parallel ? "parallel/merged" : "serial");
799+
800+
auto input = MakeNestedBatches();
801+
// An empty batch of the struct column's type; it is one of the two expected
// outputs (the first batch produced by MakeNestedBatches has no row with
// i32 >= 5).
auto empty = ExecBatchFromJSON({input.schema->field(0)->type()}, R"([])");
802+
// The other expected output: the three rows whose "i32" values are 5, 6 and 7.
auto expected = ExecBatchFromJSON({input.schema->field(0)->type()}, R"([
803+
[{"i32": 5, "bool": null}],
804+
[{"i32": 6, "bool": false}],
805+
[{"i32": 7, "bool": false}]
806+
])");
807+
808+
ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make());
809+
AsyncGenerator<util::optional<ExecBatch>> sink_gen;
810+
811+
// Plan: source -> filter(struct.i32 >= 5) -> sink.
ASSERT_OK(Declaration::Sequence(
812+
{
813+
{"source", SourceNodeOptions{input.schema,
814+
input.gen(parallel, /*slow=*/false)}},
815+
{"filter", FilterNodeOptions{greater_equal(
816+
field_ref(FieldRef("struct", "i32")), literal(5))}},
817+
{"sink", SinkNodeOptions{&sink_gen}},
818+
})
819+
.AddToPlan(plan.get()));
820+
821+
// The order of the collected batches is not checked (UnorderedElementsAreArray).
ASSERT_THAT(StartAndCollect(plan.get(), sink_gen),
822+
Finishes(ResultWith(UnorderedElementsAreArray({empty, expected}))));
823+
}
824+
}
825+
826+
// Verifies that nested fields can be projected into top-level columns and then
// used as the target and key of a grouped aggregation.
TEST(ExecPlanExecution, NestedSourceProjectGroupedSum) {
827+
// Run the plan once with parallel == false and once with parallel == true.
for (bool parallel : {false, true}) {
828+
SCOPED_TRACE(parallel ? "parallel/merged" : "serial");
829+
830+
auto input = MakeNestedBatches();
831+
// One expected output batch with columns (sum of i32: int64, bool key):
// key true -> null, key false -> 17, key null -> 5.
auto expected = ExecBatchFromJSON({int64(), boolean()}, R"([
832+
[null, true],
833+
[17, false],
834+
[5, null]
835+
])");
836+
837+
ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make());
838+
AsyncGenerator<util::optional<ExecBatch>> sink_gen;
839+
840+
// Plan: source -> project(struct.i32 as "i32", struct.bool as "bool")
//       -> aggregate(hash_sum of "i32" keyed by "bool") -> sink.
// The aggregate node refers to the projected top-level names rather than to
// the nested field refs themselves.
ASSERT_OK(
841+
Declaration::Sequence(
842+
{
843+
{"source",
844+
SourceNodeOptions{input.schema, input.gen(parallel, /*slow=*/false)}},
845+
{"project", ProjectNodeOptions{{
846+
field_ref(FieldRef("struct", "i32")),
847+
field_ref(FieldRef("struct", "bool")),
848+
},
849+
{"i32", "bool"}}},
850+
{"aggregate", AggregateNodeOptions{/*aggregates=*/{{"hash_sum", nullptr}},
851+
/*targets=*/{"i32"},
852+
/*names=*/{"sum(i32)"},
853+
/*keys=*/{"bool"}}},
854+
{"sink", SinkNodeOptions{&sink_gen}},
855+
})
856+
.AddToPlan(plan.get()));
857+
858+
// Exactly one batch is expected from the sink.
ASSERT_THAT(StartAndCollect(plan.get(), sink_gen),
859+
Finishes(ResultWith(UnorderedElementsAreArray({expected}))));
860+
}
861+
}
862+
796863
TEST(ExecPlanExecution, SourceFilterProjectGroupedSumFilter) {
797864
for (bool parallel : {false, true}) {
798865
SCOPED_TRACE(parallel ? "parallel/merged" : "serial");

cpp/src/arrow/compute/exec/test_util.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,20 @@ BatchesWithSchema MakeBasicBatches() {
190190
return out;
191191
}
192192

193+
// Builds a small test fixture whose single column, named "struct", has type
// struct<i32: int32, bool: boolean>.
//
// Two batches of three rows each are produced:
//   - the first mixes a null "i32" child, a fully populated row and a null struct;
//   - the second holds i32 values 5, 6 and 7, the first with a null "bool" child.
BatchesWithSchema MakeNestedBatches() {
  const auto nested_type = struct_({field("i32", int32()), field("bool", boolean())});

  BatchesWithSchema result;
  result.schema = schema({field("struct", nested_type)});

  result.batches.push_back(ExecBatchFromJSON(
      {nested_type},
      R"([[{"i32": null, "bool": true}], [{"i32": 4, "bool": false}], [null]])"));
  result.batches.push_back(ExecBatchFromJSON(
      {nested_type},
      R"([[{"i32": 5, "bool": null}], [{"i32": 6, "bool": false}], [{"i32": 7, "bool": false}]])"));

  return result;
}
206+
193207
BatchesWithSchema MakeRandomBatches(const std::shared_ptr<Schema>& schema,
194208
int num_batches, int batch_size) {
195209
BatchesWithSchema out;

cpp/src/arrow/compute/exec/test_util.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ Future<std::vector<ExecBatch>> StartAndCollect(
8787
ARROW_TESTING_EXPORT
8888
BatchesWithSchema MakeBasicBatches();
8989

90+
ARROW_TESTING_EXPORT
91+
BatchesWithSchema MakeNestedBatches();
92+
9093
ARROW_TESTING_EXPORT
9194
BatchesWithSchema MakeRandomBatches(const std::shared_ptr<Schema>& schema,
9295
int num_batches = 10, int batch_size = 4);

cpp/src/arrow/dataset/file_csv.cc

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,28 @@ static inline Result<csv::ConvertOptions> GetConvertOptions(
116116

117117
if (!scan_options) return convert_options;
118118

119-
auto materialized = scan_options->MaterializedFields();
120-
std::unordered_set<std::string> materialized_fields(materialized.begin(),
121-
materialized.end());
119+
auto field_refs = scan_options->MaterializedFields();
120+
std::unordered_set<std::string> materialized_fields;
121+
materialized_fields.reserve(field_refs.size());
122+
// Preprocess field refs. We try to avoid FieldRef::GetFoo here since that's
123+
// quadratic (and this is significant overhead with 1000+ columns)
124+
for (const auto& ref : field_refs) {
125+
if (const std::string* name = ref.name()) {
126+
// Common case
127+
materialized_fields.emplace(*name);
128+
continue;
129+
}
130+
// Currently CSV reader doesn't support reading any nested types, so this
131+
// path shouldn't be hit. However, implement it in the same way as IPC/ORC:
132+
// load the entire top-level field if a nested field is selected.
133+
ARROW_ASSIGN_OR_RAISE(auto field, ref.GetOneOrNone(*scan_options->dataset_schema));
134+
if (column_names.find(field->name()) == column_names.end()) continue;
135+
// Only read the requested columns
136+
convert_options.include_columns.push_back(field->name());
137+
// Properly set conversion types
138+
convert_options.column_types[field->name()] = field->type();
139+
}
140+
122141
for (auto field : scan_options->dataset_schema->fields()) {
123142
if (materialized_fields.find(field->name()) == materialized_fields.end()) continue;
124143
// Ignore virtual columns.

cpp/src/arrow/dataset/file_csv_test.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -384,13 +384,20 @@ class TestCsvFileFormatScan : public FileFormatScanMixin<CsvFormatHelper> {};
384384

385385
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReader) { TestScan(); }
386386
TEST_P(TestCsvFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
387-
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
388-
TestScanWithVirtualColumn();
389-
}
390387
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
388+
// NOTE(ARROW-14658): TestScanProjectedNested is ignored since CSV
389+
// doesn't have any nested types for us to work with
391390
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderProjectedMissingCols) {
392391
TestScanProjectedMissingCols();
393392
}
393+
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
394+
TestScanWithVirtualColumn();
395+
}
396+
// The CSV reader rejects duplicate columns, so skip
397+
// ScanRecordBatchReaderWithDuplicateColumn
398+
TEST_P(TestCsvFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
399+
TestScanWithDuplicateColumnError();
400+
}
394401

395402
INSTANTIATE_TEST_SUITE_P(TestScan, TestCsvFileFormatScan,
396403
::testing::ValuesIn(TestFormatParams::Values()),

cpp/src/arrow/dataset/file_ipc.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@ static inline Future<std::shared_ptr<ipc::RecordBatchFileReader>> OpenReaderAsyn
7575
}
7676

7777
static inline Result<std::vector<int>> GetIncludedFields(
78-
const Schema& schema, const std::vector<std::string>& materialized_fields) {
78+
const Schema& schema, const std::vector<FieldRef>& materialized_fields) {
7979
std::vector<int> included_fields;
8080

81-
for (FieldRef ref : materialized_fields) {
81+
for (const auto& ref : materialized_fields) {
8282
ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOneOrNone(schema));
8383
if (match.indices().empty()) continue;
8484

cpp/src/arrow/dataset/file_ipc_test.cc

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,22 @@ class TestIpcFileFormatScan : public FileFormatScanMixin<IpcFormatHelper> {};
135135

136136
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReader) { TestScan(); }
137137
TEST_P(TestIpcFileFormatScan, ScanBatchSize) { TestScanBatchSize(); }
138-
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
139-
TestScanWithVirtualColumn();
140-
}
141138
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
139+
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderProjectedNested) {
140+
TestScanProjectedNested();
141+
}
142142
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderProjectedMissingCols) {
143143
TestScanProjectedMissingCols();
144144
}
145+
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
146+
TestScanWithVirtualColumn();
147+
}
148+
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumn) {
149+
TestScanWithDuplicateColumn();
150+
}
151+
TEST_P(TestIpcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
152+
TestScanWithDuplicateColumnError();
153+
}
145154
TEST_P(TestIpcFileFormatScan, FragmentScanOptions) {
146155
auto reader = GetRecordBatchReader(
147156
// ARROW-12077: on Windows/mimalloc/release, nullable list column leads to crash

cpp/src/arrow/dataset/file_orc.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,11 @@ class OrcScanTask {
7979
// filter out virtual columns
8080
std::vector<std::string> included_fields;
8181
ARROW_ASSIGN_OR_RAISE(auto schema, reader->ReadSchema());
82-
for (auto name : materialized_fields) {
83-
FieldRef ref(name);
82+
for (const auto& ref : materialized_fields) {
8483
ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOneOrNone(*schema));
8584
if (match.indices().empty()) continue;
8685

87-
included_fields.push_back(name);
86+
included_fields.push_back(schema->field(match.indices()[0])->name());
8887
}
8988

9089
return RecordBatchIterator(

cpp/src/arrow/dataset/file_orc_test.cc

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,25 @@ TEST_F(TestOrcFileFormat, CountRows) { TestCountRows(); }
6969
class TestOrcFileFormatScan : public FileFormatScanMixin<OrcFormatHelper> {};
7070

7171
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReader) { TestScan(); }
72-
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
73-
TestScanWithVirtualColumn();
72+
TEST_P(TestOrcFileFormatScan, ScanBatchSize) {
73+
// TODO(ARROW-14153): TestScanBatchSize();
7474
}
7575
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
76+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjectedNested) {
77+
TestScanProjectedNested();
78+
}
7679
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjectedMissingCols) {
7780
TestScanProjectedMissingCols();
7881
}
82+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
83+
TestScanWithVirtualColumn();
84+
}
85+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumn) {
86+
TestScanWithDuplicateColumn();
87+
}
88+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithDuplicateColumnError) {
89+
TestScanWithDuplicateColumnError();
90+
}
7991
INSTANTIATE_TEST_SUITE_P(TestScan, TestOrcFileFormatScan,
8092
::testing::ValuesIn(TestFormatParams::Values()),
8193
TestFormatParams::ToTestNameString);

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 102 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -155,30 +155,112 @@ void AddColumnIndices(const SchemaField& schema_field,
155155
}
156156
}
157157

158-
// Compute the column projection out of an optional arrow::Schema
159-
std::vector<int> InferColumnProjection(const parquet::arrow::FileReader& reader,
160-
const ScanOptions& options) {
158+
// Resolves one materialized FieldRef to Parquet leaf column indices and appends
// them to `columns_selection`.
//
// \param manifest           schema manifest of the file (kept for interface
//                           compatibility; not consulted by this function)
// \param field_ref          reference to resolve; either a single name or a
//                           nested sequence of names
// \param field_lookup       top-level field name -> schema field. When a name
//                           occurs more than once, the caller stores only one
//                           of the occurrences here.
// \param duplicate_fields   top-level names that occur more than once
// \param columns_selection  output; receives the column indices of the
//                           referenced top-level field
//
// \return Status::OK on success, including for "virtual" columns that are not
//         present in the file (nothing is appended for them);
//         Status::Invalid if the reference is ambiguous;
//         Status::NotImplemented for references that are not (sequences of)
//         field names.
Status ResolveOneFieldRef(
    const SchemaManifest& manifest, const FieldRef& field_ref,
    const std::unordered_map<std::string, const SchemaField*>& field_lookup,
    const std::unordered_set<std::string>& duplicate_fields,
    std::vector<int>* columns_selection) {
  // Looks up a top-level field by name. `*out` is left null for a virtual
  // column. Duplicates are checked *first*: a duplicated name is also present
  // in `field_lookup` (one occurrence is stored there), so consulting the
  // lookup table first would silently pick that occurrence and never report
  // the ambiguity.
  auto find_toplevel = [&](const std::string& name, const SchemaField** out) -> Status {
    *out = nullptr;
    if (duplicate_fields.find(name) != duplicate_fields.end()) {
      // We shouldn't generally get here because SetProjection will reject such references
      return Status::Invalid("Ambiguous reference to column '", name,
                             "' which occurs more than once");
    }
    auto it = field_lookup.find(name);
    if (it != field_lookup.end()) {
      *out = it->second;
    }
    return Status::OK();
  };

  // Common case: a plain top-level column name.
  if (const std::string* name = field_ref.name()) {
    const SchemaField* toplevel = nullptr;
    RETURN_NOT_OK(find_toplevel(*name, &toplevel));
    if (toplevel) {
      AddColumnIndices(*toplevel, columns_selection);
    }
    // Otherwise: "virtual" column, i.e. the field is not in the file but is in
    // the ScanOptions. Ignore it here, as projection will pad the batch with a
    // null column.
    return Status::OK();
  }

  const std::vector<FieldRef>* refs = field_ref.nested_refs();
  if (!refs) {
    return Status::NotImplemented("Inferring column projection from FieldRef ",
                                  field_ref.ToString());
  }

  const SchemaField* toplevel = nullptr;
  const SchemaField* field = nullptr;
  // Only supports a sequence of names
  for (const auto& ref : *refs) {
    const std::string* name = ref.name();
    if (!name) {
      return Status::NotImplemented("Inferring column projection from FieldRef ",
                                    field_ref.ToString());
    }

    if (!field) {
      // First lookup, top-level field
      RETURN_NOT_OK(find_toplevel(*name, &toplevel));
      if (!toplevel) {
        // Virtual column
        return Status::OK();
      }
      field = toplevel;
      continue;
    }

    // Subsequent lookups: linear scan over the children of the current field,
    // rejecting names that match more than one child.
    const SchemaField* result = nullptr;
    for (const auto& child : field->children) {
      if (child.field->name() != *name) continue;
      if (result) {
        return Status::Invalid("Ambiguous nested reference to column '", *name,
                               "' which occurs more than once in field ",
                               field->field->ToString());
      }
      result = &child;
    }
    if (!result) {
      // Virtual column
      return Status::OK();
    }
    field = result;
  }

  if (field) {
    // TODO(ARROW-1888): support fine-grained column projection. We should be
    // able to materialize only the child fields requested, and not the entire
    // top-level field.
    // Right now, if enabled, projection/filtering will fail when they cast the
    // physical schema to the dataset schema.
    AddColumnIndices(*toplevel, columns_selection);
  }
  return Status::OK();
}
235+
236+
// Compute the column projection based on the scan options
237+
Result<std::vector<int>> InferColumnProjection(const parquet::arrow::FileReader& reader,
238+
const ScanOptions& options) {
161239
auto manifest = reader.manifest();
162240
// Checks if the field is needed in either the projection or the filter.
163-
auto field_names = options.MaterializedFields();
164-
std::unordered_set<std::string> materialized_fields{field_names.cbegin(),
165-
field_names.cend()};
166-
auto should_materialize_column = [&materialized_fields](const std::string& f) {
167-
return materialized_fields.find(f) != materialized_fields.end();
168-
};
169-
170-
std::vector<int> columns_selection;
171-
// Note that the loop is using the file's schema to iterate instead of the
172-
// materialized fields of the ScanOptions. This ensures that missing
173-
// fields in the file (but present in the ScanOptions) will be ignored. The
174-
// scanner's projector will take care of padding the column with the proper
175-
// values.
241+
auto field_refs = options.MaterializedFields();
242+
243+
// Build a lookup table from top level field name to field metadata.
244+
// This is to avoid quadratic-time mapping of projected fields to
245+
// column indices, in the common case of selecting top level
246+
// columns. For nested fields, we will pay the cost of a linear scan
247+
// assuming for now that this is relatively rare, but this can be
248+
// optimized. (Also, we don't want to pay the cost of building all
249+
// the lookup tables up front if they're rarely used.)
250+
std::unordered_map<std::string, const SchemaField*> field_lookup;
251+
std::unordered_set<std::string> duplicate_fields;
176252
for (const auto& schema_field : manifest.schema_fields) {
177-
if (should_materialize_column(schema_field.field->name())) {
178-
AddColumnIndices(schema_field, &columns_selection);
253+
const auto it = field_lookup.emplace(schema_field.field->name(), &schema_field);
254+
if (!it.second) {
255+
duplicate_fields.emplace(schema_field.field->name());
179256
}
180257
}
181258

259+
std::vector<int> columns_selection;
260+
for (const auto& ref : field_refs) {
261+
RETURN_NOT_OK(ResolveOneFieldRef(manifest, ref, field_lookup, duplicate_fields,
262+
&columns_selection));
263+
}
182264
return columns_selection;
183265
}
184266

@@ -351,7 +433,8 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
351433
parquet_fragment->FilterRowGroups(options->filter));
352434
if (row_groups.empty()) return MakeEmptyGenerator<std::shared_ptr<RecordBatch>>();
353435
}
354-
auto column_projection = InferColumnProjection(*reader, *options);
436+
ARROW_ASSIGN_OR_RAISE(auto column_projection,
437+
InferColumnProjection(*reader, *options));
355438
ARROW_ASSIGN_OR_RAISE(
356439
auto parquet_scan_options,
357440
GetFragmentScanOptions<ParquetFragmentScanOptions>(

0 commit comments

Comments
 (0)