Skip to content

Commit bfdfeda

Browse files
committed
ARROW-8943: [C++][Python][Dataset] Add partitioning support to ParquetDatasetFactory
Closes apache#7437 from fsaintjacques/ARROW-8943-parquet-dataset-partition Authored-by: François Saint-Jacques <fsaintjacques@gmail.com> Signed-off-by: François Saint-Jacques <fsaintjacques@gmail.com>
1 parent d08af2f commit bfdfeda

12 files changed

Lines changed: 320 additions & 86 deletions

File tree

cpp/examples/arrow/dataset-parquet-scan-example.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ std::shared_ptr<ds::Dataset> GetDatasetFromDirectory(
100100
std::shared_ptr<ds::Dataset> GetParquetDatasetFromMetadata(
101101
std::shared_ptr<fs::FileSystem> fs, std::shared_ptr<ds::ParquetFileFormat> format,
102102
std::string metadata_path) {
103-
auto factory = ds::ParquetDatasetFactory::Make(metadata_path, fs, format).ValueOrDie();
103+
ds::ParquetFactoryOptions options;
104+
auto factory =
105+
ds::ParquetDatasetFactory::Make(metadata_path, fs, format, options).ValueOrDie();
104106
return factory->Finish().ValueOrDie();
105107
}
106108

cpp/src/arrow/dataset/discovery.cc

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,6 @@ FileSystemDatasetFactory::FileSystemDatasetFactory(
110110
format_(std::move(format)),
111111
options_(std::move(options)) {}
112112

113-
util::optional<util::string_view> FileSystemDatasetFactory::RemovePartitionBaseDir(
114-
util::string_view path) {
115-
const util::string_view partition_base_dir{options_.partition_base_dir};
116-
return fs::internal::RemoveAncestor(partition_base_dir, path);
117-
}
118-
119113
Result<std::shared_ptr<DatasetFactory>> FileSystemDatasetFactory::Make(
120114
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
121115
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options) {
@@ -186,23 +180,6 @@ Result<std::shared_ptr<DatasetFactory>> FileSystemDatasetFactory::Make(
186180
std::move(options));
187181
}
188182

189-
Result<std::shared_ptr<Schema>> FileSystemDatasetFactory::PartitionSchema() {
190-
if (auto partitioning = options_.partitioning.partitioning()) {
191-
return partitioning->schema();
192-
}
193-
194-
std::vector<std::string> relative_paths;
195-
for (const auto& path : paths_) {
196-
if (auto relative = RemovePartitionBaseDir(path)) {
197-
auto relative_str = relative->to_string();
198-
auto basename_filename = fs::internal::GetAbstractPathParent(relative_str);
199-
relative_paths.push_back(basename_filename.first);
200-
}
201-
}
202-
203-
return options_.partitioning.factory()->Inspect(relative_paths);
204-
}
205-
206183
Result<std::vector<std::shared_ptr<Schema>>> FileSystemDatasetFactory::InspectSchemas(
207184
InspectOptions options) {
208185
std::vector<std::shared_ptr<Schema>> schemas;
@@ -215,7 +192,9 @@ Result<std::vector<std::shared_ptr<Schema>>> FileSystemDatasetFactory::InspectSc
215192
schemas.push_back(schema);
216193
}
217194

218-
ARROW_ASSIGN_OR_RAISE(auto partition_schema, PartitionSchema());
195+
ARROW_ASSIGN_OR_RAISE(auto partition_schema,
196+
options_.partitioning.GetOrInferSchema(
197+
StripPrefixAndFilename(paths_, options_.partition_base_dir)));
219198
schemas.push_back(partition_schema);
220199

221200
return schemas;
@@ -245,13 +224,8 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
245224

246225
std::vector<std::shared_ptr<FileFragment>> fragments;
247226
for (const auto& path : paths_) {
248-
std::shared_ptr<Expression> partition = scalar(true);
249-
if (auto relative = RemovePartitionBaseDir(path)) {
250-
auto relative_str = relative->to_string();
251-
auto basename_filename = fs::internal::GetAbstractPathParent(relative_str);
252-
ARROW_ASSIGN_OR_RAISE(partition, partitioning->Parse(basename_filename.first));
253-
}
254-
227+
auto fixed_path = StripPrefixAndFilename(path, options_.partition_base_dir);
228+
ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path));
255229
ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({path, fs_}, partition));
256230
fragments.push_back(fragment);
257231
}

cpp/src/arrow/dataset/discovery.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,6 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
233233
std::shared_ptr<fs::FileSystem> fs_;
234234
std::shared_ptr<FileFormat> format_;
235235
FileSystemFactoryOptions options_;
236-
237-
private:
238-
util::optional<util::string_view> RemovePartitionBaseDir(util::string_view path);
239236
};
240237

241238
} // namespace dataset

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 89 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -500,44 +500,49 @@ Result<FragmentVector> ParquetFileFragment::SplitByRowGroup(
500500

501501
ParquetDatasetFactory::ParquetDatasetFactory(
502502
std::shared_ptr<fs::FileSystem> filesystem, std::shared_ptr<ParquetFileFormat> format,
503-
std::shared_ptr<parquet::FileMetaData> metadata, std::string base_path)
503+
std::shared_ptr<parquet::FileMetaData> metadata, std::string base_path,
504+
ParquetFactoryOptions options)
504505
: filesystem_(std::move(filesystem)),
505506
format_(std::move(format)),
506507
metadata_(std::move(metadata)),
507-
base_path_(std::move(base_path)) {}
508+
base_path_(std::move(base_path)),
509+
options_(std::move(options)) {}
508510

509511
Result<std::shared_ptr<DatasetFactory>> ParquetDatasetFactory::Make(
510512
const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
511-
std::shared_ptr<ParquetFileFormat> format) {
513+
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options) {
512514
// Paths in ColumnChunk are relative to the `_metadata` file. Thus, the base
513515
// directory of all parquet files is `dirname(metadata_path)`.
514516
auto dirname = arrow::fs::internal::GetAbstractPathParent(metadata_path).first;
515-
return Make({metadata_path, filesystem}, dirname, filesystem, format);
517+
return Make({metadata_path, filesystem}, dirname, filesystem, std::move(format),
518+
std::move(options));
516519
}
517520

518521
Result<std::shared_ptr<DatasetFactory>> ParquetDatasetFactory::Make(
519522
const FileSource& metadata_source, const std::string& base_path,
520-
std::shared_ptr<fs::FileSystem> filesystem,
521-
std::shared_ptr<ParquetFileFormat> format) {
523+
std::shared_ptr<fs::FileSystem> filesystem, std::shared_ptr<ParquetFileFormat> format,
524+
ParquetFactoryOptions options) {
522525
DCHECK_NE(filesystem, nullptr);
523526
DCHECK_NE(format, nullptr);
524527

528+
// By automatically setting the options base_dir to the metadata's base_path,
529+
// we provide a better experience for users providing a Partitioning that is
530+
// relative to the base_dir instead of the full path.
531+
if (options.partition_base_dir.empty()) {
532+
options.partition_base_dir = base_path;
533+
}
534+
525535
ARROW_ASSIGN_OR_RAISE(auto reader, format->GetReader(metadata_source));
526536
auto metadata = reader->parquet_reader()->metadata();
527537

528-
return std::shared_ptr<DatasetFactory>(new ParquetDatasetFactory(
529-
std::move(filesystem), std::move(format), std::move(metadata), base_path));
530-
}
531-
532-
Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchemas(
533-
InspectOptions options) {
534-
std::shared_ptr<Schema> schema;
535-
RETURN_NOT_OK(parquet::arrow::FromParquetSchema(metadata_->schema(), &schema));
536-
return std::vector<std::shared_ptr<Schema>>{schema};
538+
return std::shared_ptr<DatasetFactory>(
539+
new ParquetDatasetFactory(std::move(filesystem), std::move(format),
540+
std::move(metadata), base_path, std::move(options)));
537541
}
538542

539-
static Result<std::string> FileFromRowGroup(const std::string& base_path,
540-
const parquet::RowGroupMetaData& row_group) {
543+
static inline Result<std::string> FileFromRowGroup(
544+
fs::FileSystem* filesystem, const std::string& base_path,
545+
const parquet::RowGroupMetaData& row_group) {
541546
try {
542547
auto n_columns = row_group.num_columns();
543548
if (n_columns == 0) {
@@ -565,17 +570,43 @@ static Result<std::string> FileFromRowGroup(const std::string& base_path,
565570
}
566571
}
567572

568-
return fs::internal::JoinAbstractPath(std::vector<std::string>{base_path, path});
573+
path = fs::internal::JoinAbstractPath(std::vector<std::string>{base_path, path});
574+
// Normalizing path is required for Windows.
575+
return filesystem->NormalizePath(std::move(path));
569576
} catch (const ::parquet::ParquetException& e) {
570577
return Status::Invalid("Extracting file path from RowGroup failed. Parquet threw:",
571578
e.what());
572579
}
573580
}
574581

582+
Result<std::vector<std::string>> ParquetDatasetFactory::CollectPaths(
583+
const parquet::FileMetaData& metadata,
584+
const parquet::ArrowReaderProperties& properties) {
585+
try {
586+
std::unordered_set<std::string> unique_paths;
587+
ARROW_ASSIGN_OR_RAISE(auto manifest, GetSchemaManifest(metadata, properties));
588+
589+
for (int i = 0; i < metadata.num_row_groups(); i++) {
590+
auto row_group = metadata.RowGroup(i);
591+
ARROW_ASSIGN_OR_RAISE(auto path,
592+
FileFromRowGroup(filesystem_.get(), base_path_, *row_group));
593+
unique_paths.emplace(std::move(path));
594+
}
595+
596+
std::vector<std::string> paths;
597+
for (const auto& path : unique_paths) {
598+
paths.emplace_back(path);
599+
}
600+
return paths;
601+
} catch (const ::parquet::ParquetException& e) {
602+
return Status::Invalid("Could not infer file paths from FileMetaData:", e.what());
603+
}
604+
}
605+
575606
Result<std::vector<std::shared_ptr<FileFragment>>>
576607
ParquetDatasetFactory::CollectParquetFragments(
577608
const parquet::FileMetaData& metadata,
578-
const parquet::ArrowReaderProperties& properties) {
609+
const parquet::ArrowReaderProperties& properties, const Partitioning& partitioning) {
579610
try {
580611
auto n_columns = metadata.num_columns();
581612
if (n_columns == 0) {
@@ -584,14 +615,12 @@ ParquetDatasetFactory::CollectParquetFragments(
584615
}
585616

586617
std::unordered_map<std::string, std::vector<RowGroupInfo>> path_to_row_group_infos;
587-
588618
ARROW_ASSIGN_OR_RAISE(auto manifest, GetSchemaManifest(metadata, properties));
589619

590620
for (int i = 0; i < metadata.num_row_groups(); i++) {
591621
auto row_group = metadata.RowGroup(i);
592-
ARROW_ASSIGN_OR_RAISE(auto path, FileFromRowGroup(base_path_, *row_group));
593-
// Normalizing path is required for Windows.
594-
ARROW_ASSIGN_OR_RAISE(path, filesystem_->NormalizePath(std::move(path)));
622+
ARROW_ASSIGN_OR_RAISE(auto path,
623+
FileFromRowGroup(filesystem_.get(), base_path_, *row_group));
595624
auto stats = RowGroupStatisticsAsExpression(*row_group, manifest);
596625
auto num_rows = row_group->num_rows();
597626

@@ -611,9 +640,13 @@ ParquetDatasetFactory::CollectParquetFragments(
611640
std::vector<std::shared_ptr<FileFragment>> fragments;
612641
fragments.reserve(path_to_row_group_infos.size());
613642
for (auto&& elem : path_to_row_group_infos) {
614-
ARROW_ASSIGN_OR_RAISE(auto fragment,
615-
format_->MakeFragment({std::move(elem.first), filesystem_},
616-
scalar(true), std::move(elem.second)));
643+
const auto& path = elem.first;
644+
auto partition =
645+
partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
646+
.ValueOr(scalar(true));
647+
ARROW_ASSIGN_OR_RAISE(
648+
auto fragment, format_->MakeFragment({path, filesystem_}, std::move(partition),
649+
std::move(elem.second)));
617650
fragments.push_back(std::move(fragment));
618651
}
619652

@@ -623,15 +656,44 @@ ParquetDatasetFactory::CollectParquetFragments(
623656
}
624657
}
625658

659+
Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchemas(
660+
InspectOptions options) {
661+
std::vector<std::shared_ptr<Schema>> schemas;
662+
663+
std::shared_ptr<Schema> schema;
664+
RETURN_NOT_OK(parquet::arrow::FromParquetSchema(metadata_->schema(), &schema));
665+
schemas.push_back(std::move(schema));
666+
667+
if (options_.partitioning.factory() != nullptr) {
668+
// Gather paths found in RowGroups' ColumnChunks.
669+
auto properties = MakeArrowReaderProperties(*format_, *metadata_);
670+
ARROW_ASSIGN_OR_RAISE(auto paths, CollectPaths(*metadata_, properties));
671+
672+
ARROW_ASSIGN_OR_RAISE(auto partition_schema,
673+
options_.partitioning.GetOrInferSchema(StripPrefixAndFilename(
674+
paths, options_.partition_base_dir)));
675+
schemas.push_back(std::move(partition_schema));
676+
}
677+
678+
return schemas;
679+
}
680+
626681
Result<std::shared_ptr<Dataset>> ParquetDatasetFactory::Finish(FinishOptions options) {
627682
std::shared_ptr<Schema> schema = options.schema;
628683
bool schema_missing = schema == nullptr;
629684
if (schema_missing) {
630685
ARROW_ASSIGN_OR_RAISE(schema, Inspect(options.inspect_options));
631686
}
632687

688+
std::shared_ptr<Partitioning> partitioning = options_.partitioning.partitioning();
689+
if (partitioning == nullptr) {
690+
auto factory = options_.partitioning.factory();
691+
ARROW_ASSIGN_OR_RAISE(partitioning, factory->Finish(schema));
692+
}
693+
633694
auto properties = MakeArrowReaderProperties(*format_, *metadata_);
634-
ARROW_ASSIGN_OR_RAISE(auto fragments, CollectParquetFragments(*metadata_, properties));
695+
ARROW_ASSIGN_OR_RAISE(auto fragments,
696+
CollectParquetFragments(*metadata_, properties, *partitioning));
635697
return FileSystemDataset::Make(std::move(schema), scalar(true), format_,
636698
std::move(fragments));
637699
}

cpp/src/arrow/dataset/file_parquet.h

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,34 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
215215
friend class ParquetFileFormat;
216216
};
217217

218+
struct ParquetFactoryOptions {
219+
// Either an explicit Partitioning or a PartitioningFactory to discover one.
220+
//
221+
// If a factory is provided, it will be used to infer a schema for partition fields
222+
// based on file and directory paths then construct a Partitioning. The default
223+
// is a Partitioning which will yield no partition information.
224+
//
225+
// The (explicit or discovered) partitioning will be applied to discovered files
226+
// and the resulting partition information embedded in the Dataset.
227+
PartitioningOrFactory partitioning{Partitioning::Default()};
228+
229+
// For the purposes of applying the partitioning, paths will be stripped
230+
// of the partition_base_dir. Files not matching the partition_base_dir
231+
// prefix will be skipped for partition discovery. The ignored files will still
232+
// be part of the Dataset, but will not have partition information.
233+
//
234+
// Example:
235+
// partition_base_dir = "/dataset";
236+
//
237+
// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
238+
//
239+
// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
240+
//
241+
// This is useful for partitionings which parse directories when ordering
242+
// is important, e.g. DirectoryPartitioning.
243+
std::string partition_base_dir;
244+
};
245+
218246
/// \brief Create FileSystemDataset from custom `_metadata` cache file.
219247
///
220248
/// Dask and other systems will generate a cache metadata file by concatenating
@@ -234,9 +262,10 @@ class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
234262
/// \param[in] metadata_path path of the metadata parquet file
235263
/// \param[in] filesystem from which to open/read the path
236264
/// \param[in] format to read the file with.
265+
/// \param[in] options see ParquetFactoryOptions
237266
static Result<std::shared_ptr<DatasetFactory>> Make(
238267
const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
239-
std::shared_ptr<ParquetFileFormat> format);
268+
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
240269

241270
/// \brief Create a ParquetDatasetFactory from a metadata source.
242271
///
@@ -248,10 +277,11 @@ class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
248277
/// \param[in] base_path used as the prefix of every parquet file referenced
249278
/// \param[in] filesystem from which to read the files referenced.
250279
/// \param[in] format to read the file with.
280+
/// \param[in] options see ParquetFactoryOptions
251281
static Result<std::shared_ptr<DatasetFactory>> Make(
252282
const FileSource& metadata, const std::string& base_path,
253283
std::shared_ptr<fs::FileSystem> filesystem,
254-
std::shared_ptr<ParquetFileFormat> format);
284+
std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
255285

256286
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
257287
InspectOptions options) override;
@@ -262,17 +292,25 @@ class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
262292
ParquetDatasetFactory(std::shared_ptr<fs::FileSystem> fs,
263293
std::shared_ptr<ParquetFileFormat> format,
264294
std::shared_ptr<parquet::FileMetaData> metadata,
265-
std::string base_path);
295+
std::string base_path, ParquetFactoryOptions options);
266296

267297
std::shared_ptr<fs::FileSystem> filesystem_;
268298
std::shared_ptr<ParquetFileFormat> format_;
269299
std::shared_ptr<parquet::FileMetaData> metadata_;
270300
std::string base_path_;
301+
ParquetFactoryOptions options_;
302+
FragmentVector fragments_;
271303

272304
private:
273-
Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
305+
Result<std::vector<std::string>> CollectPaths(
274306
const parquet::FileMetaData& metadata,
275307
const parquet::ArrowReaderProperties& properties);
308+
309+
Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
310+
const parquet::FileMetaData& metadata,
311+
const parquet::ArrowReaderProperties& properties, const Partitioning& partitioning);
312+
313+
Result<std::shared_ptr<Schema>> PartitionSchema();
276314
};
277315

278316
} // namespace dataset

cpp/src/arrow/dataset/partition.cc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,5 +598,30 @@ std::shared_ptr<PartitioningFactory> HivePartitioning::MakeFactory() {
598598
return std::shared_ptr<PartitioningFactory>(new HivePartitioningFactory());
599599
}
600600

601+
std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
602+
auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
603+
auto base_less = maybe_base_less ? maybe_base_less->to_string() : path;
604+
auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
605+
return basename_filename.first;
606+
}
607+
608+
std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
609+
const std::string& prefix) {
610+
std::vector<std::string> result;
611+
for (const auto& path : paths) {
612+
result.emplace_back(StripPrefixAndFilename(path, prefix));
613+
}
614+
return result;
615+
}
616+
617+
Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
618+
const std::vector<std::string>& paths) {
619+
if (auto part = partitioning()) {
620+
return part->schema();
621+
}
622+
623+
return factory()->Inspect(paths);
624+
}
625+
601626
} // namespace dataset
602627
} // namespace arrow

0 commit comments

Comments
 (0)