@@ -500,44 +500,49 @@ Result<FragmentVector> ParquetFileFragment::SplitByRowGroup(
500500
// Constructor of ParquetDatasetFactory.
// Each argument is taken by value and moved into the member of the same name
// (filesystem_, format_, metadata_, base_path_, options_).
// Reading this diff view: lines marked `-` show the text before the change and
// lines marked `+` show the text after it. The change appends a
// ParquetFactoryOptions parameter and stores it in options_.
501501ParquetDatasetFactory::ParquetDatasetFactory (
502502 std::shared_ptr<fs::FileSystem> filesystem, std::shared_ptr<ParquetFileFormat> format,
503- std::shared_ptr<parquet::FileMetaData> metadata, std::string base_path)
503+ std::shared_ptr<parquet::FileMetaData> metadata, std::string base_path,
504+ ParquetFactoryOptions options)
504505 : filesystem_(std::move(filesystem)),
505506 format_(std::move(format)),
506507 metadata_(std::move(metadata)),
507- base_path_(std::move(base_path)) {}
508+ base_path_(std::move(base_path)),
509+ options_(std::move(options)) {}
508510
// Make (path overload): builds a factory from the path of a `_metadata` file.
// The base directory is taken as the `.first` member of
// GetAbstractPathParent(metadata_path); the call is then forwarded to the
// overload that takes a FileSource and an explicit base path.
// After the change (`+` lines) the function also accepts a
// ParquetFactoryOptions value, and both `format` and `options` are moved into
// the forwarded call. `filesystem` is passed twice (once inside the braced
// FileSource initializer, once as its own argument), so it is copied, not moved.
509511Result<std::shared_ptr<DatasetFactory>> ParquetDatasetFactory::Make (
510512 const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
511- std::shared_ptr<ParquetFileFormat> format) {
513+ std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options ) {
512514 // Paths in ColumnChunk are relative to the `_metadata` file. Thus, the base
513515 // directory of all parquet files is `dirname(metadata_path)`.
514516 auto dirname = arrow::fs::internal::GetAbstractPathParent (metadata_path).first ;
515- return Make ({metadata_path, filesystem}, dirname, filesystem, format);
517+ return Make ({metadata_path, filesystem}, dirname, filesystem, std::move (format),
518+ std::move (options));
516519}
517520
// Make (FileSource overload): builds a factory from an already-located
// metadata source plus the base path that row-group file paths are joined to.
// Steps visible in the post-change (`+` and unmarked) lines:
//   1. DCHECK_NE that `filesystem` and `format` are not nullptr.
//   2. If options.partition_base_dir is empty, set it to `base_path`.
//   3. Obtain a reader from format->GetReader(metadata_source) and take the
//      FileMetaData from reader->parquet_reader()->metadata().
//   4. Return a new ParquetDatasetFactory wrapped in
//      std::shared_ptr<DatasetFactory>; `base_path` is a const reference, so it
//      is copied into the constructor argument while the rest are moved.
// NOTE(review): ARROW_ASSIGN_OR_RAISE presumably returns early with the error
// when GetReader fails — confirm against the macro definition.
518521Result<std::shared_ptr<DatasetFactory>> ParquetDatasetFactory::Make (
519522 const FileSource& metadata_source, const std::string& base_path,
520- std::shared_ptr<fs::FileSystem> filesystem,
521- std::shared_ptr<ParquetFileFormat> format ) {
523+ std::shared_ptr<fs::FileSystem> filesystem, std::shared_ptr<ParquetFileFormat> format,
524+ ParquetFactoryOptions options ) {
522525 DCHECK_NE (filesystem, nullptr );
523526 DCHECK_NE (format, nullptr );
524527
528+ // By automatically setting the options base_dir to the metadata's base_path,
529+ // we provide a better experience for user providing Partitioning that are
530+ // relative to the base_dir instead of the full path.
531+ if (options.partition_base_dir .empty ()) {
532+ options.partition_base_dir = base_path;
533+ }
534+
525535 ARROW_ASSIGN_OR_RAISE (auto reader, format->GetReader (metadata_source));
526536 auto metadata = reader->parquet_reader ()->metadata ();
527537
// The `-` lines below are the pre-change text: the old return statement of
// Make (without `options`), its closing brace, and the old single-schema
// InspectSchemas definition that used to follow Make at this position.
528- return std::shared_ptr<DatasetFactory>(new ParquetDatasetFactory (
529- std::move (filesystem), std::move (format), std::move (metadata), base_path));
530- }
531-
532- Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchemas (
533- InspectOptions options) {
534- std::shared_ptr<Schema> schema;
535- RETURN_NOT_OK (parquet::arrow::FromParquetSchema (metadata_->schema (), &schema));
536- return std::vector<std::shared_ptr<Schema>>{schema};
// Post-change return: same construction as before, with `options` moved in as
// the final constructor argument.
538+ return std::shared_ptr<DatasetFactory>(
539+ new ParquetDatasetFactory (std::move (filesystem), std::move (format),
540+ std::move (metadata), base_path, std::move (options)));
537541}
538542
539- static Result<std::string> FileFromRowGroup (const std::string& base_path,
540- const parquet::RowGroupMetaData& row_group) {
543+ static inline Result<std::string> FileFromRowGroup (
544+ fs::FileSystem* filesystem, const std::string& base_path,
545+ const parquet::RowGroupMetaData& row_group) {
541546 try {
542547 auto n_columns = row_group.num_columns ();
543548 if (n_columns == 0 ) {
@@ -565,17 +570,43 @@ static Result<std::string> FileFromRowGroup(const std::string& base_path,
565570 }
566571 }
567572
568- return fs::internal::JoinAbstractPath (std::vector<std::string>{base_path, path});
573+ path = fs::internal::JoinAbstractPath (std::vector<std::string>{base_path, path});
574+ // Normalizing path is required for Windows.
575+ return filesystem->NormalizePath (std::move (path));
569576 } catch (const ::parquet::ParquetException& e) {
570577 return Status::Invalid (" Extracting file path from RowGroup failed. Parquet threw:" ,
571578 e.what ());
572579 }
573580}
574581
// CollectPaths: returns the distinct file paths referenced by the row groups
// of `metadata`.
// For every row group index in [0, metadata.num_row_groups()) the path is
// computed by FileFromRowGroup(filesystem_.get(), base_path_, *row_group) and
// inserted into an unordered_set to drop duplicates. The set is then copied
// into a vector, so the order of the returned paths is the set's iteration
// order, not row-group order.
// A ::parquet::ParquetException thrown inside the try block is converted into
// Status::Invalid carrying e.what().
// NOTE(review): `manifest` is assigned from GetSchemaManifest but is not read
// again in this function as shown; the call can still cause an early error
// return — confirm whether it is needed here.
582+ Result<std::vector<std::string>> ParquetDatasetFactory::CollectPaths (
583+ const parquet::FileMetaData& metadata,
584+ const parquet::ArrowReaderProperties& properties) {
585+ try {
586+ std::unordered_set<std::string> unique_paths;
587+ ARROW_ASSIGN_OR_RAISE (auto manifest, GetSchemaManifest (metadata, properties));
588+
589+ for (int i = 0 ; i < metadata.num_row_groups (); i++) {
590+ auto row_group = metadata.RowGroup (i);
591+ ARROW_ASSIGN_OR_RAISE (auto path,
592+ FileFromRowGroup (filesystem_.get (), base_path_, *row_group));
593+ unique_paths.emplace (std::move (path));
594+ }
595+
596+ std::vector<std::string> paths;
597+ for (const auto & path : unique_paths) {
598+ paths.emplace_back (path);
599+ }
600+ return paths;
601+ } catch (const ::parquet::ParquetException& e) {
602+ return Status::Invalid (" Could not infer file paths from FileMetaData:" , e.what ());
603+ }
604+ }
605+
575606Result<std::vector<std::shared_ptr<FileFragment>>>
576607ParquetDatasetFactory::CollectParquetFragments (
577608 const parquet::FileMetaData& metadata,
578- const parquet::ArrowReaderProperties& properties) {
609+ const parquet::ArrowReaderProperties& properties, const Partitioning& partitioning ) {
579610 try {
580611 auto n_columns = metadata.num_columns ();
581612 if (n_columns == 0 ) {
@@ -584,14 +615,12 @@ ParquetDatasetFactory::CollectParquetFragments(
584615 }
585616
586617 std::unordered_map<std::string, std::vector<RowGroupInfo>> path_to_row_group_infos;
587-
588618 ARROW_ASSIGN_OR_RAISE (auto manifest, GetSchemaManifest (metadata, properties));
589619
590620 for (int i = 0 ; i < metadata.num_row_groups (); i++) {
591621 auto row_group = metadata.RowGroup (i);
592- ARROW_ASSIGN_OR_RAISE (auto path, FileFromRowGroup (base_path_, *row_group));
593- // Normalizing path is required for Windows.
594- ARROW_ASSIGN_OR_RAISE (path, filesystem_->NormalizePath (std::move (path)));
622+ ARROW_ASSIGN_OR_RAISE (auto path,
623+ FileFromRowGroup (filesystem_.get (), base_path_, *row_group));
595624 auto stats = RowGroupStatisticsAsExpression (*row_group, manifest);
596625 auto num_rows = row_group->num_rows ();
597626
@@ -611,9 +640,13 @@ ParquetDatasetFactory::CollectParquetFragments(
611640 std::vector<std::shared_ptr<FileFragment>> fragments;
612641 fragments.reserve (path_to_row_group_infos.size ());
613642 for (auto && elem : path_to_row_group_infos) {
614- ARROW_ASSIGN_OR_RAISE (auto fragment,
615- format_->MakeFragment ({std::move (elem.first ), filesystem_},
616- scalar (true ), std::move (elem.second )));
643+ const auto & path = elem.first ;
644+ auto partition =
645+ partitioning.Parse (StripPrefixAndFilename (path, options_.partition_base_dir ))
646+ .ValueOr (scalar (true ));
647+ ARROW_ASSIGN_OR_RAISE (
648+ auto fragment, format_->MakeFragment ({path, filesystem_}, std::move (partition),
649+ std::move (elem.second )));
617650 fragments.push_back (std::move (fragment));
618651 }
619652
@@ -623,15 +656,44 @@ ParquetDatasetFactory::CollectParquetFragments(
623656 }
624657}
625658
// InspectSchemas: returns the list of schemas this factory contributes.
// The first entry is always the Arrow schema converted from metadata_->schema()
// by parquet::arrow::FromParquetSchema. When options_.partitioning.factory()
// is non-null, a second entry is appended: the schema returned by
// options_.partitioning.GetOrInferSchema over the row-group file paths (from
// CollectPaths) after they pass through StripPrefixAndFilename with
// options_.partition_base_dir.
// The `options` (InspectOptions) parameter is not read in the body shown.
// NOTE(review): StripPrefixAndFilename presumably removes the base-dir prefix
// and the trailing file name from each path, per its name — confirm against
// its definition.
659+ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchemas (
660+ InspectOptions options) {
661+ std::vector<std::shared_ptr<Schema>> schemas;
662+
663+ std::shared_ptr<Schema> schema;
664+ RETURN_NOT_OK (parquet::arrow::FromParquetSchema (metadata_->schema (), &schema));
665+ schemas.push_back (std::move (schema));
666+
667+ if (options_.partitioning .factory () != nullptr ) {
668+ // Gather paths found in RowGroups' ColumnChunks.
669+ auto properties = MakeArrowReaderProperties (*format_, *metadata_);
670+ ARROW_ASSIGN_OR_RAISE (auto paths, CollectPaths (*metadata_, properties));
671+
672+ ARROW_ASSIGN_OR_RAISE (auto partition_schema,
673+ options_.partitioning .GetOrInferSchema (StripPrefixAndFilename (
674+ paths, options_.partition_base_dir )));
675+ schemas.push_back (std::move (partition_schema));
676+ }
677+
678+ return schemas;
679+ }
680+
// Finish: builds the Dataset described by this factory.
//   1. Uses options.schema when it is non-null; otherwise obtains the schema
//      from Inspect(options.inspect_options).
//   2. (Added by the change) Resolves a Partitioning: takes
//      options_.partitioning.partitioning() and, when that is null, asks
//      options_.partitioning.factory() to Finish(schema).
//   3. Collects the fragments from the stored metadata via
//      CollectParquetFragments, now passing the resolved partitioning.
//   4. Returns FileSystemDataset::Make(schema, scalar(true), format_, fragments).
// NOTE(review): in step 2 `factory` is dereferenced without a null check; this
// assumes options_.partitioning always holds either a partitioning or a
// factory — confirm.
626681Result<std::shared_ptr<Dataset>> ParquetDatasetFactory::Finish (FinishOptions options) {
627682 std::shared_ptr<Schema> schema = options.schema ;
628683 bool schema_missing = schema == nullptr ;
629684 if (schema_missing) {
630685 ARROW_ASSIGN_OR_RAISE (schema, Inspect (options.inspect_options ));
631686 }
632687
688+ std::shared_ptr<Partitioning> partitioning = options_.partitioning .partitioning ();
689+ if (partitioning == nullptr ) {
690+ auto factory = options_.partitioning .factory ();
691+ ARROW_ASSIGN_OR_RAISE (partitioning, factory->Finish (schema));
692+ }
693+
633694 auto properties = MakeArrowReaderProperties (*format_, *metadata_);
634- ARROW_ASSIGN_OR_RAISE (auto fragments, CollectParquetFragments (*metadata_, properties));
695+ ARROW_ASSIGN_OR_RAISE (auto fragments,
696+ CollectParquetFragments (*metadata_, properties, *partitioning));
635697 return FileSystemDataset::Make (std::move (schema), scalar (true ), format_,
636698 std::move (fragments));
637699}
0 commit comments