2828#include " arrow/util/range.h"
2929#include " arrow/util/stl.h"
3030#include " parquet/arrow/reader.h"
31+ #include " parquet/arrow/schema.h"
3132#include " parquet/file_reader.h"
33+ #include " parquet/statistics.h"
3234
3335namespace arrow {
3436namespace dataset {
3537
36- // / \brief A ScanTask backed by a parquet file and a subset of RowGroups .
38+ // / \brief A ScanTask backed by a parquet file and a RowGroup within a parquet file .
3739class ParquetScanTask : public ScanTask {
3840 public:
39- ParquetScanTask (std::vector< int > row_groups , std::vector<int > columns_projection,
41+ ParquetScanTask (int row_group , std::vector<int > columns_projection,
4042 std::shared_ptr<parquet::arrow::FileReader> reader,
4143 std::shared_ptr<ScanOptions> options,
4244 std::shared_ptr<ScanContext> context)
43- : row_groups_(std::move(row_groups) ),
45+ : row_group_(row_group ),
4446 columns_projection_ (std::move(columns_projection)),
4547 reader_(reader),
4648 options_(std::move(options)),
@@ -54,7 +56,7 @@ class ParquetScanTask : public ScanTask {
5456 // Thus the memory incurred by the RecordBatchReader is allocated when
5557 // Scan is called.
5658 std::unique_ptr<RecordBatchReader> record_batch_reader;
57- auto status = reader_->GetRecordBatchReader (row_groups_ , columns_projection_,
59+ auto status = reader_->GetRecordBatchReader ({row_group_} , columns_projection_,
5860 &record_batch_reader);
5961 // Propagate the previous error as an error iterator.
6062 if (!status.ok ()) {
@@ -65,8 +67,7 @@ class ParquetScanTask : public ScanTask {
6567 }
6668
6769 private:
68- // Subset of RowGroups and columns bound to this task.
69- std::vector<int > row_groups_;
70+ int row_group_;
7071 std::vector<int > columns_projection_;
7172 // The ScanTask _must_ hold a reference to reader_ because there's no
7273 // guarantee the producing ParquetScanTaskIterator is still alive. This is a
@@ -77,35 +78,52 @@ class ParquetScanTask : public ScanTask {
7778 std::shared_ptr<ScanContext> context_;
7879};
7980
80- constexpr int64_t kDefaultRowCountPerPartition = 1U << 16 ;
81-
82- // A class that clusters RowGroups of a Parquet file until the cluster has a specified
83- // total row count. This doesn't guarantee exact row counts; it may exceed the target.
84- class ParquetRowGroupPartitioner {
81+ // Skip RowGroups with a filter and metadata
82+ class RowGroupSkipper {
8583 public:
86- ParquetRowGroupPartitioner (std::shared_ptr<parquet::FileMetaData> metadata,
87- int64_t row_count = kDefaultRowCountPerPartition )
88- : metadata_(std::move(metadata)), row_count_(row_count), row_group_idx_(0 ) {
84+ static constexpr int kIterationDone = -1 ;
85+
86+ RowGroupSkipper (std::shared_ptr<parquet::FileMetaData> metadata,
87+ std::shared_ptr<Expression> filter)
88+ : metadata_(std::move(metadata)), filter_(filter), row_group_idx_(0 ) {
8989 num_row_groups_ = metadata_->num_row_groups ();
9090 }
9191
92- std::vector<int > Next () {
93- int64_t partition_size = 0 ;
94- std::vector<int > partitions;
92+ int Next () {
93+ while (row_group_idx_ < num_row_groups_) {
94+ const auto row_group_idx = row_group_idx_++;
95+ const auto row_group = metadata_->RowGroup (row_group_idx);
9596
96- while (row_group_idx_ < num_row_groups_ && partition_size < row_count_) {
97- partition_size += metadata_->RowGroup (row_group_idx_)->num_rows ();
98- partitions.push_back (row_group_idx_++);
97+ const auto num_rows = row_group->num_rows ();
98+ if (CanSkip (*row_group)) {
99+ rows_skipped_ += num_rows;
100+ continue ;
101+ }
102+
103+ return row_group_idx;
99104 }
100105
101- return partitions ;
106+ return kIterationDone ;
102107 }
103108
104109 private:
110+ bool CanSkip (const parquet::RowGroupMetaData& metadata) const {
111+ auto maybe_stats_expr = RowGroupStatisticsAsExpression (metadata);
112+ // Errors with statistics are ignored and post-filtering will apply.
113+ if (!maybe_stats_expr.ok ()) {
114+ return false ;
115+ }
116+
117+ auto stats_expr = maybe_stats_expr.ValueOrDie ();
118+ auto expr = filter_->Assume (stats_expr);
119+ return (expr->IsNull () || expr->Equals (false ));
120+ }
121+
105122 std::shared_ptr<parquet::FileMetaData> metadata_;
106- int64_t row_count_ ;
123+ std::shared_ptr<Expression> filter_ ;
107124 int row_group_idx_;
108125 int num_row_groups_;
126+ int64_t rows_skipped_;
109127};
110128
111129class ParquetScanTaskIterator {
@@ -130,16 +148,16 @@ class ParquetScanTaskIterator {
130148 }
131149
132150 Status Next (std::unique_ptr<ScanTask>* task) {
133- auto partition = partitioner_ .Next ();
151+ auto row_group = skipper_ .Next ();
134152
135153 // Iteration is done.
136- if (partition. size () == 0 ) {
154+ if (row_group == RowGroupSkipper:: kIterationDone ) {
137155 task->reset (nullptr );
138156 return Status::OK ();
139157 }
140158
141- task->reset (new ParquetScanTask ( std::move (partition), columns_projection_, reader_,
142- options_, context_));
159+ task->reset (
160+ new ParquetScanTask (row_group, columns_projection_, reader_, options_, context_));
143161
144162 return Status::OK ();
145163 }
@@ -163,13 +181,13 @@ class ParquetScanTaskIterator {
163181 : options_(std::move(options)),
164182 context_ (std::move(context)),
165183 columns_projection_(columns_projection),
166- partitioner_ (std::move(metadata)),
184+ skipper_ (std::move(metadata), options_->filter ),
167185 reader_(std::move(reader)) {}
168186
169187 std::shared_ptr<ScanOptions> options_;
170188 std::shared_ptr<ScanContext> context_;
171189 std::vector<int > columns_projection_;
172- ParquetRowGroupPartitioner partitioner_ ;
190+ RowGroupSkipper skipper_ ;
173191 std::shared_ptr<parquet::arrow::FileReader> reader_;
174192};
175193
@@ -220,5 +238,70 @@ Status ParquetFileFormat::OpenReader(
220238 return Status::OK ();
221239}
222240
241+ using parquet::arrow::SchemaField;
242+ using parquet::arrow::StatisticsAsScalars;
243+
244+ static std::shared_ptr<Expression> ColumnChunkStatisticsAsExpression (
245+ const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
246+ // For the remaining of this function, failure to extract/parse statistics
247+ // are ignored by returning the `true` scalar. The goal is two fold. First
248+ // avoid that an optimization break the computation. Second, allow the
249+ // following columns to maybe succeed in extracting column statistics.
250+
251+ // For now, only leaf (primitive) types are supported.
252+ if (!schema_field.is_leaf ()) {
253+ return scalar (true );
254+ }
255+
256+ auto column_metadata = metadata.ColumnChunk (schema_field.column_index );
257+ auto field = schema_field.field ;
258+ auto field_expr = field_ref (field->name ());
259+
260+ // In case of missing statistics, return nothing.
261+ if (!column_metadata->is_stats_set ()) {
262+ return scalar (true );
263+ }
264+
265+ auto statistics = column_metadata->statistics ();
266+ if (statistics == nullptr ) {
267+ return scalar (true );
268+ }
269+
270+ // Optimize for corner case where all values are nulls
271+ if (statistics->num_values () == statistics->null_count ()) {
272+ std::shared_ptr<Scalar> null_scalar;
273+ if (!MakeNullScalar (field->type (), &null_scalar).ok ()) {
274+ // MakeNullScalar can fail for some nested/repeated types.
275+ return scalar (true );
276+ }
277+
278+ return equal (field_expr, scalar (null_scalar));
279+ }
280+
281+ std::shared_ptr<Scalar> min, max;
282+ if (!StatisticsAsScalars (*statistics, &min, &max).ok ()) {
283+ return scalar (true );
284+ }
285+
286+ return and_ (greater_equal (field_expr, scalar (min)),
287+ less_equal (field_expr, scalar (max)));
288+ }
289+
290+ using parquet::arrow::SchemaManifest;
291+
292+ Result<std::shared_ptr<Expression>> RowGroupStatisticsAsExpression (
293+ const parquet::RowGroupMetaData& metadata) {
294+ SchemaManifest manifest;
295+ RETURN_NOT_OK (SchemaManifest::Make (
296+ metadata.schema (), nullptr , parquet::default_arrow_reader_properties (), &manifest));
297+
298+ std::vector<std::shared_ptr<Expression>> expressions;
299+ for (const auto & schema_field : manifest.schema_fields ) {
300+ expressions.emplace_back (ColumnChunkStatisticsAsExpression (schema_field, metadata));
301+ }
302+
303+ return expressions.empty () ? scalar (true ) : and_ (expressions);
304+ }
305+
223306} // namespace dataset
224307} // namespace arrow
0 commit comments