OpenSourceJavaProject
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
Lines changed: 111 additions & 28 deletions
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
Lines changed: 4 additions & 0 deletions
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc
Lines changed: 94 additions & 6 deletions
diff --git a/cpp/src/arrow/dataset/filter.h b/cpp/src/arrow/dataset/filter.h
Lines changed: 4 additions & 0 deletions
diff --git a/cpp/src/arrow/dataset/filter_test.cc b/cpp/src/arrow/dataset/filter_test.cc
Lines changed: 7 additions & 1 deletion
@@ -28,19 +28,21 @@
 #include "arrow/util/range.h"
 #include "arrow/util/stl.h"
 #include "parquet/arrow/reader.h"
+#include "parquet/arrow/schema.h"
 #include "parquet/file_reader.h"
+#include "parquet/statistics.h"
 
 namespace arrow {
 namespace dataset {
 
-/// \brief A ScanTask backed by a parquet file and a subset of RowGroups.
+/// \brief A ScanTask backed by a single RowGroup of a parquet file.
 class ParquetScanTask : public ScanTask {
  public:
-  ParquetScanTask(std::vector<int> row_groups, std::vector<int> columns_projection,
+  ParquetScanTask(int row_group, std::vector<int> columns_projection,
                   std::shared_ptr<parquet::arrow::FileReader> reader,
                   std::shared_ptr<ScanOptions> options,
                   std::shared_ptr<ScanContext> context)
-      : row_groups_(std::move(row_groups)),
+      : row_group_(row_group),
         columns_projection_(std::move(columns_projection)),
         reader_(reader),
         options_(std::move(options)),
@@ -54,7 +56,7 @@ class ParquetScanTask : public ScanTask {
     // Thus the memory incurred by the RecordBatchReader is allocated when
     // Scan is called.
     std::unique_ptr<RecordBatchReader> record_batch_reader;
-    auto status = reader_->GetRecordBatchReader(row_groups_, columns_projection_,
+    auto status = reader_->GetRecordBatchReader({row_group_}, columns_projection_,
                                                 &record_batch_reader);
     // Propagate the previous error as an error iterator.
     if (!status.ok()) {
@@ -65,8 +67,7 @@ class ParquetScanTask : public ScanTask {
   }
 
  private:
-  // Subset of RowGroups and columns bound to this task.
-  std::vector<int> row_groups_;
+  int row_group_;
   std::vector<int> columns_projection_;
   // The ScanTask _must_ hold a reference to reader_ because there's no
   // guarantee the producing ParquetScanTaskIterator is still alive. This is a
@@ -77,35 +78,52 @@ class ParquetScanTask : public ScanTask {
   std::shared_ptr<ScanContext> context_;
 };
 
-constexpr int64_t kDefaultRowCountPerPartition = 1U << 16;
-
-// A class that clusters RowGroups of a Parquet file until the cluster has a specified
-// total row count. This doesn't guarantee exact row counts; it may exceed the target.
-class ParquetRowGroupPartitioner {
+// Iterates over the RowGroups of a parquet file and skips those that the
+// filter cannot match, judging by the statistics stored in the metadata.
+class RowGroupSkipper {
  public:
-  ParquetRowGroupPartitioner(std::shared_ptr<parquet::FileMetaData> metadata,
-                             int64_t row_count = kDefaultRowCountPerPartition)
-      : metadata_(std::move(metadata)), row_count_(row_count), row_group_idx_(0) {
+  // Sentinel returned by Next() once every RowGroup has been visited.
+  static constexpr int kIterationDone = -1;
+
+  // \param metadata file-level metadata listing the RowGroups to iterate over
+  // \param filter expression used to decide whether a RowGroup can be skipped
+  RowGroupSkipper(std::shared_ptr<parquet::FileMetaData> metadata,
+                  std::shared_ptr<Expression> filter)
+      : metadata_(std::move(metadata)),
+        filter_(std::move(filter)),
+        row_group_idx_(0),
+        // Must start at zero: Next() accumulates into it with `+=`.
+        rows_skipped_(0) {
     num_row_groups_ = metadata_->num_row_groups();
   }
 
-  std::vector<int> Next() {
-    int64_t partition_size = 0;
-    std::vector<int> partitions;
+  // Returns the index of the next RowGroup that must be scanned, or
+  // kIterationDone when no RowGroup remains. The row counts of skipped
+  // RowGroups are accumulated in rows_skipped_.
+  int Next() {
+    while (row_group_idx_ < num_row_groups_) {
+      const auto row_group_idx = row_group_idx_++;
+      const auto row_group = metadata_->RowGroup(row_group_idx);
 
-    while (row_group_idx_ < num_row_groups_ && partition_size < row_count_) {
-      partition_size += metadata_->RowGroup(row_group_idx_)->num_rows();
-      partitions.push_back(row_group_idx_++);
+      const auto num_rows = row_group->num_rows();
+      if (CanSkip(*row_group)) {
+        rows_skipped_ += num_rows;
+        continue;
+      }
+
+      return row_group_idx;
     }
 
-    return partitions;
+    return kIterationDone;
   }
 
  private:
+  // Returns true when the filter, simplified under the assumption of the
+  // RowGroup's statistics expression, is null or the literal `false`.
+  bool CanSkip(const parquet::RowGroupMetaData& metadata) const {
+    auto maybe_stats_expr = RowGroupStatisticsAsExpression(metadata);
+    // Errors with statistics are ignored and post-filtering will apply.
+    if (!maybe_stats_expr.ok()) {
+      return false;
+    }
+
+    auto stats_expr = maybe_stats_expr.ValueOrDie();
+    auto expr = filter_->Assume(stats_expr);
+    return (expr->IsNull() || expr->Equals(false));
+  }
+
   std::shared_ptr<parquet::FileMetaData> metadata_;
-  int64_t row_count_;
+  std::shared_ptr<Expression> filter_;
   int row_group_idx_;
   int num_row_groups_;
+  // Total number of rows in the RowGroups skipped so far.
+  int64_t rows_skipped_;
 };
 
 class ParquetScanTaskIterator {
@@ -130,16 +148,16 @@ class ParquetScanTaskIterator {
   }
 
   Status Next(std::unique_ptr<ScanTask>* task) {
-    auto partition = partitioner_.Next();
+    auto row_group = skipper_.Next();
 
     // Iteration is done.
-    if (partition.size() == 0) {
+    if (row_group == RowGroupSkipper::kIterationDone) {
       task->reset(nullptr);
       return Status::OK();
     }
 
-    task->reset(new ParquetScanTask(std::move(partition), columns_projection_, reader_,
-                                    options_, context_));
+    task->reset(
+        new ParquetScanTask(row_group, columns_projection_, reader_, options_, context_));
 
     return Status::OK();
   }
@@ -163,13 +181,13 @@ class ParquetScanTaskIterator {
       : options_(std::move(options)),
         context_(std::move(context)),
         columns_projection_(columns_projection),
-        partitioner_(std::move(metadata)),
+        skipper_(std::move(metadata), options_->filter),
         reader_(std::move(reader)) {}
 
   std::shared_ptr<ScanOptions> options_;
   std::shared_ptr<ScanContext> context_;
   std::vector<int> columns_projection_;
-  ParquetRowGroupPartitioner partitioner_;
+  RowGroupSkipper skipper_;
   std::shared_ptr<parquet::arrow::FileReader> reader_;
 };
 
@@ -220,5 +238,70 @@ Status ParquetFileFormat::OpenReader(
   return Status::OK();
 }
 
+using parquet::arrow::SchemaField;
+using parquet::arrow::StatisticsAsScalars;
+
+// Translate the statistics of one column chunk of a RowGroup into an
+// expression constraining the corresponding field:
+//  - `field == null` when the statistics report as many nulls as values,
+//  - `field >= min and field <= max` when min/max can be extracted,
+//  - the `true` scalar in every other case.
+static std::shared_ptr<Expression> ColumnChunkStatisticsAsExpression(
+    const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
+  // For the remainder of this function, failures to extract/parse statistics
+  // are ignored by returning the `true` scalar. The goal is twofold. First,
+  // avoid that an optimization breaks the computation. Second, allow the
+  // following columns to maybe succeed in extracting column statistics.
+
+  // For now, only leaf (primitive) types are supported.
+  if (!schema_field.is_leaf()) {
+    return scalar(true);
+  }
+
+  auto column_metadata = metadata.ColumnChunk(schema_field.column_index);
+  auto field = schema_field.field;
+  auto field_expr = field_ref(field->name());
+
+  // In case of missing statistics, return the `true` scalar.
+  if (!column_metadata->is_stats_set()) {
+    return scalar(true);
+  }
+
+  auto statistics = column_metadata->statistics();
+  if (statistics == nullptr) {
+    return scalar(true);
+  }
+
+  // Optimize for corner case where all values are nulls
+  // NOTE(review): this assumes num_values() counts nulls as well; if it only
+  // counts non-null values the all-null case is never detected -- confirm.
+  if (statistics->num_values() == statistics->null_count()) {
+    std::shared_ptr<Scalar> null_scalar;
+    if (!MakeNullScalar(field->type(), &null_scalar).ok()) {
+      // MakeNullScalar can fail for some nested/repeated types.
+      return scalar(true);
+    }
+
+    return equal(field_expr, scalar(null_scalar));
+  }
+
+  // Otherwise bound the field by the chunk's minimum and maximum values.
+  std::shared_ptr<Scalar> min, max;
+  if (!StatisticsAsScalars(*statistics, &min, &max).ok()) {
+    return scalar(true);
+  }
+
+  return and_(greater_equal(field_expr, scalar(min)),
+              less_equal(field_expr, scalar(max)));
+}
+
+using parquet::arrow::SchemaManifest;
+
+// Build a single expression describing a RowGroup from its column statistics.
+//
+// A SchemaManifest is created from the RowGroup's schema; a failure to create
+// it is returned to the caller. Each top-level schema field contributes one
+// expression and the result is their conjunction, or the `true` scalar when
+// the manifest has no fields.
+Result<std::shared_ptr<Expression>> RowGroupStatisticsAsExpression(
+    const parquet::RowGroupMetaData& metadata) {
+  SchemaManifest manifest;
+  RETURN_NOT_OK(SchemaManifest::Make(
+      metadata.schema(), nullptr, parquet::default_arrow_reader_properties(), &manifest));
+
+  // Fields without usable statistics contribute the `true` scalar.
+  std::vector<std::shared_ptr<Expression>> expressions;
+  for (const auto& schema_field : manifest.schema_fields) {
+    expressions.emplace_back(ColumnChunkStatisticsAsExpression(schema_field, metadata));
+  }
+
+  return expressions.empty() ? scalar(true) : and_(expressions);
+}
+
 }  // namespace dataset
 }  // namespace arrow
@@ -26,6 +26,7 @@
 
 namespace parquet {
 class ParquetFileReader;
+class RowGroupMetaData;
 }  // namespace parquet
 
 namespace arrow {
@@ -75,5 +76,8 @@ class ARROW_DS_EXPORT ParquetFragment : public FileBasedDataFragment {
   bool splittable() const override { return true; }
 };
 
+/// \brief Build an expression from the column statistics of a RowGroup.
+///
+/// \param[in] metadata metadata of the RowGroup whose statistics are read
+/// \return the conjunction of one expression per schema field, or an error if
+/// the schema manifest cannot be created from the RowGroup's schema
+Result<std::shared_ptr<Expression>> RowGroupStatisticsAsExpression(
+    const parquet::RowGroupMetaData& metadata);
+
 }  // namespace dataset
 }  // namespace arrow
@@ -104,8 +104,8 @@ class ArrowParquetWriterMixin : public ::testing::Test {
     std::shared_ptr<Buffer> out;
     auto sink = CreateOutputStream(pool);
 
-    ARROW_EXPECT_OK(WriteRecordBatchReader(reader, pool, sink));
-    ARROW_EXPECT_OK(sink->Finish(&out));
+    ABORT_NOT_OK(WriteRecordBatchReader(reader, pool, sink));
+    ABORT_NOT_OK(sink->Finish(&out));
 
     return out;
   }
@@ -145,12 +145,9 @@ class ParquetBufferFixtureMixin : public ArrowParquetWriterMixin {
 };
 
+// Fixture providing default ScanOptions and a ScanContext to parquet format tests.
 class TestParquetFileFormat : public ParquetBufferFixtureMixin {
- public:
-  TestParquetFileFormat() : ctx_(std::make_shared<ScanContext>()) {}
-
  protected:
   std::shared_ptr<ScanOptions> opts_ = ScanOptions::Defaults();
-  std::shared_ptr<ScanContext> ctx_;
+  std::shared_ptr<ScanContext> ctx_ = std::make_shared<ScanContext>();
 };
 
 TEST_F(TestParquetFileFormat, ScanRecordBatchReader) {
@@ -199,5 +196,96 @@ TEST_F(TestParquetFileFormat, Inspect) {
   EXPECT_EQ(*actual, *schema_);
 }
 
+// Drain `it`, scanning every ScanTask it yields, and check that the total
+// number of rows and of record batches match the expected values.
+//
+// NOTE(review): no caller is visible in this change, and the body matches
+// TestParquetFileFormatPushDown::CountRowsAndBatchesInScan apart from how the
+// iterator is obtained -- confirm this helper is needed.
+void CountRowsInScan(ScanTaskIterator& it, int64_t expected_rows,
+                     int64_t expected_batches) {
+  int64_t actual_rows = 0;
+  int64_t actual_batches = 0;
+
+  // Both iteration levels yield results that must be unwrapped before use.
+  for (auto maybe_scan_task : it) {
+    ASSERT_OK_AND_ASSIGN(auto scan_task, std::move(maybe_scan_task));
+    for (auto maybe_record_batch : scan_task->Scan()) {
+      ASSERT_OK_AND_ASSIGN(auto record_batch, std::move(maybe_record_batch));
+      actual_rows += record_batch->num_rows();
+      actual_batches++;
+    }
+  }
+
+  EXPECT_EQ(actual_rows, expected_rows);
+  EXPECT_EQ(actual_batches, expected_batches);
+}
+
+// Fixture for tests checking that RowGroups are skipped according to the
+// filter set in the scan options.
+class TestParquetFileFormatPushDown : public TestParquetFileFormat {
+ public:
+  // Scan `fragment` with the fixture's ScanContext and check the total number
+  // of rows and of record batches produced by all of its ScanTasks.
+  void CountRowsAndBatchesInScan(DataFragment& fragment, int64_t expected_rows,
+                                 int64_t expected_batches) {
+    int64_t actual_rows = 0;
+    int64_t actual_batches = 0;
+
+    ScanTaskIterator it;
+    ASSERT_OK(fragment.Scan(ctx_, &it));
+    // Both iteration levels yield results that must be unwrapped before use.
+    for (auto maybe_scan_task : it) {
+      ASSERT_OK_AND_ASSIGN(auto scan_task, std::move(maybe_scan_task));
+      for (auto maybe_record_batch : scan_task->Scan()) {
+        ASSERT_OK_AND_ASSIGN(auto record_batch, std::move(maybe_record_batch));
+        actual_rows += record_batch->num_rows();
+        actual_batches++;
+      }
+    }
+
+    EXPECT_EQ(actual_rows, expected_rows);
+    EXPECT_EQ(actual_batches, expected_batches);
+  }
+};
+
+TEST_F(TestParquetFileFormatPushDown, Basic) {
+  // Given a number `n`, the arithmetic dataset creates n RecordBatches where
+  // each RecordBatch is keyed by a unique integer in [1, n]. Let `rb_i` denote
+  // the record batch keyed by `i`. `rb_i` is composed of `i` rows where all
+  // values are a variant of `i`, e.g. {"i64": i, "u8": i, ... }.
+  //
+  // Thus the ArithmeticDataset(n) has n RecordBatches and the total number of
+  // rows is n(n+1)/2.
+  //
+  // This test uses the DataFragment directly, and so no post-filtering is
+  // applied via ScanOptions' evaluator. Thus, counting the number of returned
+  // rows and returned row groups is a good enough proxy to check whether
+  // predicate pushdown is working.
+  constexpr int64_t kNumRowGroups = 16;
+  constexpr int64_t kTotalNumRows = kNumRowGroups * (kNumRowGroups + 1) / 2;
+
+  auto reader = ArithmeticDatasetFixture::GetRecordBatchReader(kNumRowGroups);
+  auto source = GetFileSource(reader.get());
+  auto fragment = std::make_shared<ParquetFragment>(*source, opts_);
+
+  // A filter that is always true must not skip anything.
+  opts_->filter = scalar(true);
+  CountRowsAndBatchesInScan(*fragment, kTotalNumRows, kNumRowGroups);
+
+  // An equality on key `i` must select exactly `rb_i`, which holds `i` rows.
+  for (int64_t i = 1; i <= kNumRowGroups; i++) {
+    opts_->filter = ("i64"_ == int64_t(i)).Copy();
+    CountRowsAndBatchesInScan(*fragment, i, 1);
+  }
+
+  // Filters that cannot match any row should skip all RowGroups.
+  opts_->filter = scalar(false);
+  CountRowsAndBatchesInScan(*fragment, 0, 0);
+  opts_->filter = ("i64"_ == int64_t(kNumRowGroups + 1)).Copy();
+  CountRowsAndBatchesInScan(*fragment, 0, 0);
+  opts_->filter = ("i64"_ == int64_t(-1)).Copy();
+  CountRowsAndBatchesInScan(*fragment, 0, 0);
+  // No rows match 1 and 2.
+  opts_->filter = ("i64"_ == int64_t(1) and "u8"_ == uint8_t(2)).Copy();
+  CountRowsAndBatchesInScan(*fragment, 0, 0);
+
+  // A disjunction selects the union of the matching RowGroups.
+  opts_->filter = ("i64"_ == int64_t(2) or "i64"_ == int64_t(4)).Copy();
+  CountRowsAndBatchesInScan(*fragment, 2 + 4, 2);
+
+  // Keys 1 through 5 match: 5 RowGroups holding 1 + 2 + ... + 5 rows.
+  opts_->filter = ("i64"_ < int64_t(6)).Copy();
+  CountRowsAndBatchesInScan(*fragment, 5 * (5 + 1) / 2, 5);
+
+  // The complement of the previous filter selects the remaining RowGroups.
+  opts_->filter = ("i64"_ >= int64_t(6)).Copy();
+  CountRowsAndBatchesInScan(*fragment, kTotalNumRows - (5 * (5 + 1) / 2),
+                            kNumRowGroups - 5);
+}
+
 }  // namespace dataset
 }  // namespace arrow
@@ -179,6 +179,10 @@ class ARROW_DS_EXPORT Expression {
     return Copy();
   }
 
+  /// Overload of Assume taking the given expression as a shared_ptr.
+  /// `given` is dereferenced unconditionally and so must not be null.
+  std::shared_ptr<Expression> Assume(const std::shared_ptr<Expression>& given) const {
+    return Assume(*given);
+  }
+
   /// returns a debug string representing this expression
   virtual std::string ToString() const = 0;
 
 
@@ -62,7 +62,13 @@ class ExpressionsTest : public ::testing::Test {
     auto simplified = expr.Assume(given);
     ASSERT_EQ(E{simplified}, E{expected})
         << "  simplification of: " << expr.ToString() << std::endl
-        << "              given: " << given.ToString() << std::endl;
+        << "              given: " << given.ToString() << std::endl
+        << "           expected: " << expected.ToString() << std::endl;
+  }
+
+  // Overload taking `expected` as a shared_ptr; forwards to the overload taking
+  // a reference. `expected` is dereferenced and so must not be null.
+  void AssertSimplifiesTo(const Expression& expr, const Expression& given,
+                          const std::shared_ptr<Expression>& expected) {
+    AssertSimplifiesTo(expr, given, *expected);
   }
 
   std::shared_ptr<Schema> schema_ =
Original file line number	Diff line number	Diff line change
`@@ -179,6 +179,10 @@ class ARROW_DS_EXPORT Expression {`
`179`	`179`	`return Copy();`
`180`	`180`	`}`
`181`	`181`
	`182`	`+ std::shared_ptr<Expression> Assume(const std::shared_ptr<Expression>& given) const {`
	`183`	`+ return Assume(*given);`
	`184`	`+ }`
	`185`	`+`
`182`	`186`	`/// returns a debug string representing this expression`
`183`	`187`	`virtual std::string ToString() const = 0;`
`184`	`188`