Skip to content

Commit d07dc75

Browse files
vibhatha and pitrou authored
ARROW-16911: [C++] Add Equals method to Partitioning (apache#13567)
Adding `Equals` method to `Partitioning` class and extended classes. Also include a few test cases. Lead-authored-by: Vibhatha Abeykoon <vibhatha@gmail.com> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 0fda96c commit d07dc75

3 files changed

Lines changed: 140 additions & 1 deletion

File tree

cpp/src/arrow/dataset/partition.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
8282

8383
std::string type_name() const override { return "default"; }
8484

85+
bool Equals(const Partitioning& other) const override {
86+
return type_name() == other.type_name();
87+
}
88+
8589
Result<compute::Expression> Parse(const std::string& path) const override {
8690
return compute::literal(true);
8791
}
@@ -115,6 +119,28 @@ static Result<RecordBatchVector> ApplyGroupings(
115119
return out;
116120
}
117121

122+
bool KeyValuePartitioning::Equals(const Partitioning& other) const {
123+
if (this == &other) {
124+
return true;
125+
}
126+
const auto& kv_partitioning = checked_cast<const KeyValuePartitioning&>(other);
127+
const auto& other_dictionaries = kv_partitioning.dictionaries();
128+
if (dictionaries_.size() != other_dictionaries.size()) {
129+
return false;
130+
}
131+
int64_t idx = 0;
132+
for (const auto& array : dictionaries_) {
133+
const auto& other_array = other_dictionaries[idx++];
134+
bool match = (array == nullptr && other_array == nullptr) ||
135+
(array && other_array && array->Equals(other_array));
136+
if (!match) {
137+
return false;
138+
}
139+
}
140+
return options_.segment_encoding == kv_partitioning.options_.segment_encoding &&
141+
Partitioning::Equals(other);
142+
}
143+
118144
Result<Partitioning::PartitionedBatches> KeyValuePartitioning::Partition(
119145
const std::shared_ptr<RecordBatch>& batch) const {
120146
std::vector<int> key_indices;
@@ -381,6 +407,10 @@ Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
381407
return ParsePartitionSegments(segments);
382408
}
383409

410+
bool DirectoryPartitioning::Equals(const Partitioning& other) const {
411+
return type_name() == other.type_name() && KeyValuePartitioning::Equals(other);
412+
}
413+
384414
FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
385415
ArrayVector dictionaries,
386416
KeyValuePartitioningOptions options)
@@ -678,6 +708,13 @@ std::shared_ptr<PartitioningFactory> FilenamePartitioning::MakeFactory(
678708
new FilenamePartitioningFactory(std::move(field_names), options));
679709
}
680710

711+
bool FilenamePartitioning::Equals(const Partitioning& other) const {
712+
if (type_name() != other.type_name()) {
713+
return false;
714+
}
715+
return KeyValuePartitioning::Equals(other);
716+
}
717+
681718
Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
682719
const std::string& segment, const HivePartitioningOptions& options) {
683720
auto name_end = string_view(segment).find_first_of('=');
@@ -754,6 +791,19 @@ Result<PartitionPathFormat> HivePartitioning::FormatValues(
754791
return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""};
755792
}
756793

794+
bool HivePartitioning::Equals(const Partitioning& other) const {
795+
if (this == &other) {
796+
return true;
797+
}
798+
if (type_name() != other.type_name()) {
799+
return false;
800+
}
801+
const auto& hive_part = ::arrow::internal::checked_cast<const HivePartitioning&>(other);
802+
return null_fallback() == hive_part.null_fallback() &&
803+
options().null_fallback == hive_part.options().null_fallback &&
804+
KeyValuePartitioning::Equals(other);
805+
}
806+
757807
class HivePartitioningFactory : public KeyValuePartitioningFactory {
758808
public:
759809
explicit HivePartitioningFactory(HivePartitioningFactoryOptions options)

cpp/src/arrow/dataset/partition.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "arrow/compute/exec/expression.h"
3131
#include "arrow/dataset/type_fwd.h"
3232
#include "arrow/dataset/visibility.h"
33+
#include "arrow/util/compare.h"
3334
#include "arrow/util/optional.h"
3435

3536
namespace arrow {
@@ -63,13 +64,18 @@ struct ARROW_DS_EXPORT PartitionPathFormat {
6364
/// Paths are consumed from left to right. Paths must be relative to
6465
/// the root of a partition; path prefixes must be removed before passing
6566
/// the path to a partitioning for parsing.
66-
class ARROW_DS_EXPORT Partitioning {
67+
class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
6768
public:
6869
virtual ~Partitioning() = default;
6970

7071
/// \brief The name identifying the kind of partitioning
7172
virtual std::string type_name() const = 0;
7273

74+
//// \brief Return whether the partitionings are equal
75+
virtual bool Equals(const Partitioning& other) const {
76+
return schema_->Equals(other.schema_, /*check_metadata=*/false);
77+
}
78+
7379
/// \brief If the input batch shares any fields with this partitioning,
7480
/// produce sub-batches which satisfy mutually exclusive Expressions.
7581
struct PartitionedBatches {
@@ -180,6 +186,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
180186

181187
const ArrayVector& dictionaries() const { return dictionaries_; }
182188

189+
bool Equals(const Partitioning& other) const override;
190+
183191
protected:
184192
KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
185193
KeyValuePartitioningOptions options)
@@ -223,6 +231,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
223231

224232
std::string type_name() const override { return "directory"; }
225233

234+
bool Equals(const Partitioning& other) const override;
235+
226236
/// \brief Create a factory for a directory partitioning.
227237
///
228238
/// \param[in] field_names The names for the partition fields. Types will be
@@ -282,6 +292,8 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
282292
static Result<util::optional<Key>> ParseKey(const std::string& segment,
283293
const HivePartitioningOptions& options);
284294

295+
bool Equals(const Partitioning& other) const override;
296+
285297
/// \brief Create a factory for a hive partitioning.
286298
static std::shared_ptr<PartitioningFactory> MakeFactory(
287299
HivePartitioningFactoryOptions = {});
@@ -310,6 +322,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
310322

311323
std::string type_name() const override { return name_; }
312324

325+
bool Equals(const Partitioning& other) const override { return false; }
326+
313327
Result<compute::Expression> Parse(const std::string& path) const override {
314328
return parse_impl_(path);
315329
}
@@ -352,6 +366,8 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
352366
static std::shared_ptr<PartitioningFactory> MakeFactory(
353367
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
354368

369+
bool Equals(const Partitioning& other) const override;
370+
355371
private:
356372
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
357373

cpp/src/arrow/dataset/partition_test.cc

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,21 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
206206
equal(field_ref("beta"), literal("foo"))));
207207
}
208208

209+
// Directory partitionings compare equal exactly when their schemas match.
TEST_F(TestPartitioning, DirectoryPartitioningEquals) {
  // Every call builds a fresh schema, so equality is decided by content rather
  // than by shared schema objects.
  auto make_partitioning = [](const char* second_field) {
    return std::make_shared<DirectoryPartitioning>(
        schema({field("alpha", int32()), field(second_field, utf8())}));
  };
  auto reference = make_partitioning("beta");
  auto renamed_field = make_partitioning("gamma");
  auto first_twin = make_partitioning("beta");
  auto second_twin = make_partitioning("beta");

  // Identity.
  EXPECT_TRUE(reference->Equals(*reference));
  // A differing field name breaks equality.
  EXPECT_FALSE(reference->Equals(*renamed_field));
  // Distinct objects with identical schemas are equal.
  EXPECT_TRUE(reference->Equals(*first_twin));
  EXPECT_TRUE(first_twin->Equals(*second_twin));
}
223+
209224
TEST_F(TestPartitioning, FilenamePartitioning) {
210225
partitioning_ = std::make_shared<FilenamePartitioning>(
211226
schema({field("alpha", int32()), field("beta", utf8())}));
@@ -222,6 +237,21 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
222237
equal(field_ref("beta"), literal("foo"))));
223238
}
224239

240+
// Filename partitionings compare equal exactly when their schemas match.
TEST_F(TestPartitioning, FilenamePartitioningEquals) {
  // Builds a partitioning whose first field varies; the second is always beta: utf8.
  auto make_partitioning = [](const char* first_name,
                              const std::shared_ptr<DataType>& first_type) {
    return std::make_shared<FilenamePartitioning>(
        schema({field(first_name, first_type), field("beta", utf8())}));
  };
  auto alpha_i32 = make_partitioning("alpha", int32());
  auto sigma_i32 = make_partitioning("sigma", int32());
  auto sigma_i64 = make_partitioning("sigma", int64());
  auto sigma_i64_twin = make_partitioning("sigma", int64());

  // Identity.
  EXPECT_TRUE(alpha_i32->Equals(*alpha_i32));
  // Differing field name.
  EXPECT_FALSE(alpha_i32->Equals(*sigma_i32));
  // Same field name, differing field type.
  EXPECT_FALSE(sigma_i32->Equals(*sigma_i64));
  // Distinct objects with identical schemas.
  EXPECT_TRUE(sigma_i64->Equals(*sigma_i64_twin));
}
254+
225255
TEST_F(TestPartitioning, DirectoryPartitioningFormat) {
226256
partitioning_ = std::make_shared<DirectoryPartitioning>(
227257
schema({field("alpha", int32()), field("beta", utf8())}));
@@ -426,6 +456,41 @@ TEST_F(TestPartitioning, HivePartitioning) {
426456
AssertParseError("/alpha=0.0/beta=3.25/"); // conversion of "0.0" to int32 fails
427457
}
428458

459+
// Hive partitionings must agree on schema, dictionaries and null fallback.
TEST_F(TestPartitioning, HivePartitioningEquals) {
  const ArrayVector no_dictionaries;
  ArrayVector utf8_dictionaries(2);
  utf8_dictionaries[0] = ArrayFromJSON(utf8(), R"(["foo", "bar", "baz"])");
  utf8_dictionaries[1] = ArrayFromJSON(utf8(), R"(["bar", "foo", "baz"])");

  // The first field name, the dictionaries and the null fallback vary per case;
  // the field types and the second field stay fixed.
  auto make_partitioning = [](const char* first_field, const ArrayVector& dictionaries,
                              const char* null_fallback) {
    return std::make_shared<HivePartitioning>(
        schema({field(first_field, int32()), field("beta", float32())}), dictionaries,
        null_fallback);
  };
  auto reference = make_partitioning("alpha", no_dictionaries, "xyz");
  auto renamed_field = make_partitioning("sigma", no_dictionaries, "xyz");
  auto with_dictionaries = make_partitioning("alpha", utf8_dictionaries, "xyz");
  auto other_fallback = make_partitioning("alpha", no_dictionaries, "abc");
  auto twin = make_partitioning("alpha", no_dictionaries, "xyz");

  EXPECT_TRUE(reference->Equals(*reference));
  EXPECT_FALSE(reference->Equals(*renamed_field));
  EXPECT_FALSE(reference->Equals(*with_dictionaries));
  EXPECT_FALSE(reference->Equals(*other_fallback));
  EXPECT_TRUE(reference->Equals(*twin));
}
480+
481+
// Partitionings of different kinds never compare equal, in either direction.
TEST_F(TestPartitioning, CrossCheckPartitioningEquals) {
  auto by_filename = std::make_shared<FilenamePartitioning>(
      schema({field("alpha", int32()), field("beta", utf8())}));
  auto by_directory = std::make_shared<DirectoryPartitioning>(
      schema({field("alpha", int32()), field("beta", utf8())}));
  auto by_hive = std::make_shared<HivePartitioning>(
      schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");

  // Filename vs. directory.
  EXPECT_FALSE(by_filename->Equals(*by_directory));
  EXPECT_FALSE(by_directory->Equals(*by_filename));
  // Directory vs. hive.
  EXPECT_FALSE(by_directory->Equals(*by_hive));
  EXPECT_FALSE(by_hive->Equals(*by_directory));
}
493+
429494
TEST_F(TestPartitioning, HivePartitioningFormat) {
430495
partitioning_ = std::make_shared<HivePartitioning>(
431496
schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
@@ -891,6 +956,14 @@ class RangePartitioning : public Partitioning {
891956

892957
std::string type_name() const override { return "range"; }
893958

959+
bool Equals(const Partitioning& other) const override {
960+
if (this == &other) {
961+
return true;
962+
}
963+
return checked_cast<const RangePartitioning&>(other).type_name() == type_name() &&
964+
Partitioning::Equals(other);
965+
}
966+
894967
Result<compute::Expression> Parse(const std::string& path) const override {
895968
std::vector<compute::Expression> ranges;
896969

0 commit comments

Comments
 (0)