Skip to content

Commit 53db29f

Browse files
committed
ARROW-8726: [C++] Filename should not be part of DirectoryPartitioning
As the name of the class imply, only directory should be considered for partitions values. Closes apache#7377 from fsaintjacques/ARROW-8726-incorrect-part Authored-by: François Saint-Jacques <fsaintjacques@gmail.com> Signed-off-by: François Saint-Jacques <fsaintjacques@gmail.com>
1 parent 224c60e commit 53db29f

5 files changed

Lines changed: 42 additions & 14 deletions

File tree

cpp/src/arrow/dataset/discovery.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,10 +191,12 @@ Result<std::shared_ptr<Schema>> FileSystemDatasetFactory::PartitionSchema() {
191191
return partitioning->schema();
192192
}
193193

194-
std::vector<util::string_view> relative_paths;
194+
std::vector<std::string> relative_paths;
195195
for (const auto& path : paths_) {
196196
if (auto relative = RemovePartitionBaseDir(path)) {
197-
relative_paths.push_back(*relative);
197+
auto relative_str = relative->to_string();
198+
auto basename_filename = fs::internal::GetAbstractPathParent(relative_str);
199+
relative_paths.push_back(basename_filename.first);
198200
}
199201
}
200202

@@ -245,7 +247,9 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
245247
for (const auto& path : paths_) {
246248
std::shared_ptr<Expression> partition = scalar(true);
247249
if (auto relative = RemovePartitionBaseDir(path)) {
248-
partition = partitioning->Parse(relative->to_string()).ValueOr(scalar(true));
250+
auto relative_str = relative->to_string();
251+
auto basename_filename = fs::internal::GetAbstractPathParent(relative_str);
252+
partition = partitioning->Parse(basename_filename.first).ValueOr(scalar(true));
249253
}
250254

251255
ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({path, fs_}, partition));

cpp/src/arrow/dataset/discovery_test.cc

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@
1717

1818
#include "arrow/dataset/discovery.h"
1919

20-
#include <memory>
21-
#include <utility>
22-
2320
#include <gmock/gmock.h>
2421
#include <gtest/gtest.h>
2522

23+
#include <memory>
24+
#include <utility>
25+
26+
#include "arrow/dataset/filter.h"
2627
#include "arrow/dataset/partition.h"
2728
#include "arrow/dataset/test_util.h"
2829
#include "arrow/filesystem/test_util.h"
2930
#include "arrow/testing/gtest_util.h"
3031
#include "arrow/type_fwd.h"
32+
#include "arrow/util/checked_cast.h"
3133

3234
using testing::SizeIs;
3335

@@ -198,9 +200,10 @@ TEST_F(FileSystemDatasetFactoryTest, ExplicitPartition) {
198200

199201
TEST_F(FileSystemDatasetFactoryTest, DiscoveredPartition) {
200202
selector_.base_dir = "a=ignored/base";
203+
selector_.recursive = true;
201204
factory_options_.partitioning = HivePartitioning::MakeFactory();
202205

203-
auto a_1 = "a=ignored/base/a=1";
206+
auto a_1 = "a=ignored/base/a=1/file.data";
204207
MakeFactory({fs::File(a_1)});
205208

206209
InspectOptions options;
@@ -340,6 +343,27 @@ TEST_F(FileSystemDatasetFactoryTest, InspectFragmentsLimit) {
340343
}
341344
}
342345

346+
TEST_F(FileSystemDatasetFactoryTest, FilenameNotPartOfPartitions) {
347+
// ARROW-8726: Ensure filename is not a partition.
348+
349+
// Creates a partition with 2 explicit fields. The type `int32` is
350+
// specifically chosen such that parsing would fail given a non-integer
351+
// string.
352+
auto s = schema({field("first", utf8()), field("second", int32())});
353+
factory_options_.partitioning = std::make_shared<DirectoryPartitioning>(s);
354+
355+
selector_.recursive = true;
356+
// The file doesn't have a directory component for the second partition
357+
// column. In such case, the filename should not be used.
358+
MakeFactory({fs::File("one/file.parquet")});
359+
360+
ASSERT_OK_AND_ASSIGN(auto dataset, factory_->Finish());
361+
for (const auto& maybe_fragment : dataset->GetFragments()) {
362+
ASSERT_OK_AND_ASSIGN(auto fragment, maybe_fragment);
363+
ASSERT_TRUE(fragment->partition_expression()->Equals(("first"_ == "one")));
364+
}
365+
}
366+
343367
std::shared_ptr<DatasetFactory> DatasetFactoryFromSchemas(
344368
std::vector<std::shared_ptr<Schema>> schemas) {
345369
return std::make_shared<MockDatasetFactory>(schemas);

cpp/src/arrow/dataset/partition.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ class DirectoryPartitioningFactory : public PartitioningFactory {
265265
std::string type_name() const override { return "schema"; }
266266

267267
Result<std::shared_ptr<Schema>> Inspect(
268-
const std::vector<string_view>& paths) const override {
268+
const std::vector<std::string>& paths) const override {
269269
KeyValuePartitioningInspectImpl impl;
270270

271271
for (const auto& name : field_names_) {
@@ -274,7 +274,7 @@ class DirectoryPartitioningFactory : public PartitioningFactory {
274274

275275
for (auto path : paths) {
276276
size_t field_index = 0;
277-
for (auto&& segment : fs::internal::SplitAbstractPath(path.to_string())) {
277+
for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
278278
if (field_index == field_names_.size()) break;
279279

280280
impl.InsertRepr(static_cast<int>(field_index++), std::move(segment));
@@ -574,11 +574,11 @@ class HivePartitioningFactory : public PartitioningFactory {
574574
std::string type_name() const override { return "hive"; }
575575

576576
Result<std::shared_ptr<Schema>> Inspect(
577-
const std::vector<string_view>& paths) const override {
577+
const std::vector<std::string>& paths) const override {
578578
KeyValuePartitioningInspectImpl impl;
579579

580580
for (auto path : paths) {
581-
for (auto&& segment : fs::internal::SplitAbstractPath(path.to_string())) {
581+
for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
582582
if (auto key = HivePartitioning::ParseKey(segment)) {
583583
impl.InsertRepr(key->name, key->value);
584584
}

cpp/src/arrow/dataset/partition.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class ARROW_DS_EXPORT PartitioningFactory {
9797

9898
/// Get the schema for the resulting Partitioning.
9999
virtual Result<std::shared_ptr<Schema>> Inspect(
100-
const std::vector<util::string_view>& paths) const = 0;
100+
const std::vector<std::string>& paths) const = 0;
101101

102102
/// Create a partitioning using the provided schema
103103
/// (fields may be dropped).

cpp/src/arrow/dataset/partition_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@ class TestPartitioning : public ::testing::Test {
5454
AssertParse(path, expected.Copy());
5555
}
5656

57-
void AssertInspect(const std::vector<util::string_view>& paths,
57+
void AssertInspect(const std::vector<std::string>& paths,
5858
const std::vector<std::shared_ptr<Field>>& expected) {
5959
ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths));
6060
ASSERT_EQ(*actual, Schema(expected));
6161
ASSERT_OK(factory_->Finish(actual).status());
6262
}
6363

64-
void AssertInspectError(const std::vector<util::string_view>& paths) {
64+
void AssertInspectError(const std::vector<std::string>& paths) {
6565
ASSERT_RAISES(Invalid, factory_->Inspect(paths));
6666
}
6767

0 commit comments

Comments
 (0)