Skip to content

Commit bd60dbc

Browse files
committed
ARROW-12790: [C++] Improve HadoopFileSystem conformance
* Ensure the HadoopFileSystem meets most requirements from the FileSystem API. * Implement HadoopFileSystem::CopyFile. * Enable generic filesystem tests for HadoopFileSystem. * Add generic filesystem test for special characters. Closes apache#10574 from pitrou/ARROW-12790-hdfs-special-chars Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 133b1a9 commit bd60dbc

7 files changed

Lines changed: 226 additions & 83 deletions

File tree

cpp/src/arrow/filesystem/hdfs.cc

Lines changed: 63 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,18 @@ class HadoopFileSystem::Impl {
106106
return st;
107107
}
108108
for (const auto& child_path_info : children) {
109-
// HDFS returns an absolute URI here, need to extract path relative to wd
110-
Uri uri;
111-
RETURN_NOT_OK(uri.Parse(child_path_info.name));
112-
std::string child_path = uri.path();
109+
// HDFS returns an absolute "URI" here, need to extract path relative to wd
110+
// XXX: unfortunately, this is not a real URI as special characters
111+
// are not %-escaped... hence parsing it as URI would fail.
112+
std::string child_path;
113113
if (!wd.empty()) {
114-
ARROW_ASSIGN_OR_RAISE(child_path, MakeAbstractPathRelative(wd, child_path));
114+
if (child_path_info.name.substr(0, wd.length()) != wd) {
115+
return Status::IOError("HDFS returned path '", child_path_info.name,
116+
"' that is not a child of '", wd, "'");
117+
}
118+
child_path = child_path_info.name.substr(wd.length());
119+
} else {
120+
child_path = child_path_info.name;
115121
}
116122

117123
FileInfo info;
@@ -134,21 +140,39 @@ class HadoopFileSystem::Impl {
134140
}
135141
std::vector<FileInfo> results;
136142

143+
// Fetch working directory.
144+
// If select.base_dir is relative, we need to trim it from the start
145+
// of paths returned by ListDirectory.
146+
// If select.base_dir is absolute, we need to trim the "URI authority"
147+
// portion of the working directory.
137148
std::string wd;
138-
if (select.base_dir.empty() || select.base_dir.front() != '/') {
139-
// Fetch working directory, because we need to trim it from the start
140-
// of paths returned by ListDirectory as select.base_dir is relative.
141-
RETURN_NOT_OK(client_->GetWorkingDirectory(&wd));
142-
Uri wd_uri;
143-
RETURN_NOT_OK(wd_uri.Parse(wd));
144-
wd = wd_uri.path();
149+
RETURN_NOT_OK(client_->GetWorkingDirectory(&wd));
150+
151+
if (!select.base_dir.empty() && select.base_dir.front() == '/') {
152+
// base_dir is absolute, only keep the URI authority portion.
153+
// As mentioned in StatSelector() above, the URI may contain unescaped
154+
// special chars and therefore may not be a valid URI, so we parse by hand.
155+
auto pos = wd.find("://"); // start of host:port portion
156+
if (pos == std::string::npos) {
157+
return Status::IOError("Unexpected HDFS working directory URI: ", wd);
158+
}
159+
pos = wd.find("/", pos + 3); // end of host:port portion
160+
if (pos == std::string::npos) {
161+
return Status::IOError("Unexpected HDFS working directory URI: ", wd);
162+
}
163+
wd = wd.substr(0, pos); // keep up until host:port (included)
164+
} else if (!wd.empty() && wd.back() != '/') {
165+
// For a relative lookup, trim leading slashes
166+
wd += '/';
145167
}
146168

147-
ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(select.base_dir));
148-
if (info.type() == FileType::File) {
149-
return Status::Invalid(
150-
"GetFileInfo expects base_dir of selector to be a directory, while '",
151-
select.base_dir, "' is a file");
169+
if (!select.base_dir.empty()) {
170+
ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(select.base_dir));
171+
if (info.type() == FileType::File) {
172+
return Status::IOError(
173+
"GetFileInfo expects base_dir of selector to be a directory, but '",
174+
select.base_dir, "' is a file");
175+
}
152176
}
153177
RETURN_NOT_OK(StatSelector(wd, select.base_dir, select, 0, &results));
154178
return results;
@@ -178,6 +202,10 @@ class HadoopFileSystem::Impl {
178202
}
179203

180204
Status DeleteDirContents(const std::string& path) {
205+
if (!IsDirectory(path)) {
206+
return Status::IOError("Cannot delete contents of directory '", path,
207+
"': not a directory");
208+
}
181209
std::vector<std::string> file_list;
182210
RETURN_NOT_OK(client_->GetChildren(path, &file_list));
183211
for (auto file : file_list) {
@@ -195,13 +223,17 @@ class HadoopFileSystem::Impl {
195223
}
196224

197225
Status Move(const std::string& src, const std::string& dest) {
198-
RETURN_NOT_OK(client_->Rename(src, dest));
199-
return Status::OK();
226+
auto st = client_->Rename(src, dest);
227+
if (st.IsIOError() && IsFile(src) && IsFile(dest)) {
228+
// Allow file -> file clobber
229+
RETURN_NOT_OK(client_->Delete(dest));
230+
st = client_->Rename(src, dest);
231+
}
232+
return st;
200233
}
201234

202235
Status CopyFile(const std::string& src, const std::string& dest) {
203-
// TODO implement this (but only if HDFS supports on-server copy)
204-
return Status::NotImplemented("HadoopFileSystem::CopyFile is not supported yet");
236+
return client_->Copy(src, dest);
205237
}
206238

207239
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const std::string& path) {
@@ -253,14 +285,16 @@ class HadoopFileSystem::Impl {
253285

254286
bool IsDirectory(const std::string& path) {
255287
io::HdfsPathInfo info;
256-
Status status = client_->GetPathInfo(path, &info);
257-
if (!status.ok()) {
258-
return false;
259-
}
260-
if (info.kind == io::ObjectType::DIRECTORY) {
261-
return true;
262-
}
263-
return false;
288+
return GetPathInfo(path, &info) && info.kind == io::ObjectType::DIRECTORY;
289+
}
290+
291+
bool IsFile(const std::string& path) {
292+
io::HdfsPathInfo info;
293+
return GetPathInfo(path, &info) && info.kind == io::ObjectType::FILE;
294+
}
295+
296+
bool GetPathInfo(const std::string& path, io::HdfsPathInfo* info) {
297+
return client_->GetPathInfo(path, info).ok();
264298
}
265299

266300
TimePoint ToTimePoint(int secs) {

cpp/src/arrow/filesystem/hdfs_test.cc

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <chrono>
1819
#include <memory>
1920
#include <sstream>
2021
#include <string>
@@ -66,9 +67,9 @@ TEST(TestHdfsOptions, FromUri) {
6667
ASSERT_EQ(options.connection_config.user, "");
6768
}
6869

69-
class TestHadoopFileSystem : public ::testing::Test {
70+
class HadoopFileSystemTestMixin {
7071
public:
71-
void SetUp() override {
72+
void MakeFileSystem() {
7273
const char* host = std::getenv("ARROW_HDFS_TEST_HOST");
7374
const char* port = std::getenv("ARROW_HDFS_TEST_PORT");
7475
const char* user = std::getenv("ARROW_HDFS_TEST_USER");
@@ -91,9 +92,19 @@ class TestHadoopFileSystem : public ::testing::Test {
9192
return;
9293
}
9394
loaded_driver_ = true;
94-
fs_ = std::make_shared<SubTreeFileSystem>("", *result);
95+
fs_ = *result;
9596
}
9697

98+
protected:
99+
HdfsOptions options_;
100+
bool loaded_driver_ = false;
101+
std::shared_ptr<FileSystem> fs_;
102+
};
103+
104+
class TestHadoopFileSystem : public ::testing::Test, public HadoopFileSystemTestMixin {
105+
public:
106+
void SetUp() override { MakeFileSystem(); }
107+
97108
void TestFileSystemFromUri() {
98109
std::stringstream ss;
99110
ss << "hdfs://" << options_.connection_config.host << ":"
@@ -176,17 +187,11 @@ class TestHadoopFileSystem : public ::testing::Test {
176187
ASSERT_OK(fs_->DeleteDir(base_dir + "AB"));
177188
AssertFileInfo(fs_.get(), base_dir + "AB", FileType::NotFound);
178189
}
179-
180-
protected:
181-
std::shared_ptr<FileSystem> fs_;
182-
HdfsOptions options_;
183-
bool loaded_driver_ = false;
184190
};
185191

186-
#define SKIP_IF_NO_DRIVER() \
187-
if (!this->loaded_driver_) { \
188-
ARROW_LOG(INFO) << "Driver not loaded, skipping"; \
189-
return; \
192+
#define SKIP_IF_NO_DRIVER() \
193+
if (!this->loaded_driver_) { \
194+
GTEST_SKIP() << "Driver not loaded, skipping"; \
190195
}
191196

192197
TEST_F(TestHadoopFileSystem, CreateDirDeleteDir) {
@@ -308,5 +313,43 @@ TEST_F(TestHadoopFileSystem, FileSystemFromUri) {
308313
this->TestFileSystemFromUri();
309314
}
310315

316+
class TestHadoopFileSystemGeneric : public ::testing::Test,
317+
public HadoopFileSystemTestMixin,
318+
public GenericFileSystemTest {
319+
public:
320+
void SetUp() override {
321+
MakeFileSystem();
322+
SKIP_IF_NO_DRIVER();
323+
timestamp_ =
324+
static_cast<int64_t>(std::chrono::time_point_cast<std::chrono::nanoseconds>(
325+
std::chrono::steady_clock::now())
326+
.time_since_epoch()
327+
.count());
328+
}
329+
330+
protected:
331+
bool allow_write_file_over_dir() const override { return true; }
332+
bool allow_move_dir_over_non_empty_dir() const override { return true; }
333+
bool have_implicit_directories() const override { return true; }
334+
bool allow_append_to_new_file() const override { return false; }
335+
336+
std::shared_ptr<FileSystem> GetEmptyFileSystem() override {
337+
// Since the HDFS contents are kept persistently between test runs,
338+
// make sure each test gets a pristine fresh directory.
339+
std::stringstream ss;
340+
ss << "GenericTest" << timestamp_ << "-" << test_num_++;
341+
const auto subdir = ss.str();
342+
ARROW_EXPECT_OK(fs_->CreateDir(subdir));
343+
return std::make_shared<SubTreeFileSystem>(subdir, fs_);
344+
}
345+
346+
static int test_num_;
347+
int64_t timestamp_;
348+
};
349+
350+
int TestHadoopFileSystemGeneric::test_num_ = 1;
351+
352+
GENERIC_FS_TEST_FUNCTIONS(TestHadoopFileSystemGeneric);
353+
311354
} // namespace fs
312355
} // namespace arrow

cpp/src/arrow/filesystem/test_util.cc

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -452,20 +452,28 @@ void GenericFileSystemTest::TestMoveDir(FileSystem* fs) {
452452
AssertAllDirs(fs, {"EF", "KL", "KL/CD"});
453453
AssertAllFiles(fs, {"EF/ghi", "KL/CD/def", "KL/abc"});
454454

455-
// Destination is a non-empty directory
456-
ASSERT_RAISES(IOError, fs->Move("KL", "EF"));
457-
AssertAllDirs(fs, {"EF", "KL", "KL/CD"});
458-
AssertAllFiles(fs, {"EF/ghi", "KL/CD/def", "KL/abc"});
459-
460455
// Cannot move directory inside itself
461456
ASSERT_RAISES(IOError, fs->Move("KL", "KL/ZZ"));
462457

463-
// (other errors tested in TestMoveFile)
464-
465458
// Contents didn't change
466459
AssertAllDirs(fs, {"EF", "KL", "KL/CD"});
467460
AssertFileContents(fs, "KL/abc", "abc data");
468461
AssertFileContents(fs, "KL/CD/def", "def data");
462+
463+
// Destination is a non-empty directory
464+
if (!allow_move_dir_over_non_empty_dir()) {
465+
ASSERT_RAISES(IOError, fs->Move("KL", "EF"));
466+
AssertAllDirs(fs, {"EF", "KL", "KL/CD"});
467+
AssertAllFiles(fs, {"EF/ghi", "KL/CD/def", "KL/abc"});
468+
} else {
469+
// In some filesystems such as HDFS, this operation is interpreted
470+
// as with the Unix `mv` command, i.e. move KL *inside* EF.
471+
ASSERT_OK(fs->Move("KL", "EF"));
472+
AssertAllDirs(fs, {"EF", "EF/KL", "EF/KL/CD"});
473+
AssertAllFiles(fs, {"EF/KL/CD/def", "EF/KL/abc", "EF/ghi"});
474+
}
475+
476+
// (other errors tested in TestMoveFile)
469477
}
470478

471479
void GenericFileSystemTest::TestCopyFile(FileSystem* fs) {
@@ -888,7 +896,11 @@ void GenericFileSystemTest::TestOpenAppendStream(FileSystem* fs) {
888896

889897
std::shared_ptr<io::OutputStream> stream;
890898

891-
ASSERT_OK_AND_ASSIGN(stream, fs->OpenAppendStream("abc"));
899+
if (allow_append_to_new_file()) {
900+
ASSERT_OK_AND_ASSIGN(stream, fs->OpenAppendStream("abc"));
901+
} else {
902+
ASSERT_OK_AND_ASSIGN(stream, fs->OpenOutputStream("abc"));
903+
}
892904
ASSERT_OK_AND_EQ(0, stream->Tell());
893905
ASSERT_OK(stream->Write("some "));
894906
ASSERT_OK(stream->Write(Buffer::FromString("data")));
@@ -1050,6 +1062,26 @@ void GenericFileSystemTest::TestOpenInputFileWithFileInfo(FileSystem* fs) {
10501062
ASSERT_RAISES(IOError, fs->OpenInputFile(info));
10511063
}
10521064

1065+
void GenericFileSystemTest::TestSpecialChars(FileSystem* fs) {
1066+
ASSERT_OK(fs->CreateDir("Blank Char"));
1067+
CreateFile(fs, "Blank Char/Special%Char.txt", "data");
1068+
std::vector<std::string> all_dirs{"Blank Char"};
1069+
1070+
AssertAllDirs(fs, all_dirs);
1071+
AssertAllFiles(fs, {"Blank Char/Special%Char.txt"});
1072+
AssertFileContents(fs, "Blank Char/Special%Char.txt", "data");
1073+
1074+
ASSERT_OK(fs->CopyFile("Blank Char/Special%Char.txt", "Special and%different.txt"));
1075+
AssertAllDirs(fs, all_dirs);
1076+
AssertAllFiles(fs, {"Blank Char/Special%Char.txt", "Special and%different.txt"});
1077+
AssertFileContents(fs, "Special and%different.txt", "data");
1078+
1079+
ASSERT_OK(fs->DeleteFile("Special and%different.txt"));
1080+
ASSERT_OK(fs->DeleteDir("Blank Char"));
1081+
AssertAllDirs(fs, {});
1082+
AssertAllFiles(fs, {});
1083+
}
1084+
10531085
#define GENERIC_FS_TEST_DEFINE(FUNC_NAME) \
10541086
void GenericFileSystemTest::FUNC_NAME() { FUNC_NAME(GetEmptyFileSystem().get()); }
10551087

@@ -1078,6 +1110,7 @@ GENERIC_FS_TEST_DEFINE(TestOpenInputStreamAsync)
10781110
GENERIC_FS_TEST_DEFINE(TestOpenInputFile)
10791111
GENERIC_FS_TEST_DEFINE(TestOpenInputFileWithFileInfo)
10801112
GENERIC_FS_TEST_DEFINE(TestOpenInputFileAsync)
1113+
GENERIC_FS_TEST_DEFINE(TestSpecialChars)
10811114

10821115
#undef GENERIC_FS_TEST_DEFINE
10831116

cpp/src/arrow/filesystem/test_util.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest {
121121
void TestOpenInputFile();
122122
void TestOpenInputFileWithFileInfo();
123123
void TestOpenInputFileAsync();
124+
void TestSpecialChars();
124125

125126
protected:
126127
// This function should return the filesystem under test.
@@ -134,8 +135,12 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest {
134135
virtual bool allow_write_file_over_dir() const { return false; }
135136
// - Whether the filesystem allows moving a directory
136137
virtual bool allow_move_dir() const { return true; }
138+
// - Whether the filesystem allows moving a directory "over" a non-empty destination
139+
virtual bool allow_move_dir_over_non_empty_dir() const { return false; }
137140
// - Whether the filesystem allows appending to a file
138141
virtual bool allow_append_to_file() const { return true; }
142+
// - Whether the filesystem allows appending to a new (not existent yet) file
143+
virtual bool allow_append_to_new_file() const { return true; }
139144
// - Whether the filesystem supports directory modification times
140145
virtual bool have_directory_mtimes() const { return true; }
141146
// - Whether some directory tree deletion tests may fail randomly
@@ -168,6 +173,7 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest {
168173
void TestOpenInputFile(FileSystem* fs);
169174
void TestOpenInputFileWithFileInfo(FileSystem* fs);
170175
void TestOpenInputFileAsync(FileSystem* fs);
176+
void TestSpecialChars(FileSystem* fs);
171177
};
172178

173179
#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \
@@ -198,7 +204,8 @@ class ARROW_TESTING_EXPORT GenericFileSystemTest {
198204
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync) \
199205
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile) \
200206
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo) \
201-
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync)
207+
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync) \
208+
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars)
202209

203210
#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \
204211
GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS)

0 commit comments

Comments
 (0)