Skip to content

Commit e1d574c

Browse files
wesm authored and xhochy committed
ARROW-1301: [C++/Python] More complete filesystem API for HDFS
This also includes a fair bit of API normalization and cleaning.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes apache#926 from wesm/ARROW-1301 and squashes the following commits:

bcc9310 [Wes McKinney] Add missing API
8bf51f5 [Wes McKinney] Add more filesystem methods, tests for HDFS
98847b5 [Wes McKinney] Some HDFS refactoring. Implement chmod, chown. Normalize Filesystem->FileSystem
1 parent 3a84653 commit e1d574c

19 files changed

Lines changed: 915 additions & 446 deletions

cpp/src/arrow/io/hdfs-internal.cc

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -462,21 +462,11 @@ tOffset LibHdfsShim::GetUsed(hdfsFS fs) { return this->hdfsGetUsed(fs); }
462462

463463
int LibHdfsShim::Chown(hdfsFS fs, const char* path, const char* owner,
464464
const char* group) {
465-
GET_SYMBOL(this, hdfsChown);
466-
if (this->hdfsChown) {
467-
return this->hdfsChown(fs, path, owner, group);
468-
} else {
469-
return 0;
470-
}
465+
return this->hdfsChown(fs, path, owner, group);
471466
}
472467

473468
int LibHdfsShim::Chmod(hdfsFS fs, const char* path, short mode) { // NOLINT
474-
GET_SYMBOL(this, hdfsChmod);
475-
if (this->hdfsChmod) {
476-
return this->hdfsChmod(fs, path, mode);
477-
} else {
478-
return 0;
479-
}
469+
return this->hdfsChmod(fs, path, mode);
480470
}
481471

482472
int LibHdfsShim::Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime) {
@@ -504,6 +494,8 @@ Status LibHdfsShim::GetRequiredSymbols() {
504494
GET_SYMBOL_REQUIRED(this, hdfsGetUsed);
505495
GET_SYMBOL_REQUIRED(this, hdfsGetPathInfo);
506496
GET_SYMBOL_REQUIRED(this, hdfsListDirectory);
497+
GET_SYMBOL_REQUIRED(this, hdfsChown);
498+
GET_SYMBOL_REQUIRED(this, hdfsChmod);
507499

508500
// File methods
509501
GET_SYMBOL_REQUIRED(this, hdfsCloseFile);

cpp/src/arrow/io/hdfs.cc

Lines changed: 81 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,9 @@ static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) {
308308
}
309309

310310
// Private implementation
311-
class HdfsClient::HdfsClientImpl {
311+
class HadoopFileSystem::HadoopFileSystemImpl {
312312
public:
313-
HdfsClientImpl() {}
313+
HadoopFileSystemImpl() {}
314314

315315
Status Connect(const HdfsConnectionConfig* config) {
316316
if (config->driver == HdfsDriver::LIBHDFS3) {
@@ -396,6 +396,24 @@ class HdfsClient::HdfsClientImpl {
396396
return Status::OK();
397397
}
398398

399+
Status Stat(const std::string& path, FileStatistics* stat) {
400+
HdfsPathInfo info;
401+
RETURN_NOT_OK(GetPathInfo(path, &info));
402+
403+
stat->size = info.size;
404+
stat->kind = info.kind;
405+
return Status::OK();
406+
}
407+
408+
Status GetChildren(const std::string& path, std::vector<std::string>* listing) {
409+
std::vector<HdfsPathInfo> detailed_listing;
410+
RETURN_NOT_OK(ListDirectory(path, &detailed_listing));
411+
for (const auto& info : detailed_listing) {
412+
listing->push_back(info.name);
413+
}
414+
return Status::OK();
415+
}
416+
399417
Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing) {
400418
int num_entries = 0;
401419
hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries);
@@ -476,6 +494,18 @@ class HdfsClient::HdfsClientImpl {
476494
return Status::OK();
477495
}
478496

497+
Status Chmod(const std::string& path, int mode) {
498+
int ret = driver_->Chmod(fs_, path.c_str(), static_cast<short>(mode)); // NOLINT
499+
CHECK_FAILURE(ret, "Chmod");
500+
return Status::OK();
501+
}
502+
503+
Status Chown(const std::string& path, const char* owner, const char* group) {
504+
int ret = driver_->Chown(fs_, path.c_str(), owner, group);
505+
CHECK_FAILURE(ret, "Chown");
506+
return Status::OK();
507+
}
508+
479509
private:
480510
LibHdfsShim* driver_;
481511

@@ -490,68 +520,92 @@ class HdfsClient::HdfsClientImpl {
490520
// ----------------------------------------------------------------------
491521
// Public API for HadoopFileSystem
492522

493-
HdfsClient::HdfsClient() { impl_.reset(new HdfsClientImpl()); }
523+
HadoopFileSystem::HadoopFileSystem() { impl_.reset(new HadoopFileSystemImpl()); }
494524

495-
HdfsClient::~HdfsClient() {}
525+
HadoopFileSystem::~HadoopFileSystem() {}
496526

497-
Status HdfsClient::Connect(const HdfsConnectionConfig* config,
498-
std::shared_ptr<HdfsClient>* fs) {
527+
Status HadoopFileSystem::Connect(const HdfsConnectionConfig* config,
528+
std::shared_ptr<HadoopFileSystem>* fs) {
499529
// ctor is private, make_shared will not work
500-
*fs = std::shared_ptr<HdfsClient>(new HdfsClient());
530+
*fs = std::shared_ptr<HadoopFileSystem>(new HadoopFileSystem());
501531

502532
RETURN_NOT_OK((*fs)->impl_->Connect(config));
503533
return Status::OK();
504534
}
505535

506-
Status HdfsClient::MakeDirectory(const std::string& path) {
536+
Status HadoopFileSystem::MakeDirectory(const std::string& path) {
507537
return impl_->MakeDirectory(path);
508538
}
509539

510-
Status HdfsClient::Delete(const std::string& path, bool recursive) {
540+
Status HadoopFileSystem::Delete(const std::string& path, bool recursive) {
511541
return impl_->Delete(path, recursive);
512542
}
513543

514-
Status HdfsClient::Disconnect() { return impl_->Disconnect(); }
544+
Status HadoopFileSystem::DeleteDirectory(const std::string& path) {
545+
return Delete(path, true);
546+
}
547+
548+
Status HadoopFileSystem::Disconnect() { return impl_->Disconnect(); }
515549

516-
bool HdfsClient::Exists(const std::string& path) { return impl_->Exists(path); }
550+
bool HadoopFileSystem::Exists(const std::string& path) { return impl_->Exists(path); }
517551

518-
Status HdfsClient::GetPathInfo(const std::string& path, HdfsPathInfo* info) {
552+
Status HadoopFileSystem::GetPathInfo(const std::string& path, HdfsPathInfo* info) {
519553
return impl_->GetPathInfo(path, info);
520554
}
521555

522-
Status HdfsClient::GetCapacity(int64_t* nbytes) { return impl_->GetCapacity(nbytes); }
556+
Status HadoopFileSystem::Stat(const std::string& path, FileStatistics* stat) {
557+
return impl_->Stat(path, stat);
558+
}
559+
560+
Status HadoopFileSystem::GetCapacity(int64_t* nbytes) {
561+
return impl_->GetCapacity(nbytes);
562+
}
563+
564+
Status HadoopFileSystem::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); }
523565

524-
Status HdfsClient::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); }
566+
Status HadoopFileSystem::GetChildren(const std::string& path,
567+
std::vector<std::string>* listing) {
568+
return impl_->GetChildren(path, listing);
569+
}
525570

526-
Status HdfsClient::ListDirectory(const std::string& path,
527-
std::vector<HdfsPathInfo>* listing) {
571+
Status HadoopFileSystem::ListDirectory(const std::string& path,
572+
std::vector<HdfsPathInfo>* listing) {
528573
return impl_->ListDirectory(path, listing);
529574
}
530575

531-
Status HdfsClient::OpenReadable(const std::string& path, int32_t buffer_size,
532-
std::shared_ptr<HdfsReadableFile>* file) {
576+
Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size,
577+
std::shared_ptr<HdfsReadableFile>* file) {
533578
return impl_->OpenReadable(path, buffer_size, file);
534579
}
535580

536-
Status HdfsClient::OpenReadable(const std::string& path,
537-
std::shared_ptr<HdfsReadableFile>* file) {
581+
Status HadoopFileSystem::OpenReadable(const std::string& path,
582+
std::shared_ptr<HdfsReadableFile>* file) {
538583
return OpenReadable(path, kDefaultHdfsBufferSize, file);
539584
}
540585

541-
Status HdfsClient::OpenWriteable(const std::string& path, bool append,
542-
int32_t buffer_size, int16_t replication,
543-
int64_t default_block_size,
544-
std::shared_ptr<HdfsOutputStream>* file) {
586+
Status HadoopFileSystem::OpenWriteable(const std::string& path, bool append,
587+
int32_t buffer_size, int16_t replication,
588+
int64_t default_block_size,
589+
std::shared_ptr<HdfsOutputStream>* file) {
545590
return impl_->OpenWriteable(path, append, buffer_size, replication, default_block_size,
546591
file);
547592
}
548593

549-
Status HdfsClient::OpenWriteable(const std::string& path, bool append,
550-
std::shared_ptr<HdfsOutputStream>* file) {
594+
Status HadoopFileSystem::OpenWriteable(const std::string& path, bool append,
595+
std::shared_ptr<HdfsOutputStream>* file) {
551596
return OpenWriteable(path, append, 0, 0, 0, file);
552597
}
553598

554-
Status HdfsClient::Rename(const std::string& src, const std::string& dst) {
599+
Status HadoopFileSystem::Chmod(const std::string& path, int mode) {
600+
return impl_->Chmod(path, mode);
601+
}
602+
603+
Status HadoopFileSystem::Chown(const std::string& path, const char* owner,
604+
const char* group) {
605+
return impl_->Chown(path, owner, group);
606+
}
607+
608+
Status HadoopFileSystem::Rename(const std::string& src, const std::string& dst) {
555609
return impl_->Rename(src, dst);
556610
}
557611

cpp/src/arrow/io/hdfs.h

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class Status;
3434

3535
namespace io {
3636

37-
class HdfsClient;
37+
class HadoopFileSystem;
3838
class HdfsReadableFile;
3939
class HdfsOutputStream;
4040

@@ -66,30 +66,32 @@ struct HdfsConnectionConfig {
6666
HdfsDriver driver;
6767
};
6868

69-
class ARROW_EXPORT HdfsClient : public FileSystemClient {
69+
class ARROW_EXPORT HadoopFileSystem : public FileSystem {
7070
public:
71-
~HdfsClient();
71+
~HadoopFileSystem();
7272

7373
// Connect to an HDFS cluster given a configuration
7474
//
7575
// @param config (in): configuration for connecting
7676
// @param fs (out): the created client
7777
// @returns Status
7878
static Status Connect(const HdfsConnectionConfig* config,
79-
std::shared_ptr<HdfsClient>* fs);
79+
std::shared_ptr<HadoopFileSystem>* fs);
8080

8181
// Create directory and all parents
8282
//
8383
// @param path (in): absolute HDFS path
8484
// @returns Status
85-
Status MakeDirectory(const std::string& path);
85+
Status MakeDirectory(const std::string& path) override;
8686

8787
// Delete file or directory
8888
// @param path: absolute path to data
8989
// @param recursive: if path is a directory, delete contents as well
9090
// @returns error status on failure
9191
Status Delete(const std::string& path, bool recursive = false);
9292

93+
Status DeleteDirectory(const std::string& path) override;
94+
9395
// Disconnect from cluster
9496
//
9597
// @returns Status
@@ -112,18 +114,29 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient {
112114
// @returns Status
113115
Status GetUsed(int64_t* nbytes);
114116

117+
Status GetChildren(const std::string& path, std::vector<std::string>* listing) override;
118+
115119
Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);
116120

117-
// @param path file path to change
118-
// @param owner pass nullptr for no change
119-
// @param group pass nullptr for no change
121+
/// Change the owner and/or group of a path
122+
///
123+
/// @param path file path to change
124+
/// @param owner pass nullptr for no change
125+
/// @param group pass nullptr for no change
120126
Status Chown(const std::string& path, const char* owner, const char* group);
121127

128+
/// Change path permissions
129+
///
130+
/// \param path Absolute path in file system
131+
/// \param mode Mode bitset
132+
/// \return Status
122133
Status Chmod(const std::string& path, int mode);
123134

124135
// Move file or directory from source path to destination path within the
125136
// current filesystem
126-
Status Rename(const std::string& src, const std::string& dst);
137+
Status Rename(const std::string& src, const std::string& dst) override;
138+
139+
Status Stat(const std::string& path, FileStatistics* stat) override;
127140

128141
// TODO(wesm): GetWorkingDirectory, SetWorkingDirectory
129142

@@ -152,13 +165,18 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient {
152165
friend class HdfsReadableFile;
153166
friend class HdfsOutputStream;
154167

155-
class ARROW_NO_EXPORT HdfsClientImpl;
156-
std::unique_ptr<HdfsClientImpl> impl_;
168+
class ARROW_NO_EXPORT HadoopFileSystemImpl;
169+
std::unique_ptr<HadoopFileSystemImpl> impl_;
157170

158-
HdfsClient();
159-
DISALLOW_COPY_AND_ASSIGN(HdfsClient);
171+
HadoopFileSystem();
172+
DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem);
160173
};
161174

175+
// 0.6.0
176+
#ifndef ARROW_NO_DEPRECATED_API
177+
using HdfsClient = HadoopFileSystem;
178+
#endif
179+
162180
class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
163181
public:
164182
~HdfsReadableFile();
@@ -191,7 +209,7 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
191209
class ARROW_NO_EXPORT HdfsReadableFileImpl;
192210
std::unique_ptr<HdfsReadableFileImpl> impl_;
193211

194-
friend class HdfsClient::HdfsClientImpl;
212+
friend class HadoopFileSystem::HadoopFileSystemImpl;
195213

196214
DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile);
197215
};
@@ -216,7 +234,7 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream {
216234
class ARROW_NO_EXPORT HdfsOutputStreamImpl;
217235
std::unique_ptr<HdfsOutputStreamImpl> impl_;
218236

219-
friend class HdfsClient::HdfsClientImpl;
237+
friend class HadoopFileSystem::HadoopFileSystemImpl;
220238

221239
HdfsOutputStream();
222240

cpp/src/arrow/io/interfaces.h

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <memory>
2323
#include <mutex>
2424
#include <string>
25+
#include <vector>
2526

2627
#include "arrow/util/macros.h"
2728
#include "arrow/util/visibility.h"
@@ -42,9 +43,29 @@ struct ObjectType {
4243
enum type { FILE, DIRECTORY };
4344
};
4445

45-
class ARROW_EXPORT FileSystemClient {
46+
struct ARROW_EXPORT FileStatistics {
47+
/// Size of file, -1 if finding length is unsupported
48+
int64_t size;
49+
ObjectType::type kind;
50+
51+
FileStatistics() {}
52+
FileStatistics(int64_t size, ObjectType::type kind) : size(size), kind(kind) {}
53+
};
54+
55+
class ARROW_EXPORT FileSystem {
4656
public:
47-
virtual ~FileSystemClient() {}
57+
virtual ~FileSystem() {}
58+
59+
virtual Status MakeDirectory(const std::string& path) = 0;
60+
61+
virtual Status DeleteDirectory(const std::string& path) = 0;
62+
63+
virtual Status GetChildren(const std::string& path,
64+
std::vector<std::string>* listing) = 0;
65+
66+
virtual Status Rename(const std::string& src, const std::string& dst) = 0;
67+
68+
virtual Status Stat(const std::string& path, FileStatistics* stat) = 0;
4869
};
4970

5071
class ARROW_EXPORT FileInterface {

0 commit comments

Comments
 (0)