Skip to content

Commit 1d7bc3e

Browse files
ARROW-13572: [C++][Datasets] Add ORC support to Datasets API
Closes apache#10991 from jorisvandenbossche/ARROW-13572-dataset-orc Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
1 parent 2baf1a0 commit 1d7bc3e

16 files changed

Lines changed: 591 additions & 18 deletions

File tree

cpp/src/arrow/dataset/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ if(ARROW_CSV)
3535
set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_csv.cc)
3636
endif()
3737

38+
if(ARROW_ORC)
39+
set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_orc.cc)
40+
endif()
41+
3842
if(ARROW_PARQUET)
3943
set(ARROW_DATASET_LINK_STATIC ${ARROW_DATASET_LINK_STATIC} parquet_static)
4044
set(ARROW_DATASET_LINK_SHARED ${ARROW_DATASET_LINK_SHARED} parquet_shared)
@@ -116,6 +120,10 @@ if(ARROW_CSV)
116120
add_arrow_dataset_test(file_csv_test)
117121
endif()
118122

123+
if(ARROW_ORC)
124+
add_arrow_dataset_test(file_orc_test)
125+
endif()
126+
119127
if(ARROW_PARQUET)
120128
add_arrow_dataset_test(file_parquet_test)
121129
endif()

cpp/src/arrow/dataset/api.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,6 @@
2525
#include "arrow/dataset/file_base.h"
2626
#include "arrow/dataset/file_csv.h"
2727
#include "arrow/dataset/file_ipc.h"
28+
#include "arrow/dataset/file_orc.h"
2829
#include "arrow/dataset/file_parquet.h"
2930
#include "arrow/dataset/scanner.h"

cpp/src/arrow/dataset/file_orc.cc

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/dataset/file_orc.h"
19+
20+
#include <memory>
21+
22+
#include "arrow/adapters/orc/adapter.h"
23+
#include "arrow/dataset/dataset_internal.h"
24+
#include "arrow/dataset/file_base.h"
25+
#include "arrow/dataset/scanner.h"
26+
#include "arrow/util/checked_cast.h"
27+
#include "arrow/util/iterator.h"
28+
#include "arrow/util/logging.h"
29+
30+
namespace arrow {
31+
32+
using internal::checked_pointer_cast;
33+
34+
namespace dataset {
35+
36+
namespace {
37+
38+
/// \brief Open an ORC reader on the given file source.
///
/// \param source the file to open
/// \param scan_options optional scan options; when non-null its memory pool is
///   handed to the reader, otherwise the default memory pool is used
/// \return the opened reader, or the failure of opening `source`, or the ORC
///   reader's failure with the source path prepended to the message
inline Result<std::unique_ptr<arrow::adapters::orc::ORCFileReader>> OpenReader(
    const FileSource& source,
    const std::shared_ptr<ScanOptions>& scan_options = nullptr) {
  ARROW_ASSIGN_OR_RAISE(auto input, source.Open());

  arrow::MemoryPool* pool =
      scan_options ? scan_options->pool : default_memory_pool();

  auto maybe_reader =
      arrow::adapters::orc::ORCFileReader::Open(std::move(input), pool);
  if (maybe_reader.ok()) {
    return maybe_reader;
  }

  // Keep the original status code but mention which source failed to open.
  auto failure = maybe_reader.status();
  return failure.WithMessage("Could not open ORC input source '", source.path(),
                             "': ", failure.message());
}
58+
59+
/// \brief A ScanTask backed by an ORC file.
60+
class OrcScanTask : public ScanTask {
61+
public:
62+
OrcScanTask(std::shared_ptr<FileFragment> fragment,
63+
std::shared_ptr<ScanOptions> options)
64+
: ScanTask(std::move(options), fragment), source_(fragment->source()) {}
65+
66+
Result<RecordBatchIterator> Execute() override {
67+
struct Impl {
68+
static Result<RecordBatchIterator> Make(const FileSource& source,
69+
const FileFormat& format,
70+
const ScanOptions& scan_options) {
71+
ARROW_ASSIGN_OR_RAISE(
72+
auto reader, OpenReader(source, std::make_shared<ScanOptions>(scan_options)));
73+
int num_stripes = reader->NumberOfStripes();
74+
return RecordBatchIterator(Impl{std::move(reader), 0, num_stripes});
75+
}
76+
77+
Result<std::shared_ptr<RecordBatch>> Next() {
78+
if (i_ == num_stripes_) {
79+
return nullptr;
80+
}
81+
std::shared_ptr<RecordBatch> batch;
82+
// TODO (https://issues.apache.org/jira/browse/ARROW-13797)
83+
// determine included fields from options_->MaterializedFields() to
84+
// optimize the column selection (see _column_index_lookup in python
85+
// orc.py for custom logic)
86+
// std::vector<int> included_fields;
87+
// TODO (https://issues.apache.org/jira/browse/ARROW-14153)
88+
// pass scan_options_->batch_size
89+
return reader_->ReadStripe(i_++);
90+
}
91+
92+
std::unique_ptr<arrow::adapters::orc::ORCFileReader> reader_;
93+
int i_;
94+
int num_stripes_;
95+
};
96+
97+
return Impl::Make(source_, *checked_pointer_cast<FileFragment>(fragment_)->format(),
98+
*options_);
99+
}
100+
101+
private:
102+
FileSource source_;
103+
};
104+
105+
class OrcScanTaskIterator {
106+
public:
107+
static Result<ScanTaskIterator> Make(std::shared_ptr<ScanOptions> options,
108+
std::shared_ptr<FileFragment> fragment) {
109+
return ScanTaskIterator(OrcScanTaskIterator(std::move(options), std::move(fragment)));
110+
}
111+
112+
Result<std::shared_ptr<ScanTask>> Next() {
113+
if (once_) {
114+
// Iteration is done.
115+
return nullptr;
116+
}
117+
118+
once_ = true;
119+
return std::shared_ptr<ScanTask>(new OrcScanTask(fragment_, options_));
120+
}
121+
122+
private:
123+
OrcScanTaskIterator(std::shared_ptr<ScanOptions> options,
124+
std::shared_ptr<FileFragment> fragment)
125+
: options_(std::move(options)), fragment_(std::move(fragment)) {}
126+
127+
bool once_ = false;
128+
std::shared_ptr<ScanOptions> options_;
129+
std::shared_ptr<FileFragment> fragment_;
130+
};
131+
132+
} // namespace
133+
134+
/// \brief Report whether `source` can be opened as an ORC file.
///
/// A failure to open the source itself is returned as an error; a source that
/// opens but cannot be read by the ORC reader yields `false`.
Result<bool> OrcFileFormat::IsSupported(const FileSource& source) const {
  RETURN_NOT_OK(source.Open().status());
  const bool readable_as_orc = OpenReader(source).ok();
  return readable_as_orc;
}
138+
139+
/// \brief Open `source` with the ORC reader and return the schema it reports.
Result<std::shared_ptr<Schema>> OrcFileFormat::Inspect(const FileSource& source) const {
  ARROW_ASSIGN_OR_RAISE(auto orc_reader, OpenReader(source));
  return orc_reader->ReadSchema();
}
143+
144+
/// \brief Create the scan-task iterator for one ORC fragment.
///
/// Delegates to OrcScanTaskIterator::Make; the file is not opened here.
Result<ScanTaskIterator> OrcFileFormat::ScanFile(
    const std::shared_ptr<ScanOptions>& options,
    const std::shared_ptr<FileFragment>& fragment) const {
  auto scan_tasks = OrcScanTaskIterator::Make(options, fragment);
  return scan_tasks;
}
149+
150+
/// \brief Count the rows of an ORC fragment from the reader's row count.
///
/// When `predicate` references any field, an already-finished future holding
/// nullopt is returned. Otherwise a task is submitted to the executor of
/// `options->io_context` that opens the file and returns NumberOfRows().
Future<util::optional<int64_t>> OrcFileFormat::CountRows(
    const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
    const std::shared_ptr<ScanOptions>& options) {
  using MaybeCount = util::optional<int64_t>;

  if (ExpressionHasFieldRefs(predicate)) {
    return Future<MaybeCount>::MakeFinished(util::nullopt);
  }

  // `self` is captured but not used inside the task, so the closure holds a
  // reference to this format for as long as the task exists.
  auto self = checked_pointer_cast<OrcFileFormat>(shared_from_this());
  auto count_task = [self, file]() -> Result<MaybeCount> {
    ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(file->source()));
    return reader->NumberOfRows();
  };
  return DeferNotOk(options->io_context.executor()->Submit(std::move(count_task)));
}
163+
164+
// //
165+
// // OrcFileWriter, OrcFileWriteOptions
166+
// //
167+
168+
std::shared_ptr<FileWriteOptions> OrcFileFormat::DefaultWriteOptions() {
169+
// TODO (https://issues.apache.org/jira/browse/ARROW-13796)
170+
return nullptr;
171+
}
172+
173+
/// \brief Placeholder: always fails with NotImplemented.
///
/// None of the arguments are inspected.
Result<std::shared_ptr<FileWriter>> OrcFileFormat::MakeWriter(
    std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
    std::shared_ptr<FileWriteOptions> options,
    fs::FileLocator destination_locator) const {
  // TODO (https://issues.apache.org/jira/browse/ARROW-13796)
  Status not_implemented = Status::NotImplemented("ORC writer not yet implemented.");
  return not_implemented;
}
180+
181+
} // namespace dataset
182+
} // namespace arrow

cpp/src/arrow/dataset/file_orc.h

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
// This API is EXPERIMENTAL.
19+
20+
#pragma once
21+
22+
#include <memory>
23+
#include <string>
24+
25+
#include "arrow/dataset/file_base.h"
26+
#include "arrow/dataset/type_fwd.h"
27+
#include "arrow/dataset/visibility.h"
28+
#include "arrow/io/type_fwd.h"
29+
#include "arrow/result.h"
30+
31+
namespace arrow {
32+
namespace dataset {
33+
34+
/// \addtogroup dataset-file-formats
35+
///
36+
/// @{
37+
38+
/// \brief Type name returned by OrcFileFormat::type_name()
constexpr char kOrcTypeName[] = "orc";
39+
40+
/// \brief A FileFormat implementation that reads from and writes to ORC files
41+
class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
42+
public:
43+
std::string type_name() const override { return kOrcTypeName; }
44+
45+
bool Equals(const FileFormat& other) const override {
46+
return type_name() == other.type_name();
47+
}
48+
49+
Result<bool> IsSupported(const FileSource& source) const override;
50+
51+
/// \brief Return the schema of the file if possible.
52+
Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
53+
54+
/// \brief Open a file for scanning
55+
Result<ScanTaskIterator> ScanFile(
56+
const std::shared_ptr<ScanOptions>& options,
57+
const std::shared_ptr<FileFragment>& fragment) const override;
58+
59+
// TODO add async version (https://issues.apache.org/jira/browse/ARROW-13795)
60+
// Result<RecordBatchGenerator> ScanBatchesAsync(
61+
// const std::shared_ptr<ScanOptions>& options,
62+
// const std::shared_ptr<FileFragment>& file) const override;
63+
64+
Future<util::optional<int64_t>> CountRows(
65+
const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
66+
const std::shared_ptr<ScanOptions>& options) override;
67+
68+
Result<std::shared_ptr<FileWriter>> MakeWriter(
69+
std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
70+
std::shared_ptr<FileWriteOptions> options,
71+
fs::FileLocator destination_locator) const override;
72+
73+
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
74+
};
75+
76+
/// @}
77+
78+
} // namespace dataset
79+
} // namespace arrow
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/dataset/file_orc.h"
19+
20+
#include <memory>
21+
#include <utility>
22+
23+
#include "arrow/adapters/orc/adapter.h"
24+
#include "arrow/dataset/dataset_internal.h"
25+
#include "arrow/dataset/discovery.h"
26+
#include "arrow/dataset/file_base.h"
27+
#include "arrow/dataset/partition.h"
28+
#include "arrow/dataset/scanner_internal.h"
29+
#include "arrow/dataset/test_util.h"
30+
#include "arrow/io/memory.h"
31+
#include "arrow/record_batch.h"
32+
#include "arrow/table.h"
33+
#include "arrow/testing/gtest_util.h"
34+
#include "arrow/testing/util.h"
35+
36+
namespace arrow {
37+
namespace dataset {
38+
39+
class OrcFormatHelper {
40+
public:
41+
using FormatType = OrcFileFormat;
42+
static Result<std::shared_ptr<Buffer>> Write(RecordBatchReader* reader) {
43+
ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create());
44+
ARROW_ASSIGN_OR_RAISE(auto writer, adapters::orc::ORCFileWriter::Open(sink.get()));
45+
std::shared_ptr<Table> table;
46+
RETURN_NOT_OK(reader->ReadAll(&table));
47+
writer->Write(*table);
48+
RETURN_NOT_OK(writer->Close());
49+
return sink->Finish();
50+
}
51+
52+
static std::shared_ptr<OrcFileFormat> MakeFormat() {
53+
return std::make_shared<OrcFileFormat>();
54+
}
55+
};
56+
57+
// Fixture that instantiates the shared FileFormatFixtureMixin checks with
// OrcFormatHelper.
class TestOrcFileFormat : public FileFormatFixtureMixin<OrcFormatHelper> {};
58+
59+
// Write test is commented out; OrcFileFormat::MakeWriter in this commit returns
// NotImplemented.
// TEST_F(TestOrcFileFormat, WriteRecordBatchReader) { TestWrite(); }
60+
61+
// NOTE(review): presumably checks that inspecting a non-ORC source fails with
// IOError and a message containing "ORC" -- confirm against the mixin.
TEST_F(TestOrcFileFormat, InspectFailureWithRelevantError) {
62+
TestInspectFailureWithRelevantError(StatusCode::IOError, "ORC");
63+
}
64+
// The remaining cases delegate entirely to the mixin's shared checks.
TEST_F(TestOrcFileFormat, Inspect) { TestInspect(); }
65+
TEST_F(TestOrcFileFormat, IsSupported) { TestIsSupported(); }
66+
TEST_F(TestOrcFileFormat, CountRows) { TestCountRows(); }
67+
68+
// TODO add TestOrcFileSystemDataset if write support is added
69+
70+
// Parameterized fixture that instantiates the shared FileFormatScanMixin checks
// with OrcFormatHelper.
class TestOrcFileFormatScan : public FileFormatScanMixin<OrcFormatHelper> {};
71+
72+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReader) { TestScan(); }
73+
TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderWithVirtualColumn) {
74+
TestScanWithVirtualColumn();
75+
}
76+
// Projection tests are commented out.
// NOTE(review): presumably because column selection is not yet implemented in
// the ORC scan task (ARROW-13797) -- confirm before re-enabling.
// TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjected) { TestScanProjected(); }
77+
// TEST_P(TestOrcFileFormatScan, ScanRecordBatchReaderProjectedMissingCols) {
78+
// TestScanProjectedMissingCols();
79+
// }
80+
// Run the scan tests once per parameter set returned by
// TestFormatParams::Values().
INSTANTIATE_TEST_SUITE_P(TestScan, TestOrcFileFormatScan,
81+
::testing::ValuesIn(TestFormatParams::Values()),
82+
TestFormatParams::ToTestNameString);
83+
84+
} // namespace dataset
85+
} // namespace arrow

0 commit comments

Comments
 (0)