Skip to content

Commit fcde1eb

Browse files
wesmkounealrichardson
committed
ARROW-7979: [C++] Add experimental buffer compression to IPC write path. Add "field" selection to read path. Migrate some APIs to Result<T>. Read/write Message metadata
I apologize for the complexity of this patch, there was some necessary refactoring and then as I needed to add a `IpcReadOptions` struct, it made sense to port some APIs from Status to Result. Summary of what's here * Add IpcReadOptions struct * Rename IpcOptions to IpcWriteOptions * Serialize and write custom_metadata field in Message Flatbuffer struct * Move `MemoryPool*` arguments in IPC functions to options structs * Adds EXPERIMENTAL compression option to IpcOptions (which should be renamed IpcWriteOptions) which causes each buffer in a record batch body to be separately compressed with e.g. ZSTD. This is intended to be used internally in Feather V2 and not exported for general use. Based on the mailing list discussion, if we were to adopt this in the IPC protocol then this can be promoted to non-experimental * Add "included_fields" option to select a subset of fields to load from a record batch when doing an IPC load. The motivation for this was to avoid unnecessary decompression when reading a subset of columns from an IPC stream * Migrate most IPC read APIs to use Result, deprecate old methods * Deprecate Status-returning IPC write method Some other small changes: * Add `check_metadata` option to `RecordBatch::Equals` * Add `Codec::GetCompressionType` method for looking up `Compression::type` given a codec name I don't have size/perf benchmarks about how the compression helps with file sizes or read performance, so I'll do that next in the course of completing ARROW-5510 ("Feather V2" file format) Closes apache#6638 from wesm/ARROW-7979 Lead-authored-by: Wes McKinney <wesm+git@apache.org> Co-authored-by: Sutou Kouhei <kou@clear-code.com> Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com> Signed-off-by: Wes McKinney <wesm+git@apache.org>
1 parent 50c5daf commit fcde1eb

64 files changed

Lines changed: 1759 additions & 1295 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

c_glib/arrow-glib/reader.cpp

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -296,14 +296,12 @@ GArrowRecordBatchStreamReader *
296296
garrow_record_batch_stream_reader_new(GArrowInputStream *stream,
297297
GError **error)
298298
{
299-
using BaseType = arrow::ipc::RecordBatchReader;
300299
using ReaderType = arrow::ipc::RecordBatchStreamReader;
301300

302301
auto arrow_input_stream = garrow_input_stream_get_raw(stream);
303-
std::shared_ptr<BaseType> arrow_reader;
304-
auto status = ReaderType::Open(arrow_input_stream, &arrow_reader);
305-
if (garrow_error_check(error, status, "[record-batch-stream-reader][open]")) {
306-
auto subtype = std::dynamic_pointer_cast<ReaderType>(arrow_reader);
302+
auto arrow_reader = ReaderType::Open(arrow_input_stream);
303+
if (garrow::check(error, arrow_reader, "[record-batch-stream-reader][open]")) {
304+
auto subtype = std::dynamic_pointer_cast<ReaderType>(*arrow_reader);
307305
return garrow_record_batch_stream_reader_new_raw(&subtype);
308306
} else {
309307
return NULL;
@@ -411,14 +409,13 @@ GArrowRecordBatchFileReader *
411409
garrow_record_batch_file_reader_new(GArrowSeekableInputStream *file,
412410
GError **error)
413411
{
414-
auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(file);
412+
using ReaderType = arrow::ipc::RecordBatchFileReader;
415413

416-
std::shared_ptr<arrow::ipc::RecordBatchFileReader> arrow_reader;
417-
auto status =
418-
arrow::ipc::RecordBatchFileReader::Open(arrow_random_access_file,
419-
&arrow_reader);
420-
if (garrow_error_check(error, status, "[record-batch-file-reader][open]")) {
421-
return garrow_record_batch_file_reader_new_raw(&arrow_reader);
414+
auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(file);
415+
auto arrow_reader = ReaderType::Open(arrow_random_access_file);
416+
if (garrow::check(error, arrow_reader, "[record-batch-file-reader][open]")) {
417+
auto subtype = std::dynamic_pointer_cast<ReaderType>(*arrow_reader);
418+
return garrow_record_batch_file_reader_new_raw(&subtype);
422419
} else {
423420
return NULL;
424421
}

c_glib/arrow-glib/writer.cpp

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -238,17 +238,15 @@ garrow_record_batch_stream_writer_new(GArrowOutputStream *sink,
238238
GArrowSchema *schema,
239239
GError **error)
240240
{
241-
using BaseType = arrow::ipc::RecordBatchWriter;
242-
using WriterType = arrow::ipc::RecordBatchStreamWriter;
243-
244241
auto arrow_sink = garrow_output_stream_get_raw(sink).get();
245-
std::shared_ptr<BaseType> arrow_writer;
246-
auto status = WriterType::Open(arrow_sink,
247-
garrow_schema_get_raw(schema),
248-
&arrow_writer);
249-
if (garrow_error_check(error, status, "[record-batch-stream-writer][open]")) {
250-
auto subtype = std::dynamic_pointer_cast<WriterType>(arrow_writer);
251-
return garrow_record_batch_stream_writer_new_raw(&subtype);
242+
auto arrow_schema = garrow_schema_get_raw(schema);
243+
auto arrow_writer_result =
244+
arrow::ipc::NewStreamWriter(arrow_sink, arrow_schema);
245+
if (garrow::check(error,
246+
arrow_writer_result,
247+
"[record-batch-stream-writer][open]")) {
248+
auto arrow_writer = *arrow_writer_result;
249+
return garrow_record_batch_stream_writer_new_raw(&arrow_writer);
252250
} else {
253251
return NULL;
254252
}
@@ -285,17 +283,16 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
285283
GArrowSchema *schema,
286284
GError **error)
287285
{
288-
using BaseType = arrow::ipc::RecordBatchWriter;
289-
using WriterType = arrow::ipc::RecordBatchFileWriter;
290-
291-
auto arrow_sink = garrow_output_stream_get_raw(sink);
292-
std::shared_ptr<BaseType> arrow_writer;
293-
auto status = WriterType::Open(arrow_sink.get(),
294-
garrow_schema_get_raw(schema),
295-
&arrow_writer);
296-
if (garrow_error_check(error, status, "[record-batch-file-writer][open]")) {
297-
auto subtype = std::dynamic_pointer_cast<WriterType>(arrow_writer);
298-
return garrow_record_batch_file_writer_new_raw(&subtype);
286+
auto arrow_sink = garrow_output_stream_get_raw(sink).get();
287+
auto arrow_schema = garrow_schema_get_raw(schema);
288+
std::shared_ptr<arrow::ipc::RecordBatchWriter> arrow_writer;
289+
auto arrow_writer_result =
290+
arrow::ipc::NewFileWriter(arrow_sink, arrow_schema);
291+
if (garrow::check(error,
292+
arrow_writer_result,
293+
"[record-batch-file-writer][open]")) {
294+
auto arrow_writer = *arrow_writer_result;
295+
return garrow_record_batch_file_writer_new_raw(&arrow_writer);
299296
} else {
300297
return NULL;
301298
}
@@ -529,7 +526,7 @@ garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer)
529526
}
530527

531528
GArrowRecordBatchStreamWriter *
532-
garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchStreamWriter> *arrow_writer)
529+
garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer)
533530
{
534531
auto writer =
535532
GARROW_RECORD_BATCH_STREAM_WRITER(
@@ -540,7 +537,7 @@ garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatc
540537
}
541538

542539
GArrowRecordBatchFileWriter *
543-
garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchFileWriter> *arrow_writer)
540+
garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer)
544541
{
545542
auto writer =
546543
GARROW_RECORD_BATCH_FILE_WRITER(

c_glib/arrow-glib/writer.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@
2828
GArrowRecordBatchWriter *garrow_record_batch_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
2929
std::shared_ptr<arrow::ipc::RecordBatchWriter> garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer);
3030

31-
GArrowRecordBatchStreamWriter *garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchStreamWriter> *arrow_writer);
31+
GArrowRecordBatchStreamWriter *garrow_record_batch_stream_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
3232

33-
GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchFileWriter> *arrow_writer);
33+
GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
3434

3535
GArrowFeatherFileWriter *garrow_feather_file_writer_new_raw(arrow::ipc::feather::TableWriter *arrow_writer);
3636
arrow::ipc::feather::TableWriter *garrow_feather_file_writer_get_raw(GArrowFeatherFileWriter *writer);

cpp/src/arrow/array.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
#pragma once
1919

20-
#include <atomic>
20+
#include <atomic> // IWYU pragma: export
2121
#include <cstdint>
2222
#include <iosfwd>
2323
#include <memory>

cpp/src/arrow/compute/kernel.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -273,15 +273,24 @@ class ARROW_EXPORT BinaryKernel : public OpKernel {
273273
Datum* out) = 0;
274274
};
275275

276+
// TODO doxygen 1.8.16 does not like the following code
277+
///@cond INTERNAL
278+
276279
static inline bool CollectionEquals(const std::vector<Datum>& left,
277280
const std::vector<Datum>& right) {
278-
if (left.size() != right.size()) return false;
279-
280-
for (size_t i = 0; i < left.size(); i++)
281-
if (!left[i].Equals(right[i])) return false;
281+
if (left.size() != right.size()) {
282+
return false;
283+
}
282284

285+
for (size_t i = 0; i < left.size(); i++) {
286+
if (!left[i].Equals(right[i])) {
287+
return false;
288+
}
289+
}
283290
return true;
284291
}
285292

293+
///@endcond
294+
286295
} // namespace compute
287296
} // namespace arrow

cpp/src/arrow/dataset/file_ipc.cc

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,14 @@
1919

2020
#include <algorithm>
2121
#include <memory>
22-
#include <unordered_set>
2322
#include <utility>
2423
#include <vector>
2524

2625
#include "arrow/dataset/dataset_internal.h"
2726
#include "arrow/dataset/file_base.h"
28-
#include "arrow/dataset/filter.h"
2927
#include "arrow/dataset/scanner.h"
3028
#include "arrow/ipc/reader.h"
29+
#include "arrow/ipc/writer.h"
3130
#include "arrow/util/iterator.h"
3231

3332
namespace arrow {
@@ -40,12 +39,11 @@ Result<std::shared_ptr<ipc::RecordBatchFileReader>> OpenReader(
4039
}
4140

4241
std::shared_ptr<ipc::RecordBatchFileReader> reader;
43-
auto status = ipc::RecordBatchFileReader::Open(std::move(input), &reader);
42+
auto status = ipc::RecordBatchFileReader::Open(std::move(input)).Value(&reader);
4443
if (!status.ok()) {
4544
return status.WithMessage("Could not open IPC input source '", source.path(),
4645
"': ", status.message());
4746
}
48-
4947
return reader;
5048
}
5149

@@ -161,10 +159,8 @@ Result<std::shared_ptr<WriteTask>> IpcFileFormat::WriteFragment(
161159
RETURN_NOT_OK(CreateDestinationParentDir());
162160

163161
ARROW_ASSIGN_OR_RAISE(auto out_stream, destination_.OpenWritable());
164-
165-
ARROW_ASSIGN_OR_RAISE(auto writer, ipc::RecordBatchFileWriter::Open(
166-
out_stream.get(), fragment_->schema()));
167-
162+
ARROW_ASSIGN_OR_RAISE(auto writer,
163+
ipc::NewFileWriter(out_stream.get(), fragment_->schema()));
168164
ARROW_ASSIGN_OR_RAISE(auto scan_task_it, fragment_->Scan(scan_context_));
169165

170166
for (auto maybe_scan_task : scan_task_it) {

cpp/src/arrow/dataset/file_ipc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919

2020
#include <memory>
2121
#include <string>
22-
#include <utility>
2322

2423
#include "arrow/dataset/file_base.h"
2524
#include "arrow/dataset/type_fwd.h"
2625
#include "arrow/dataset/visibility.h"
26+
#include "arrow/result.h"
2727

2828
namespace arrow {
2929
namespace dataset {

cpp/src/arrow/dataset/file_ipc_test.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ class ArrowIpcWriterMixin : public ::testing::Test {
4646
std::shared_ptr<Buffer> Write(RecordBatchReader* reader) {
4747
EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create());
4848

49-
EXPECT_OK_AND_ASSIGN(auto writer,
50-
ipc::RecordBatchFileWriter::Open(sink.get(), reader->schema()));
49+
EXPECT_OK_AND_ASSIGN(auto writer, ipc::NewFileWriter(sink.get(), reader->schema()));
5150

5251
std::vector<std::shared_ptr<RecordBatch>> batches;
5352
ARROW_EXPECT_OK(reader->ReadAll(&batches));
@@ -63,9 +62,7 @@ class ArrowIpcWriterMixin : public ::testing::Test {
6362

6463
std::shared_ptr<Buffer> Write(const Table& table) {
6564
EXPECT_OK_AND_ASSIGN(auto sink, io::BufferOutputStream::Create());
66-
67-
EXPECT_OK_AND_ASSIGN(auto writer,
68-
ipc::RecordBatchFileWriter::Open(sink.get(), table.schema()));
65+
EXPECT_OK_AND_ASSIGN(auto writer, ipc::NewFileWriter(sink.get(), table.schema()));
6966

7067
ARROW_EXPECT_OK(writer->WriteTable(table));
7168

cpp/src/arrow/extension_type_test.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -218,14 +218,14 @@ TEST_F(TestExtensionType, ExtensionTypeTest) {
218218
auto RoundtripBatch = [](const std::shared_ptr<RecordBatch>& batch,
219219
std::shared_ptr<RecordBatch>* out) {
220220
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
221-
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(),
221+
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
222222
out_stream.get()));
223223

224224
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
225225

226226
io::BufferReader reader(complete_ipc_stream);
227227
std::shared_ptr<RecordBatchReader> batch_reader;
228-
ASSERT_OK(ipc::RecordBatchStreamReader::Open(&reader, &batch_reader));
228+
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
229229
ASSERT_OK(batch_reader->ReadNext(out));
230230
};
231231

@@ -256,7 +256,7 @@ TEST_F(TestExtensionType, UnrecognizedExtension) {
256256
// Write full IPC stream including schema, then unregister type, then read
257257
// and ensure that a plain instance of the storage type is created
258258
ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create());
259-
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcOptions::Defaults(),
259+
ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(),
260260
out_stream.get()));
261261

262262
ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish());
@@ -270,7 +270,7 @@ TEST_F(TestExtensionType, UnrecognizedExtension) {
270270

271271
io::BufferReader reader(complete_ipc_stream);
272272
std::shared_ptr<RecordBatchReader> batch_reader;
273-
ASSERT_OK(ipc::RecordBatchStreamReader::Open(&reader, &batch_reader));
273+
ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader));
274274
std::shared_ptr<RecordBatch> read_batch;
275275
ASSERT_OK(batch_reader->ReadNext(&read_batch));
276276
CompareBatch(*batch_no_ext, *read_batch);

cpp/src/arrow/flight/client.cc

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ class GrpcStreamReader : public FlightStreamReader {
265265

266266
private:
267267
friend class GrpcIpcMessageReader;
268-
std::unique_ptr<ipc::RecordBatchReader> batch_reader_;
268+
std::shared_ptr<ipc::RecordBatchReader> batch_reader_;
269269
std::shared_ptr<Buffer> last_app_metadata_;
270270
std::shared_ptr<ClientRpc> rpc_;
271271
};
@@ -327,8 +327,8 @@ Status GrpcStreamReader::Open(std::unique_ptr<ClientRpc> rpc,
327327
out->get()->rpc_ = std::move(rpc);
328328
std::unique_ptr<GrpcIpcMessageReader> message_reader(
329329
new GrpcIpcMessageReader(out->get(), out->get()->rpc_, std::move(stream)));
330-
return ipc::RecordBatchStreamReader::Open(std::move(message_reader),
331-
&(*out)->batch_reader_);
330+
return (ipc::RecordBatchStreamReader::Open(std::move(message_reader))
331+
.Value(&(*out)->batch_reader_));
332332
}
333333

334334
std::shared_ptr<Schema> GrpcStreamReader::schema() const {
@@ -385,9 +385,6 @@ class GrpcStreamWriter : public FlightStreamWriter {
385385
}
386386
return Status::OK();
387387
}
388-
void set_memory_pool(MemoryPool* pool) override {
389-
batch_writer_->set_memory_pool(pool);
390-
}
391388
Status Close() override { return batch_writer_->Close(); }
392389

393390
private:

0 commit comments

Comments
 (0)