Skip to content

Commit 7029f90

Browse files
committed
ARROW-14577: [C++] Enable fine grained IO for async IPC reader
In the end this PR only addresses projection pushdown on the asynchronous path. The approach laid out here could also be used on the synchronous path to offer pre-buffering; benchmarks should be run to see whether that is faster. The memory-mapped file "will need" issue should be solved by making auto-will-need a property of the memory-mapped file and is not addressed here. Closes apache#11616 from westonpace/feature/ARROW-14577--simplify-ipc-reader-add-cache Authored-by: Weston Pace <weston.pace@gmail.com> Signed-off-by: Weston Pace <weston.pace@gmail.com>
1 parent f585a47 commit 7029f90

10 files changed

Lines changed: 715 additions & 70 deletions

File tree

cpp/src/arrow/io/caching.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,8 @@ struct RangeCacheEntry {
144144
};
145145

146146
struct ReadRangeCache::Impl {
147-
std::shared_ptr<RandomAccessFile> file;
147+
std::shared_ptr<RandomAccessFile> owned_file;
148+
RandomAccessFile* file;
148149
IOContext ctx;
149150
CacheOptions options;
150151

@@ -289,10 +290,12 @@ struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
289290
}
290291
};
291292

292-
ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
293+
ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> owned_file,
294+
RandomAccessFile* file, IOContext ctx,
293295
CacheOptions options)
294296
: impl_(options.lazy ? new LazyImpl() : new Impl()) {
295-
impl_->file = std::move(file);
297+
impl_->owned_file = std::move(owned_file);
298+
impl_->file = file;
296299
impl_->ctx = std::move(ctx);
297300
impl_->options = options;
298301
}

cpp/src/arrow/io/caching.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,17 @@ class ARROW_EXPORT ReadRangeCache {
104104

105105
/// Construct a read cache with default options
106106
explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
107-
: ReadRangeCache(file, std::move(ctx), CacheOptions::Defaults()) {}
107+
: ReadRangeCache(file, file.get(), std::move(ctx), CacheOptions::Defaults()) {}
108108

109109
/// Construct a read cache with given options
110110
explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
111-
CacheOptions options);
111+
CacheOptions options)
112+
: ReadRangeCache(file, file.get(), ctx, options) {}
113+
114+
/// Construct a read cache with an unowned file
115+
ReadRangeCache(RandomAccessFile* file, IOContext ctx, CacheOptions options)
116+
: ReadRangeCache(NULLPTR, file, ctx, options) {}
117+
112118
~ReadRangeCache();
113119

114120
/// \brief Cache the given ranges in the background.
@@ -130,6 +136,9 @@ class ARROW_EXPORT ReadRangeCache {
130136
struct Impl;
131137
struct LazyImpl;
132138

139+
ReadRangeCache(std::shared_ptr<RandomAccessFile> owned_file, RandomAccessFile* file,
140+
IOContext ctx, CacheOptions options);
141+
133142
std::unique_ptr<Impl> impl_;
134143
};
135144

cpp/src/arrow/ipc/message.cc

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,50 @@ Status ReadFieldsSubset(int64_t offset, int32_t metadata_length,
311311
return Status::OK();
312312
}
313313

314+
Result<std::unique_ptr<Message>> ReadMessage(std::shared_ptr<Buffer> metadata,
315+
std::shared_ptr<Buffer> body) {
316+
std::unique_ptr<Message> result;
317+
auto listener = std::make_shared<AssignMessageDecoderListener>(&result);
318+
// If the user does not pass in a body buffer then we assume they are skipping it
319+
MessageDecoder decoder(listener, default_memory_pool(), body == nullptr);
320+
321+
if (metadata->size() < decoder.next_required_size()) {
322+
return Status::Invalid("metadata_length should be at least ",
323+
decoder.next_required_size());
324+
}
325+
326+
ARROW_RETURN_NOT_OK(decoder.Consume(metadata));
327+
328+
switch (decoder.state()) {
329+
case MessageDecoder::State::INITIAL:
330+
// Metadata did not request a body so we better not have provided one
331+
DCHECK_EQ(body, nullptr);
332+
return std::move(result);
333+
case MessageDecoder::State::METADATA_LENGTH:
334+
return Status::Invalid("metadata length is missing from the metadata buffer");
335+
case MessageDecoder::State::METADATA:
336+
return Status::Invalid("flatbuffer size ", decoder.next_required_size(),
337+
" invalid. Buffer size: ", metadata->size());
338+
case MessageDecoder::State::BODY: {
339+
if (body == nullptr) {
340+
// Caller didn't give a body so just give them a message without body
341+
return std::move(result);
342+
}
343+
if (body->size() != decoder.next_required_size()) {
344+
return Status::IOError("Expected body buffer to be ",
345+
decoder.next_required_size(),
346+
" bytes for message body, got ", body->size());
347+
}
348+
RETURN_NOT_OK(decoder.Consume(body));
349+
return std::move(result);
350+
}
351+
case MessageDecoder::State::EOS:
352+
return Status::Invalid("Unexpected empty message in IPC file format");
353+
default:
354+
return Status::Invalid("Unexpected state: ", decoder.state());
355+
}
356+
}
357+
314358
Result<std::unique_ptr<Message>> ReadMessage(int64_t offset, int32_t metadata_length,
315359
io::RandomAccessFile* file,
316360
const FieldsLoaderFunction& fields_loader) {
@@ -560,14 +604,15 @@ class MessageDecoder::MessageDecoderImpl {
560604
public:
561605
explicit MessageDecoderImpl(std::shared_ptr<MessageDecoderListener> listener,
562606
State initial_state, int64_t initial_next_required_size,
563-
MemoryPool* pool)
607+
MemoryPool* pool, bool skip_body)
564608
: listener_(std::move(listener)),
565609
pool_(pool),
566610
state_(initial_state),
567611
next_required_size_(initial_next_required_size),
568612
chunks_(),
569613
buffered_size_(0),
570-
metadata_(nullptr) {}
614+
metadata_(nullptr),
615+
skip_body_(skip_body) {}
571616

572617
Status ConsumeData(const uint8_t* data, int64_t size) {
573618
if (buffered_size_ == 0) {
@@ -798,7 +843,7 @@ class MessageDecoder::MessageDecoderImpl {
798843
RETURN_NOT_OK(CheckMetadataAndGetBodyLength(*metadata_, &body_length));
799844

800845
state_ = State::BODY;
801-
next_required_size_ = body_length;
846+
next_required_size_ = skip_body_ ? 0 : body_length;
802847
RETURN_NOT_OK(listener_->OnBody());
803848
if (next_required_size_ == 0) {
804849
ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(0, pool_));
@@ -894,19 +939,21 @@ class MessageDecoder::MessageDecoderImpl {
894939
std::vector<std::shared_ptr<Buffer>> chunks_;
895940
int64_t buffered_size_;
896941
std::shared_ptr<Buffer> metadata_; // Must be CPU buffer
942+
bool skip_body_;
897943
};
898944

899945
MessageDecoder::MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
900-
MemoryPool* pool) {
946+
MemoryPool* pool, bool skip_body) {
901947
impl_.reset(new MessageDecoderImpl(std::move(listener), State::INITIAL,
902-
kMessageDecoderNextRequiredSizeInitial, pool));
948+
kMessageDecoderNextRequiredSizeInitial, pool,
949+
skip_body));
903950
}
904951

905952
MessageDecoder::MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
906953
State initial_state, int64_t initial_next_required_size,
907-
MemoryPool* pool) {
954+
MemoryPool* pool, bool skip_body) {
908955
impl_.reset(new MessageDecoderImpl(std::move(listener), initial_state,
909-
initial_next_required_size, pool));
956+
initial_next_required_size, pool, skip_body));
910957
}
911958

912959
MessageDecoder::~MessageDecoder() {}

cpp/src/arrow/ipc/message.h

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,11 @@ class ARROW_EXPORT MessageDecoder {
266266
/// \param[in] listener a MessageDecoderListener that responds to events from
267267
/// the decoder
268268
/// \param[in] pool an optional MemoryPool to copy metadata on the
269270
/// CPU, if required
269+
/// \param[in] skip_body if true the body will be skipped even if the message has a body
270271
explicit MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
271-
MemoryPool* pool = default_memory_pool());
272+
MemoryPool* pool = default_memory_pool(),
273+
bool skip_body = false);
272274

273275
/// \brief Construct a message decoder with the specified state.
274276
///
@@ -282,9 +284,10 @@ class ARROW_EXPORT MessageDecoder {
282284
/// to run the next action
283285
/// \param[in] pool an optional MemoryPool to copy metadata on the
284286
/// CPU, if required
287+
/// \param[in] skip_body if true the body will be skipped even if the message has a body
285288
MessageDecoder(std::shared_ptr<MessageDecoderListener> listener, State initial_state,
286289
int64_t initial_next_required_size,
287-
MemoryPool* pool = default_memory_pool());
290+
MemoryPool* pool = default_memory_pool(), bool skip_body = false);
288291

289292
virtual ~MessageDecoder();
290293

@@ -466,6 +469,25 @@ Result<std::unique_ptr<Message>> ReadMessage(
466469
const int64_t offset, const int32_t metadata_length, io::RandomAccessFile* file,
467470
const FieldsLoaderFunction& fields_loader = {});
468471

472+
/// \brief Read encapsulated IPC message from cached buffers
473+
///
474+
/// The buffers should contain an entire message. Partial reads are not handled.
475+
///
476+
/// This method can be used to read just the metadata by passing in a nullptr for the
477+
/// body. The body will then be skipped and the body size will not be validated.
478+
///
479+
/// If the body buffer is provided then it must be the complete body buffer.
480+
///
481+
/// This is similar to Message::Open but performs slightly more validation (e.g. checks
482+
/// to see that the metadata length is correct and that the body is the size the metadata
483+
/// expected)
484+
///
485+
/// \param metadata The bytes for the metadata
486+
/// \param body The bytes for the body
487+
/// \return The message represented by the buffers
488+
ARROW_EXPORT Result<std::unique_ptr<Message>> ReadMessage(
489+
std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
490+
469491
ARROW_EXPORT
470492
Future<std::shared_ptr<Message>> ReadMessageAsync(
471493
const int64_t offset, const int32_t metadata_length, const int64_t body_length,

cpp/src/arrow/ipc/options.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <cstdint>
2121
#include <vector>
2222

23+
#include "arrow/io/caching.h"
2324
#include "arrow/ipc/type_fwd.h"
2425
#include "arrow/status.h"
2526
#include "arrow/type_fwd.h"
@@ -148,6 +149,11 @@ struct ARROW_EXPORT IpcReadOptions {
148149
/// RecordBatchStreamReader and StreamDecoder classes.
149150
bool ensure_native_endian = true;
150151

152+
/// \brief Options to control caching behavior when pre-buffering is requested
153+
///
154+
/// The lazy property will always be reset to true to deliver the expected behavior
155+
io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults();
156+
151157
static IpcReadOptions Defaults();
152158
};
153159

0 commit comments

Comments
 (0)