Skip to content

Commit 4efb4e7

Browse files
committed
Implement expanding-peek logic, change signature of InputStream::Peak to be able to return Status
1 parent db1877e commit 4efb4e7

8 files changed

Lines changed: 78 additions & 46 deletions

File tree

cpp/src/arrow/io/buffered-test.cc

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,6 @@ TEST_F(TestBufferedInputStream, BasicOperation) {
343343

344344
// Nothing in the buffer
345345
ASSERT_EQ(0, buffered_->bytes_buffered());
346-
util::string_view peek = buffered_->Peek(10);
347-
ASSERT_EQ(0, peek.size());
348346

349347
std::vector<char> buf(test_data_.size());
350348
int64_t bytes_read;
@@ -355,6 +353,10 @@ TEST_F(TestBufferedInputStream, BasicOperation) {
355353
// 6 bytes remaining in buffer
356354
ASSERT_EQ(6, buffered_->bytes_buffered());
357355

356+
util::string_view peek;
357+
ASSERT_OK(buffered_->Peek(6, &peek));
358+
ASSERT_EQ(6, peek.size());
359+
358360
// Buffered position is 4
359361
ASSERT_OK(buffered_->Tell(&stream_position));
360362
ASSERT_EQ(4, stream_position);
@@ -363,11 +365,6 @@ TEST_F(TestBufferedInputStream, BasicOperation) {
363365
ASSERT_OK(raw_->Tell(&stream_position));
364366
ASSERT_EQ(10, stream_position);
365367

366-
// Peek does not look beyond end of buffer
367-
peek = buffered_->Peek(10);
368-
ASSERT_EQ(6, peek.size());
369-
ASSERT_EQ(0, memcmp(peek.data(), test_data_.data() + 4, 6));
370-
371368
// Reading to end of buffered bytes does not cause any more data to be
372369
// buffered
373370
ASSERT_OK(buffered_->Read(6, &bytes_read, buf.data()));
@@ -389,7 +386,7 @@ TEST_F(TestBufferedInputStream, BasicOperation) {
389386
ASSERT_EQ(test_data_.size(), stream_position);
390387

391388
// Peek at EOF
392-
peek = buffered_->Peek(10);
389+
ASSERT_OK(buffered_->Peek(10, &peek));
393390
ASSERT_EQ(0, peek.size());
394391

395392
// Calling Close closes raw_
@@ -484,20 +481,17 @@ TEST_F(TestBufferedInputStreamBound, Basics) {
484481
std::shared_ptr<Buffer> buffer;
485482
util::string_view view;
486483

487-
// Trigger buffering
488-
ASSERT_OK(stream_->Read(1, &buffer));
489-
490484
// source is at offset 10
491-
view = stream_->Peek(9);
492-
ASSERT_EQ(9, view.size());
493-
for (int i = 0; i < 9; i++) {
494-
ASSERT_EQ(11 + i, view[i]) << i;
485+
ASSERT_OK(stream_->Peek(10, &view));
486+
ASSERT_EQ(10, view.size());
487+
for (int i = 0; i < 10; i++) {
488+
ASSERT_EQ(10 + i, view[i]) << i;
495489
}
496490

497-
ASSERT_OK(stream_->Read(9, &buffer));
498-
ASSERT_EQ(9, buffer->size());
499-
for (int i = 0; i < 9; i++) {
500-
ASSERT_EQ(11 + i, (*buffer)[i]) << i;
491+
ASSERT_OK(stream_->Read(10, &buffer));
492+
ASSERT_EQ(10, buffer->size());
493+
for (int i = 0; i < 10; i++) {
494+
ASSERT_EQ(10 + i, (*buffer)[i]) << i;
501495
}
502496

503497
ASSERT_OK(stream_->Read(10, &buffer));
@@ -545,17 +539,14 @@ TEST_F(TestBufferedInputStreamBound, LargeFirstPeek) {
545539
int64_t n = 70;
546540
ASSERT_GT(n, chunk_size_);
547541

548-
// Trigger buffering
549-
ASSERT_OK(stream_->Read(1, &buffer));
550-
551-
// source is at offset 11
552-
view = stream_->Peek(n - 1);
542+
// source is at offset 10
543+
ASSERT_OK(stream_->Peek(n, &view));
553544
ASSERT_EQ(n, static_cast<int>(view.size()));
554-
for (int i = 0; i < n - 1; i++) {
555-
ASSERT_EQ(11 + i, view[i]) << i;
545+
for (int i = 0; i < n; i++) {
546+
ASSERT_EQ(10 + i, view[i]) << i;
556547
}
557548

558-
view = stream_->Peek(n);
549+
ASSERT_OK(stream_->Peek(n, &view));
559550
ASSERT_EQ(n, static_cast<int>(view.size()));
560551
for (int i = 0; i < n; i++) {
561552
ASSERT_EQ(10 + i, view[i]) << i;

cpp/src/arrow/io/buffered.cc

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,39 @@ class BufferedInputStream::Impl : public BufferedBase {
259259
return ResizeBuffer(new_buffer_size);
260260
}
261261

262-
util::string_view Peek(int64_t nbytes) const {
263-
int64_t peek_size = std::min(nbytes, bytes_buffered_);
264-
return util::string_view(reinterpret_cast<const char*>(buffer_data_ + buffer_pos_),
265-
static_cast<size_t>(peek_size));
262+
Status Peek(int64_t nbytes, util::string_view* out) {
263+
int64_t total_avail = bytes_buffered_;
264+
265+
if (raw_read_bound_ > 0) {
266+
total_avail += raw_read_bound_ - raw_read_total_;
267+
}
268+
// Do not try to peek more than the total remaining number of bytes.
269+
nbytes = std::min(nbytes, total_avail);
270+
271+
if (bytes_buffered_ == 0 && nbytes < buffer_size_) {
272+
// Pre-buffer for small reads
273+
RETURN_NOT_OK(BufferIfNeeded());
274+
}
275+
276+
// Increase the buffer size if needed
277+
if (nbytes > buffer_->size() - buffer_pos_) {
278+
RETURN_NOT_OK(SetBufferSize(nbytes + buffer_pos_));
279+
DCHECK(buffer_->size() - buffer_pos_ >= nbytes);
280+
}
281+
// Read more data when buffer has insufficient left
282+
if (nbytes > bytes_buffered_) {
283+
// Read as much as possible to fill the buffer, but not past stream end
284+
int64_t read_size = std::min(nbytes - bytes_buffered_, total_avail);
285+
int64_t bytes_read = -1;
286+
RETURN_NOT_OK(raw_->Read(read_size, &bytes_read,
287+
buffer_->mutable_data() + buffer_pos_ + bytes_buffered_));
288+
bytes_buffered_ += bytes_read;
289+
raw_read_total_ += bytes_read;
290+
}
291+
DCHECK(nbytes <= bytes_buffered_); // Enough bytes available
292+
*out = util::string_view(reinterpret_cast<const char*>(buffer_data_ + buffer_pos_),
293+
static_cast<size_t>(nbytes));
294+
return Status::OK();
266295
}
267296

268297
int64_t bytes_buffered() const { return bytes_buffered_; }
@@ -402,8 +431,8 @@ Status BufferedInputStream::Tell(int64_t* position) const {
402431
return impl_->Tell(position);
403432
}
404433

405-
util::string_view BufferedInputStream::Peek(int64_t nbytes) const {
406-
return impl_->Peek(nbytes);
434+
Status BufferedInputStream::Peek(int64_t nbytes, util::string_view* out) {
435+
return impl_->Peek(nbytes, out);
407436
}
408437

409438
Status BufferedInputStream::SetBufferSize(int64_t new_buffer_size) {

cpp/src/arrow/io/buffered.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,12 @@ class ARROW_EXPORT BufferedInputStream : public InputStream {
127127
std::shared_ptr<InputStream> raw() const;
128128

129129
// InputStream APIs
130-
util::string_view Peek(int64_t nbytes) const override;
130+
131+
/// \brief Return a zero-copy string view referencing buffered data,
132+
/// but do not advance the position of the stream. Buffers data and
133+
/// expands the buffer size if necessary
134+
Status Peek(int64_t nbytes, util::string_view* out) override;
135+
131136
Status Close() override;
132137
bool closed() const override;
133138

cpp/src/arrow/io/interfaces.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ Status InputStream::Advance(int64_t nbytes) {
3434
return Read(nbytes, &temp);
3535
}
3636

37-
util::string_view InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) const {
38-
return util::string_view(nullptr, 0);
37+
Status InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes), util::string_view* out) {
38+
*out = util::string_view(nullptr, 0);
39+
return Status::OK();
3940
}
4041

4142
bool InputStream::supports_zero_copy() const { return false; }

cpp/src/arrow/io/interfaces.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,14 @@ class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Re
122122
/// \return Status
123123
Status Advance(int64_t nbytes);
124124

125-
/// \brief Return string_view to any buffered bytes, up to the indicated
126-
/// number. View becomes invalid after any operation on file. If the
127-
/// InputStream is unbuffered, returns 0-length string_view
125+
/// \brief Return zero-copy string_view to upcoming bytes in the
126+
/// stream but do not modify stream position. View becomes invalid
127+
/// after any operation on file. If the InputStream is unbuffered,
128+
/// returns 0-length string_view. May trigger buffering if the
129+
/// requested size is larger than the number of buffered bytes
128130
/// \param[in] nbytes the maximum number of bytes to see
129131
/// \return arrow::util::string_view
130-
virtual util::string_view Peek(int64_t nbytes) const;
132+
virtual Status Peek(int64_t nbytes, util::string_view* out);
131133

132134
/// \brief Return true if InputStream is capable of zero copy Buffer reads
133135
virtual bool supports_zero_copy() const;

cpp/src/arrow/io/memory-test.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,14 @@ TEST(TestBufferReader, Peek) {
185185

186186
BufferReader reader(std::make_shared<Buffer>(data));
187187

188-
auto view = reader.Peek(4);
188+
util::string_view view;
189+
190+
ASSERT_OK(reader.Peek(4, &view));
189191

190192
ASSERT_EQ(4, view.size());
191193
ASSERT_EQ(data.substr(0, 4), view.to_string());
192194

193-
view = reader.Peek(20);
195+
ASSERT_OK(reader.Peek(20, &view));
194196
ASSERT_EQ(data.size(), view.size());
195197
ASSERT_EQ(data, view.to_string());
196198
}

cpp/src/arrow/io/memory.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -301,14 +301,16 @@ Status BufferReader::Tell(int64_t* position) const {
301301
return Status::OK();
302302
}
303303

304-
util::string_view BufferReader::Peek(int64_t nbytes) const {
304+
Status BufferReader::Peek(int64_t nbytes, util::string_view* out) {
305305
if (!is_open_) {
306-
return {};
306+
*out = {};
307+
return Status::OK();
307308
}
308309

309310
const int64_t bytes_available = std::min(nbytes, size_ - position_);
310-
return util::string_view(reinterpret_cast<const char*>(data_) + position_,
311+
*out = util::string_view(reinterpret_cast<const char*>(data_) + position_,
311312
static_cast<size_t>(bytes_available));
313+
return Status::OK();
312314
}
313315

314316
bool BufferReader::supports_zero_copy() const { return true; }

cpp/src/arrow/io/memory.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ class ARROW_EXPORT BufferReader : public RandomAccessFile {
152152
// Zero copy read
153153
Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override;
154154

155-
util::string_view Peek(int64_t nbytes) const override;
155+
Status Peek(int64_t nbytes, util::string_view* out) override;
156156

157157
bool supports_zero_copy() const override;
158158

0 commit comments

Comments
 (0)