Skip to content

Commit 2c808a2

Browse files
committed
ARROW-6180: [C++][Parquet] Add RandomAccessFile::GetStream that returns InputStream that reads a file segment independent of the file's state, fix concurrent buffered Parquet column reads
This enables different functions to read portions of a `RandomAccessFile` as an InputStream without interfering with each other. This also addresses PARQUET-1636 and adds a unit test for buffered column chunk reads. In the refactor to use the Arrow IO interfaces, I broke this by allowing the raw RandomAccessFile to be passed into multiple `BufferedInputStream` at once, so the file position was being manipulated by different column readers. We didn't catch the problem because we didn't have any unit tests, so this patch addresses that deficiency. Closes apache#5085 from wesm/ARROW-6180 and squashes the following commits: e4ad370 <Wes McKinney> Code review comments 2645bec <Wes McKinney> Add unit test that exhibits PARQUET-1636 76dc71c <Wes McKinney> stub 3eb0136 <Wes McKinney> Finish basic unit tests 4fd3d61 <Wes McKinney> Start implementation Authored-by: Wes McKinney <wesm+git@apache.org> Signed-off-by: Wes McKinney <wesm+git@apache.org>
1 parent 4e51f98 commit 2c808a2

7 files changed

Lines changed: 262 additions & 19 deletions

File tree

cpp/src/arrow/io/interfaces.cc

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,15 @@
1717

1818
#include "arrow/io/interfaces.h"
1919

20+
#include <algorithm>
2021
#include <cstdint>
2122
#include <memory>
2223
#include <mutex>
24+
#include <utility>
2325

26+
#include "arrow/buffer.h"
2427
#include "arrow/status.h"
28+
#include "arrow/util/logging.h"
2529
#include "arrow/util/string_view.h"
2630

2731
namespace arrow {
@@ -70,5 +74,67 @@ Status Writable::Write(const std::string& data) {
7074

7175
Status Writable::Flush() { return Status::OK(); }
7276

77+
class FileSegmentReader : public InputStream {
78+
public:
79+
FileSegmentReader(std::shared_ptr<RandomAccessFile> file, int64_t file_offset,
80+
int64_t nbytes)
81+
: file_(std::move(file)),
82+
closed_(false),
83+
position_(0),
84+
file_offset_(file_offset),
85+
nbytes_(nbytes) {
86+
FileInterface::set_mode(FileMode::READ);
87+
}
88+
89+
Status CheckOpen() const {
90+
if (closed_) {
91+
return Status::IOError("Stream is closed");
92+
}
93+
return Status::OK();
94+
}
95+
96+
Status Close() override {
97+
closed_ = true;
98+
return Status::OK();
99+
}
100+
101+
Status Tell(int64_t* position) const override {
102+
RETURN_NOT_OK(CheckOpen());
103+
*position = position_;
104+
return Status::OK();
105+
}
106+
107+
bool closed() const override { return closed_; }
108+
109+
Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override {
110+
RETURN_NOT_OK(CheckOpen());
111+
int64_t bytes_to_read = std::min(nbytes, nbytes_ - position_);
112+
RETURN_NOT_OK(
113+
file_->ReadAt(file_offset_ + position_, bytes_to_read, bytes_read, out));
114+
position_ += *bytes_read;
115+
return Status::OK();
116+
}
117+
118+
Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override {
119+
RETURN_NOT_OK(CheckOpen());
120+
int64_t bytes_to_read = std::min(nbytes, nbytes_ - position_);
121+
RETURN_NOT_OK(file_->ReadAt(file_offset_ + position_, bytes_to_read, out));
122+
position_ += (*out)->size();
123+
return Status::OK();
124+
}
125+
126+
private:
127+
std::shared_ptr<RandomAccessFile> file_;
128+
bool closed_;
129+
int64_t position_;
130+
int64_t file_offset_;
131+
int64_t nbytes_;
132+
};
133+
134+
std::shared_ptr<InputStream> RandomAccessFile::GetStream(
135+
std::shared_ptr<RandomAccessFile> file, int64_t file_offset, int64_t nbytes) {
136+
return std::make_shared<FileSegmentReader>(std::move(file), file_offset, nbytes);
137+
}
138+
73139
} // namespace io
74140
} // namespace arrow

cpp/src/arrow/io/interfaces.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
144144
/// Necessary because we hold a std::unique_ptr
145145
~RandomAccessFile() override;
146146

147+
/// \brief Create an isolated InputStream that reads a segment of a
148+
/// RandomAccessFile. Multiple such stream can be created and used
149+
/// independently without interference
150+
/// \param[in] file a file instance
151+
/// \param[in] file_offset the starting position in the file
152+
/// \param[in] nbytes the extent of bytes to read. The file should have
153+
/// sufficient bytes available
154+
static std::shared_ptr<InputStream> GetStream(std::shared_ptr<RandomAccessFile> file,
155+
int64_t file_offset, int64_t nbytes);
156+
147157
virtual Status GetSize(int64_t* size) = 0;
148158

149159
/// \brief Read nbytes at position, provide default implementations using

cpp/src/arrow/io/memory-test.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,73 @@ TEST(TestBufferReader, RetainParentReference) {
218218
ASSERT_EQ(0, std::memcmp(slice2->data(), data.c_str() + 4, 6));
219219
}
220220

221+
TEST(TestRandomAccessFile, GetStream) {
222+
std::string data = "data1data2data3data4data5";
223+
224+
auto buf = std::make_shared<Buffer>(data);
225+
auto file = std::make_shared<BufferReader>(buf);
226+
227+
std::shared_ptr<InputStream> stream1, stream2;
228+
229+
stream1 = RandomAccessFile::GetStream(file, 0, 10);
230+
stream2 = RandomAccessFile::GetStream(file, 9, 16);
231+
232+
int64_t position = -1;
233+
ASSERT_OK(stream1->Tell(&position));
234+
ASSERT_EQ(0, position);
235+
236+
std::shared_ptr<Buffer> buf2;
237+
uint8_t buf3[20];
238+
239+
int64_t bytes_read = -1;
240+
ASSERT_OK(stream2->Read(4, &bytes_read, buf3));
241+
ASSERT_EQ(4, bytes_read);
242+
ASSERT_EQ(0, std::memcmp(buf3, "2dat", 4));
243+
ASSERT_OK(stream2->Tell(&position));
244+
ASSERT_EQ(4, position);
245+
246+
ASSERT_OK(stream1->Read(6, &bytes_read, buf3));
247+
ASSERT_EQ(6, bytes_read);
248+
ASSERT_EQ(0, std::memcmp(buf3, "data1d", 6));
249+
ASSERT_OK(stream1->Tell(&position));
250+
ASSERT_EQ(6, position);
251+
252+
ASSERT_OK(stream1->Read(2, &buf2));
253+
ASSERT_TRUE(SliceBuffer(buf, 6, 2)->Equals(*buf2));
254+
255+
// Read to end of each stream
256+
ASSERT_OK(stream1->Read(4, &bytes_read, buf3));
257+
ASSERT_EQ(2, bytes_read);
258+
ASSERT_EQ(0, std::memcmp(buf3, "a2", 2));
259+
ASSERT_OK(stream1->Tell(&position));
260+
ASSERT_EQ(10, position);
261+
262+
ASSERT_OK(stream1->Read(1, &bytes_read, buf3));
263+
ASSERT_EQ(0, bytes_read);
264+
ASSERT_OK(stream1->Tell(&position));
265+
ASSERT_EQ(10, position);
266+
267+
// stream2 had its extent limited
268+
ASSERT_OK(stream2->Read(20, &buf2));
269+
ASSERT_TRUE(SliceBuffer(buf, 13, 12)->Equals(*buf2));
270+
271+
ASSERT_OK(stream2->Read(1, &buf2));
272+
ASSERT_EQ(0, buf2->size());
273+
ASSERT_OK(stream2->Tell(&position));
274+
ASSERT_EQ(16, position);
275+
276+
ASSERT_OK(stream1->Close());
277+
278+
// idempotent
279+
ASSERT_OK(stream1->Close());
280+
ASSERT_TRUE(stream1->closed());
281+
282+
// Check whether closed
283+
ASSERT_RAISES(IOError, stream1->Tell(&position));
284+
ASSERT_RAISES(IOError, stream1->Read(1, &buf2));
285+
ASSERT_RAISES(IOError, stream1->Read(1, &bytes_read, buf3));
286+
}
287+
221288
TEST(TestMemcopy, ParallelMemcopy) {
222289
#if defined(ARROW_VALGRIND)
223290
// Compensate for Valgrind's slowness

cpp/src/arrow/testing/random.h

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class ARROW_EXPORT RandomArrayGenerator {
5050
///
5151
/// \return a generated Array
5252
std::shared_ptr<arrow::Array> Boolean(int64_t size, double probability,
53-
double null_probability);
53+
double null_probability = 0);
5454

5555
/// \brief Generates a random UInt8Array
5656
///
@@ -61,7 +61,7 @@ class ARROW_EXPORT RandomArrayGenerator {
6161
///
6262
/// \return a generated Array
6363
std::shared_ptr<arrow::Array> UInt8(int64_t size, uint8_t min, uint8_t max,
64-
double null_probability);
64+
double null_probability = 0);
6565

6666
/// \brief Generates a random Int8Array
6767
///
@@ -72,7 +72,7 @@ class ARROW_EXPORT RandomArrayGenerator {
7272
///
7373
/// \return a generated Array
7474
std::shared_ptr<arrow::Array> Int8(int64_t size, int8_t min, int8_t max,
75-
double null_probability);
75+
double null_probability = 0);
7676

7777
/// \brief Generates a random UInt16Array
7878
///
@@ -83,7 +83,7 @@ class ARROW_EXPORT RandomArrayGenerator {
8383
///
8484
/// \return a generated Array
8585
std::shared_ptr<arrow::Array> UInt16(int64_t size, uint16_t min, uint16_t max,
86-
double null_probability);
86+
double null_probability = 0);
8787

8888
/// \brief Generates a random Int16Array
8989
///
@@ -94,7 +94,7 @@ class ARROW_EXPORT RandomArrayGenerator {
9494
///
9595
/// \return a generated Array
9696
std::shared_ptr<arrow::Array> Int16(int64_t size, int16_t min, int16_t max,
97-
double null_probability);
97+
double null_probability = 0);
9898

9999
/// \brief Generates a random UInt32Array
100100
///
@@ -105,7 +105,7 @@ class ARROW_EXPORT RandomArrayGenerator {
105105
///
106106
/// \return a generated Array
107107
std::shared_ptr<arrow::Array> UInt32(int64_t size, uint32_t min, uint32_t max,
108-
double null_probability);
108+
double null_probability = 0);
109109

110110
/// \brief Generates a random Int32Array
111111
///
@@ -116,7 +116,7 @@ class ARROW_EXPORT RandomArrayGenerator {
116116
///
117117
/// \return a generated Array
118118
std::shared_ptr<arrow::Array> Int32(int64_t size, int32_t min, int32_t max,
119-
double null_probability);
119+
double null_probability = 0);
120120

121121
/// \brief Generates a random UInt64Array
122122
///
@@ -127,7 +127,7 @@ class ARROW_EXPORT RandomArrayGenerator {
127127
///
128128
/// \return a generated Array
129129
std::shared_ptr<arrow::Array> UInt64(int64_t size, uint64_t min, uint64_t max,
130-
double null_probability);
130+
double null_probability = 0);
131131

132132
/// \brief Generates a random Int64Array
133133
///
@@ -138,7 +138,7 @@ class ARROW_EXPORT RandomArrayGenerator {
138138
///
139139
/// \return a generated Array
140140
std::shared_ptr<arrow::Array> Int64(int64_t size, int64_t min, int64_t max,
141-
double null_probability);
141+
double null_probability = 0);
142142

143143
/// \brief Generates a random FloatArray
144144
///
@@ -149,7 +149,7 @@ class ARROW_EXPORT RandomArrayGenerator {
149149
///
150150
/// \return a generated Array
151151
std::shared_ptr<arrow::Array> Float32(int64_t size, float min, float max,
152-
double null_probability);
152+
double null_probability = 0);
153153

154154
/// \brief Generates a random DoubleArray
155155
///
@@ -160,11 +160,11 @@ class ARROW_EXPORT RandomArrayGenerator {
160160
///
161161
/// \return a generated Array
162162
std::shared_ptr<arrow::Array> Float64(int64_t size, double min, double max,
163-
double null_probability);
163+
double null_probability = 0);
164164

165165
template <typename ArrowType, typename CType = typename ArrowType::c_type>
166166
std::shared_ptr<arrow::Array> Numeric(int64_t size, CType min, CType max,
167-
double null_probability) {
167+
double null_probability = 0) {
168168
switch (ArrowType::type_id) {
169169
case Type::UINT8:
170170
return UInt8(size, static_cast<uint8_t>(min), static_cast<uint8_t>(max),
@@ -212,7 +212,7 @@ class ARROW_EXPORT RandomArrayGenerator {
212212
///
213213
/// \return a generated Array
214214
std::shared_ptr<arrow::Array> String(int64_t size, int32_t min_length,
215-
int32_t max_length, double null_probability);
215+
int32_t max_length, double null_probability = 0);
216216

217217
/// \brief Generates a random LargeStringArray
218218
///
@@ -225,7 +225,8 @@ class ARROW_EXPORT RandomArrayGenerator {
225225
///
226226
/// \return a generated Array
227227
std::shared_ptr<arrow::Array> LargeString(int64_t size, int32_t min_length,
228-
int32_t max_length, double null_probability);
228+
int32_t max_length,
229+
double null_probability = 0);
229230

230231
/// \brief Generates a random StringArray with repeated values
231232
///
@@ -241,12 +242,12 @@ class ARROW_EXPORT RandomArrayGenerator {
241242
/// \return a generated Array
242243
std::shared_ptr<arrow::Array> StringWithRepeats(int64_t size, int64_t unique,
243244
int32_t min_length, int32_t max_length,
244-
double null_probability);
245+
double null_probability = 0);
245246

246247
/// \brief Like StringWithRepeats but return BinaryArray
247248
std::shared_ptr<arrow::Array> BinaryWithRepeats(int64_t size, int64_t unique,
248249
int32_t min_length, int32_t max_length,
249-
double null_probability);
250+
double null_probability = 0);
250251

251252
SeedType seed() { return seed_distribution_(seed_rng_); }
252253

cpp/src/parquet/properties.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@ namespace parquet {
2626
std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
2727
std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
2828
if (buffered_stream_enabled_) {
29+
// ARROW-6180 / PARQUET-1636 Create isolated reader that references segment
30+
// of source
31+
std::shared_ptr<::arrow::io::InputStream> safe_stream =
32+
::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
2933
std::shared_ptr<::arrow::io::BufferedInputStream> stream;
30-
PARQUET_THROW_NOT_OK(source->Seek(start));
3134
PARQUET_THROW_NOT_OK(::arrow::io::BufferedInputStream::Create(
32-
buffer_size_, pool_, source, &stream, num_bytes));
35+
buffer_size_, pool_, safe_stream, &stream, num_bytes));
3336
return std::move(stream);
3437
} else {
3538
std::shared_ptr<Buffer> data;

cpp/src/parquet/properties.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ struct ParquetVersion {
3737
enum type { PARQUET_1_0, PARQUET_2_0 };
3838
};
3939

40-
static int64_t DEFAULT_BUFFER_SIZE = 0;
40+
static int64_t DEFAULT_BUFFER_SIZE = 1024;
4141
static bool DEFAULT_USE_BUFFERED_STREAM = false;
4242

4343
class PARQUET_EXPORT ReaderProperties {

0 commit comments

Comments
 (0)