Skip to content

Commit cd2a3cd

Browse files
committed
Add unit tests for legacy Parquet input/output wrappers
1 parent e03f07d commit cd2a3cd

10 files changed

Lines changed: 244 additions & 121 deletions

File tree

cpp/src/parquet/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,6 @@ endif()
271271

272272
add_subdirectory(api)
273273
add_subdirectory(arrow)
274-
add_subdirectory(util)
275274

276275
arrow_install_all_headers("parquet")
277276

cpp/src/parquet/arrow/reader.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@
4747
#include "parquet/file_reader.h"
4848
#include "parquet/metadata.h"
4949
#include "parquet/properties.h"
50+
#include "parquet/schema-internal.h"
5051
#include "parquet/schema.h"
5152
#include "parquet/types.h"
52-
#include "parquet/util/schema-util.h"
5353

5454
using arrow::Array;
5555
using arrow::BooleanArray;
@@ -399,7 +399,7 @@ Status FileReader::Impl::GetReaderForNode(
399399
std::unique_ptr<ColumnReader::ColumnReaderImpl>* out) {
400400
*out = nullptr;
401401

402-
if (IsSimpleStruct(node)) {
402+
if (schema::IsSimpleStruct(node)) {
403403
const schema::GroupNode* group = static_cast<const schema::GroupNode*>(node);
404404
std::vector<std::shared_ptr<ColumnReader::ColumnReaderImpl>> children;
405405
for (int i = 0; i < group->field_count(); i++) {
@@ -425,15 +425,15 @@ Status FileReader::Impl::GetReaderForNode(
425425
const Node* walker = node;
426426
while (!walker->is_primitive()) {
427427
DCHECK(walker->is_group());
428-
auto group = static_cast<const GroupNode*>(walker);
428+
auto group = static_cast<const schema::GroupNode*>(walker);
429429
if (group->field_count() != 1) {
430430
return Status::NotImplemented("lists with structs are not supported.");
431431
}
432432
walker = group->field(0).get();
433433
}
434434
auto column_index = reader_->metadata()->schema()->ColumnIndex(*walker);
435435

436-
// If the index of the column is found then a reader for the coliumn is needed.
436+
// If the index of the column is found then a reader for the column is needed.
437437
// Otherwise *out keeps the nullptr value.
438438
if (std::find(indices.begin(), indices.end(), column_index) != indices.end()) {
439439
std::unique_ptr<ColumnReader> reader;
@@ -555,8 +555,8 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index,
555555
// We only need to read schema fields which have columns indicated
556556
// in the indices vector
557557
std::vector<int> field_indices;
558-
if (!ColumnIndicesToFieldIndices(*reader_->metadata()->schema(), indices,
559-
&field_indices)) {
558+
if (!schema::ColumnIndicesToFieldIndices(*reader_->metadata()->schema(), indices,
559+
&field_indices)) {
560560
return Status::Invalid("Invalid column index");
561561
}
562562
int num_fields = static_cast<int>(field_indices.size());
@@ -612,8 +612,8 @@ Status FileReader::Impl::ReadTable(const std::vector<int>& indices,
612612
// We only need to read schema fields which have columns indicated
613613
// in the indices vector
614614
std::vector<int> field_indices;
615-
if (!ColumnIndicesToFieldIndices(*reader_->metadata()->schema(), indices,
616-
&field_indices)) {
615+
if (!schema::ColumnIndicesToFieldIndices(*reader_->metadata()->schema(), indices,
616+
&field_indices)) {
617617
return Status::Invalid("Invalid column index");
618618
}
619619

cpp/src/parquet/arrow/schema.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@
3030
#include "parquet/arrow/writer.h"
3131
#include "parquet/exception.h"
3232
#include "parquet/properties.h"
33+
#include "parquet/schema-internal.h"
3334
#include "parquet/types.h"
34-
#include "parquet/util/schema-util.h"
3535

3636
using arrow::Field;
3737
using arrow::Status;
@@ -255,7 +255,7 @@ Status NodeToList(const GroupNode& group,
255255
// Special case mentioned in the format spec:
256256
// If the name is array or ends in _tuple, this should be a list of struct
257257
// even for single child elements.
258-
if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
258+
if (list_group.field_count() == 1 && !schema::HasStructListName(list_group)) {
259259
// List of primitive type
260260
std::shared_ptr<Field> item_field;
261261
RETURN_NOT_OK(

cpp/src/parquet/arrow/writer.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ class OutputStream;
4646
namespace parquet {
4747

4848
class FileMetaData;
49-
class OutputStream;
5049
class ParquetFileWriter;
5150

5251
namespace arrow {

cpp/src/parquet/deprecated_io-test.cc

Lines changed: 172 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <algorithm>
1819
#include <cstdint>
1920
#include <cstdio>
2021
#include <memory>
@@ -23,11 +24,181 @@
2324

2425
#include <gtest/gtest.h>
2526

27+
#include "arrow/testing/gtest_util.h"
28+
29+
#include "parquet/deprecated_io.h"
2630
#include "parquet/exception.h"
2731
#include "parquet/platform.h"
2832
#include "parquet/test-util.h"
2933

3034
using arrow::default_memory_pool;
3135
using arrow::MemoryPool;
3236

33-
namespace parquet {} // namespace parquet
37+
namespace parquet {
38+
39+
class MockRandomAccessSource : public RandomAccessSource {
40+
public:
41+
MockRandomAccessSource(const uint8_t* data, int64_t size)
42+
: data_(data), position_(0), size_(size) {}
43+
44+
int64_t Size() const override { return size_; }
45+
46+
int64_t Read(int64_t nbytes, uint8_t* out) override {
47+
ThrowIfClosed();
48+
int64_t bytes_to_read = std::min(nbytes, size_ - position_);
49+
if (bytes_to_read == 0) {
50+
return 0;
51+
}
52+
memcpy(out, data_ + position_, bytes_to_read);
53+
position_ += bytes_to_read;
54+
return bytes_to_read;
55+
}
56+
57+
std::shared_ptr<Buffer> Read(int64_t nbytes) override {
58+
ThrowIfClosed();
59+
int64_t bytes_to_read = std::min(nbytes, size_ - position_);
60+
std::shared_ptr<ResizableBuffer> out =
61+
AllocateBuffer(::arrow::default_memory_pool(), bytes_to_read);
62+
Read(bytes_to_read, out->mutable_data());
63+
return std::move(out);
64+
}
65+
66+
std::shared_ptr<Buffer> ReadAt(int64_t position, int64_t nbytes) override {
67+
ThrowIfClosed();
68+
position_ = position;
69+
return Read(nbytes);
70+
}
71+
72+
int64_t ReadAt(int64_t position, int64_t nbytes, uint8_t* out) override {
73+
ThrowIfClosed();
74+
position_ = position;
75+
return Read(nbytes, out);
76+
}
77+
78+
void Close() override { closed_ = true; }
79+
80+
int64_t Tell() override {
81+
ThrowIfClosed();
82+
return position_;
83+
}
84+
85+
bool closed() const { return closed_; }
86+
87+
private:
88+
const uint8_t* data_;
89+
int64_t position_;
90+
int64_t size_;
91+
bool closed_ = false;
92+
93+
void ThrowIfClosed() {
94+
if (closed_) {
95+
throw ParquetException("file is closed");
96+
}
97+
}
98+
};
99+
100+
TEST(ParquetInputWrapper, BasicOperation) {
101+
std::string data = "some example data";
102+
103+
auto source = std::unique_ptr<RandomAccessSource>(new MockRandomAccessSource(
104+
reinterpret_cast<const uint8_t*>(data.data()), static_cast<int64_t>(data.size())));
105+
ParquetInputWrapper wrapper(std::move(source));
106+
107+
ASSERT_FALSE(wrapper.closed());
108+
109+
int64_t position = -1;
110+
ASSERT_OK(wrapper.Tell(&position));
111+
ASSERT_EQ(0, position);
112+
113+
// Read into memory
114+
uint8_t buf[4] = {0};
115+
int64_t bytes_read = -1;
116+
ASSERT_OK(wrapper.Read(4, &bytes_read, buf));
117+
ASSERT_EQ(4, bytes_read);
118+
ASSERT_EQ(0, memcmp(buf, data.data(), 4));
119+
120+
ASSERT_OK(wrapper.Tell(&position));
121+
ASSERT_EQ(4, position);
122+
123+
// Seek
124+
ASSERT_RAISES(NotImplemented, wrapper.Seek(5));
125+
126+
// Read buffer
127+
std::shared_ptr<Buffer> buffer;
128+
ASSERT_OK(wrapper.Read(7, &buffer));
129+
ASSERT_EQ(0, memcmp(buffer->data(), data.data() + 4, 7));
130+
131+
// ReadAt
132+
ASSERT_OK(wrapper.ReadAt(13, 4, &buffer));
133+
ASSERT_EQ(4, buffer->size());
134+
ASSERT_EQ(0, memcmp(buffer->data(), data.data() + 13, 4));
135+
136+
// GetSize
137+
int64_t size = -1;
138+
ASSERT_OK(wrapper.GetSize(&size));
139+
ASSERT_EQ(static_cast<int64_t>(data.size()), size);
140+
141+
// Close
142+
ASSERT_OK(wrapper.Close());
143+
ASSERT_TRUE(wrapper.closed());
144+
}
145+
146+
class MockOutputStream : public OutputStream {
147+
public:
148+
MockOutputStream() {}
149+
150+
void Write(const uint8_t* data, int64_t length) override {
151+
ThrowIfClosed();
152+
size_ += length;
153+
}
154+
155+
void Close() override { closed_ = true; }
156+
157+
int64_t Tell() override {
158+
ThrowIfClosed();
159+
return size_;
160+
}
161+
162+
bool closed() const { return closed_; }
163+
164+
private:
165+
int64_t size_ = 0;
166+
bool closed_ = false;
167+
168+
void ThrowIfClosed() {
169+
if (closed_) {
170+
throw ParquetException("file is closed");
171+
}
172+
}
173+
};
174+
175+
TEST(ParquetOutputWrapper, BasicOperation) {
176+
auto stream = std::unique_ptr<OutputStream>(new MockOutputStream);
177+
ParquetOutputWrapper wrapper(std::move(stream));
178+
179+
int64_t position = -1;
180+
ASSERT_OK(wrapper.Tell(&position));
181+
ASSERT_EQ(0, position);
182+
183+
std::string data = "food";
184+
185+
ASSERT_OK(wrapper.Write(reinterpret_cast<const uint8_t*>(data.data()), 4));
186+
ASSERT_OK(wrapper.Tell(&position));
187+
ASSERT_EQ(4, position);
188+
189+
// Close
190+
ASSERT_OK(wrapper.Close());
191+
ASSERT_TRUE(wrapper.closed());
192+
193+
// Test catch exceptions
194+
ASSERT_RAISES(IOError, wrapper.Tell(&position));
195+
ASSERT_RAISES(IOError, wrapper.Write(reinterpret_cast<const uint8_t*>(data.data()), 4));
196+
}
197+
198+
TEST(ParquetOutputWrapper, DtorCloses) {
199+
MockOutputStream stream;
200+
{ ParquetOutputWrapper wrapper(&stream); }
201+
ASSERT_TRUE(stream.closed());
202+
}
203+
204+
} // namespace parquet

cpp/src/parquet/deprecated_io.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@ ParquetInputWrapper::ParquetInputWrapper(RandomAccessSource* source)
4040

4141
ParquetInputWrapper::~ParquetInputWrapper() {
4242
if (!closed_) {
43-
source_->Close();
43+
try {
44+
source_->Close();
45+
} catch (...) {
46+
}
4447
closed_ = true;
4548
}
4649
}
@@ -101,7 +104,10 @@ ParquetOutputWrapper::ParquetOutputWrapper(::parquet::OutputStream* sink)
101104

102105
ParquetOutputWrapper::~ParquetOutputWrapper() {
103106
if (!closed_) {
104-
sink_->Close();
107+
try {
108+
sink_->Close();
109+
} catch (...) {
110+
}
105111
closed_ = true;
106112
}
107113
}

cpp/src/parquet/file_writer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ class PARQUET_EXPORT ParquetFileWriter {
165165
const std::shared_ptr<WriterProperties>& properties = default_writer_properties(),
166166
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
167167

168+
ARROW_DEPRECATED("Use version with arrow::io::OutputStream")
168169
static std::unique_ptr<ParquetFileWriter> Open(
169170
const std::shared_ptr<OutputStream>& sink,
170171
const std::shared_ptr<schema::GroupNode>& schema,

cpp/src/parquet/schema-internal.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323

2424
#include <cstdint>
2525
#include <memory>
26+
#include <string>
27+
#include <unordered_set>
2628
#include <vector>
2729

2830
#include "parquet/platform.h"
@@ -37,6 +39,57 @@ class SchemaElement;
3739

3840
namespace schema {
3941

42+
inline bool str_endswith_tuple(const std::string& str) {
43+
if (str.size() >= 6) {
44+
return str.substr(str.size() - 6, 6) == "_tuple";
45+
}
46+
return false;
47+
}
48+
49+
// Special case mentioned in the format spec:
50+
// If the name is array or ends in _tuple, this should be a list of struct
51+
// even for single child elements.
52+
inline bool HasStructListName(const GroupNode& node) {
53+
return (node.name() == "array" || str_endswith_tuple(node.name()));
54+
}
55+
56+
// TODO(itaiin): This aux. function is to be deleted once repeated structs are supported
57+
inline bool IsSimpleStruct(const Node* node) {
58+
if (!node->is_group()) return false;
59+
if (node->is_repeated()) return false;
60+
if (node->logical_type() == LogicalType::LIST) return false;
61+
// Special case mentioned in the format spec:
62+
// If the name is array or ends in _tuple, this should be a list of struct
63+
// even for single child elements.
64+
auto group = static_cast<const GroupNode*>(node);
65+
if (group->field_count() == 1 && HasStructListName(*group)) return false;
66+
67+
return true;
68+
}
69+
70+
// Coalesce a list of schema fields indices which are the roots of the
71+
// columns referred by a list of column indices
72+
inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr,
73+
const std::vector<int>& column_indices,
74+
std::vector<int>* out) {
75+
const GroupNode* group = descr.group_node();
76+
std::unordered_set<int> already_added;
77+
out->clear();
78+
for (auto& column_idx : column_indices) {
79+
auto field_node = descr.GetColumnRoot(column_idx);
80+
auto field_idx = group->FieldIndex(*field_node);
81+
if (field_idx < 0) {
82+
return false;
83+
}
84+
auto insertion = already_added.insert(field_idx);
85+
if (insertion.second) {
86+
out->push_back(field_idx);
87+
}
88+
}
89+
90+
return true;
91+
}
92+
4093
// ----------------------------------------------------------------------
4194
// Conversion from Parquet Thrift metadata
4295

0 commit comments

Comments
 (0)