Skip to content

Commit 076fbc6

Browse files
author
Deepak Majeti
committed
PARQUET-979: Limit size of min, max or disable stats for long binary types
Author: Deepak Majeti <deepak.majeti@hpe.com> Closes apache#465 from majetideepak/PARQUET-979 and squashes the following commits: 3b18173 [Deepak Majeti] improve naming and ColumnProperties class a888aa4 [Deepak Majeti] Add an option to specify max stats size c103c4f [Deepak Majeti] make format cf0260c [Deepak Majeti] PARQUET-979: [C++] Limit size of min, max or disable stats for long binary types Change-Id: I2843608b1848b6d85ac226064dd4c5ec93e40f47
1 parent b9e80c8 commit 076fbc6

6 files changed

Lines changed: 148 additions & 33 deletions

File tree

cpp/src/parquet/arrow/arrow-reader-writer-test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1445,8 +1445,8 @@ TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
14451445
using ::arrow::Schema;
14461446
using ::arrow::Table;
14471447
using ::arrow::TimeUnit;
1448-
using ::arrow::TimestampType;
14491448
using ::arrow::TimestampBuilder;
1449+
using ::arrow::TimestampType;
14501450
using ::arrow::default_memory_pool;
14511451

14521452
auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);

cpp/src/parquet/column_writer-test.cc

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,19 +71,21 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
7171
int64_t output_size = SMALL_SIZE,
7272
const ColumnProperties& column_properties = ColumnProperties()) {
7373
sink_.reset(new InMemoryOutputStream());
74-
metadata_ = ColumnChunkMetaDataBuilder::Make(
75-
writer_properties_, this->descr_, reinterpret_cast<uint8_t*>(&thrift_metadata_));
76-
std::unique_ptr<PageWriter> pager =
77-
PageWriter::Open(sink_.get(), column_properties.codec, metadata_.get());
7874
WriterProperties::Builder wp_builder;
79-
if (column_properties.encoding == Encoding::PLAIN_DICTIONARY ||
80-
column_properties.encoding == Encoding::RLE_DICTIONARY) {
75+
if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY ||
76+
column_properties.encoding() == Encoding::RLE_DICTIONARY) {
8177
wp_builder.enable_dictionary();
8278
} else {
8379
wp_builder.disable_dictionary();
84-
wp_builder.encoding(column_properties.encoding);
80+
wp_builder.encoding(column_properties.encoding());
8581
}
82+
wp_builder.max_statistics_size(column_properties.max_statistics_size());
8683
writer_properties_ = wp_builder.build();
84+
85+
metadata_ = ColumnChunkMetaDataBuilder::Make(
86+
writer_properties_, this->descr_, reinterpret_cast<uint8_t*>(&thrift_metadata_));
87+
std::unique_ptr<PageWriter> pager =
88+
PageWriter::Open(sink_.get(), column_properties.compression(), metadata_.get());
8789
std::shared_ptr<ColumnWriter> writer =
8890
ColumnWriter::Make(metadata_.get(), std::move(pager), writer_properties_.get());
8991
return std::static_pointer_cast<TypedColumnWriter<TestType>>(writer);
@@ -173,6 +175,16 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
173175
return metadata_accessor->num_values();
174176
}
175177

178+
bool metadata_is_stats_set() {
179+
// Metadata accessor must be created lazily.
180+
// This is because the ColumnChunkMetaData semantics dictate the metadata object is
181+
// complete (no changes to the metadata buffer can be made after instantiation)
182+
ApplicationVersion app_version(this->writer_properties_->created_by());
183+
auto metadata_accessor = ColumnChunkMetaData::Make(
184+
reinterpret_cast<const uint8_t*>(&thrift_metadata_), this->descr_, &app_version);
185+
return metadata_accessor->is_stats_set();
186+
}
187+
176188
std::vector<Encoding::type> metadata_encodings() {
177189
// Metadata accessor must be created lazily.
178190
// This is because the ColumnChunkMetaData semantics dictate the metadata object is
@@ -520,6 +532,50 @@ TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
520532
}
521533
}
522534

535+
// PARQUET-979
536+
// Prevent writing large stats
537+
using TestByteArrayValuesWriter = TestPrimitiveWriter<ByteArrayType>;
538+
TEST_F(TestByteArrayValuesWriter, OmitStats) {
539+
int min_len = 1024 * 4;
540+
int max_len = 1024 * 8;
541+
this->SetUpSchema(Repetition::REQUIRED);
542+
auto writer = this->BuildWriter();
543+
544+
values_.resize(SMALL_SIZE);
545+
InitWideByteArrayValues(SMALL_SIZE, this->values_, this->buffer_, min_len, max_len);
546+
writer->WriteBatch(SMALL_SIZE, nullptr, nullptr, this->values_.data());
547+
writer->Close();
548+
549+
ASSERT_FALSE(this->metadata_is_stats_set());
550+
}
551+
552+
TEST_F(TestByteArrayValuesWriter, LimitStats) {
553+
int min_len = 1024 * 4;
554+
int max_len = 1024 * 8;
555+
this->SetUpSchema(Repetition::REQUIRED);
556+
ColumnProperties column_properties;
557+
column_properties.set_max_statistics_size(static_cast<size_t>(max_len));
558+
auto writer = this->BuildWriter(SMALL_SIZE, column_properties);
559+
560+
values_.resize(SMALL_SIZE);
561+
InitWideByteArrayValues(SMALL_SIZE, this->values_, this->buffer_, min_len, max_len);
562+
writer->WriteBatch(SMALL_SIZE, nullptr, nullptr, this->values_.data());
563+
writer->Close();
564+
565+
ASSERT_TRUE(this->metadata_is_stats_set());
566+
}
567+
568+
TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) {
569+
this->SetUpSchema(Repetition::REQUIRED);
570+
auto writer = this->BuildWriter();
571+
this->GenerateData(SMALL_SIZE);
572+
573+
writer->WriteBatch(SMALL_SIZE, nullptr, nullptr, this->values_ptr_);
574+
writer->Close();
575+
576+
ASSERT_TRUE(this->metadata_is_stats_set());
577+
}
578+
523579
void GenerateLevels(int min_repeat_factor, int max_repeat_factor, int max_level,
524580
std::vector<int16_t>& input_levels) {
525581
// for each repetition count upto max_repeat_factor

cpp/src/parquet/column_writer.cc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,14 @@ int64_t ColumnWriter::Close() {
431431
FlushBufferedDataPages();
432432

433433
EncodedStatistics chunk_statistics = GetChunkStatistics();
434-
if (chunk_statistics.is_set()) {
434+
// From parquet-mr
435+
// Don't write stats larger than the max size rather than truncating. The
436+
// rationale is that some engines may use the minimum value in the page as
437+
// the true minimum for aggregations and there is no way to mark that a
438+
// value has been truncated and is a lower bound and not in the page.
439+
if (chunk_statistics.is_set() &&
440+
chunk_statistics.max_stat_length() <=
441+
properties_->max_statistics_size(descr_->path())) {
435442
metadata_->SetStatistics(SortOrder::SIGNED == descr_->sort_order(),
436443
chunk_statistics);
437444
}

cpp/src/parquet/properties.h

Lines changed: 64 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = DEFAULT_PAGE_SIZE;
8484
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
8585
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
8686
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
87+
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
8788
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
8889
static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION =
8990
ParquetVersion::PARQUET_1_0;
@@ -95,16 +96,46 @@ class PARQUET_EXPORT ColumnProperties {
9596
ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
9697
Compression::type codec = DEFAULT_COMPRESSION_TYPE,
9798
bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
98-
bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED)
99-
: encoding(encoding),
100-
codec(codec),
101-
dictionary_enabled(dictionary_enabled),
102-
statistics_enabled(statistics_enabled) {}
103-
104-
Encoding::type encoding;
105-
Compression::type codec;
106-
bool dictionary_enabled;
107-
bool statistics_enabled;
99+
bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
100+
size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
101+
: encoding_(encoding),
102+
codec_(codec),
103+
dictionary_enabled_(dictionary_enabled),
104+
statistics_enabled_(statistics_enabled),
105+
max_stats_size_(max_stats_size) {}
106+
107+
void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
108+
109+
void set_compression(Compression::type codec) { codec_ = codec; }
110+
111+
void set_dictionary_enabled(bool dictionary_enabled) {
112+
dictionary_enabled_ = dictionary_enabled;
113+
}
114+
115+
void set_statistics_enabled(bool statistics_enabled) {
116+
statistics_enabled_ = statistics_enabled;
117+
}
118+
119+
void set_max_statistics_size(size_t max_stats_size) {
120+
max_stats_size_ = max_stats_size;
121+
}
122+
123+
Encoding::type encoding() const { return encoding_; }
124+
125+
Compression::type compression() const { return codec_; }
126+
127+
bool dictionary_enabled() const { return dictionary_enabled_; }
128+
129+
bool statistics_enabled() const { return statistics_enabled_; }
130+
131+
size_t max_statistics_size() const { return max_stats_size_; }
132+
133+
private:
134+
Encoding::type encoding_;
135+
Compression::type codec_;
136+
bool dictionary_enabled_;
137+
bool statistics_enabled_;
138+
size_t max_stats_size_;
108139
};
109140

110141
class PARQUET_EXPORT WriterProperties {
@@ -127,12 +158,12 @@ class PARQUET_EXPORT WriterProperties {
127158
}
128159

129160
Builder* enable_dictionary() {
130-
default_column_properties_.dictionary_enabled = true;
161+
default_column_properties_.set_dictionary_enabled(true);
131162
return this;
132163
}
133164

134165
Builder* disable_dictionary() {
135-
default_column_properties_.dictionary_enabled = false;
166+
default_column_properties_.set_dictionary_enabled(false);
136167
return this;
137168
}
138169

@@ -196,7 +227,7 @@ class PARQUET_EXPORT WriterProperties {
196227
throw ParquetException("Can't use dictionary encoding as fallback encoding");
197228
}
198229

199-
default_column_properties_.encoding = encoding_type;
230+
default_column_properties_.set_encoding(encoding_type);
200231
return this;
201232
}
202233

@@ -228,7 +259,12 @@ class PARQUET_EXPORT WriterProperties {
228259
}
229260

230261
Builder* compression(Compression::type codec) {
231-
default_column_properties_.codec = codec;
262+
default_column_properties_.set_compression(codec);
263+
return this;
264+
}
265+
266+
Builder* max_statistics_size(size_t max_stats_sz) {
267+
default_column_properties_.set_max_statistics_size(max_stats_sz);
232268
return this;
233269
}
234270

@@ -243,12 +279,12 @@ class PARQUET_EXPORT WriterProperties {
243279
}
244280

245281
Builder* enable_statistics() {
246-
default_column_properties_.statistics_enabled = true;
282+
default_column_properties_.set_statistics_enabled(true);
247283
return this;
248284
}
249285

250286
Builder* disable_statistics() {
251-
default_column_properties_.statistics_enabled = false;
287+
default_column_properties_.set_statistics_enabled(false);
252288
return this;
253289
}
254290

@@ -280,12 +316,12 @@ class PARQUET_EXPORT WriterProperties {
280316
return it->second;
281317
};
282318

283-
for (const auto& item : encodings_) get(item.first).encoding = item.second;
284-
for (const auto& item : codecs_) get(item.first).codec = item.second;
319+
for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
320+
for (const auto& item : codecs_) get(item.first).set_compression(item.second);
285321
for (const auto& item : dictionary_enabled_)
286-
get(item.first).dictionary_enabled = item.second;
322+
get(item.first).set_dictionary_enabled(item.second);
287323
for (const auto& item : statistics_enabled_)
288-
get(item.first).statistics_enabled = item.second;
324+
get(item.first).set_statistics_enabled(item.second);
289325

290326
return std::shared_ptr<WriterProperties>(
291327
new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_,
@@ -348,19 +384,23 @@ class PARQUET_EXPORT WriterProperties {
348384
}
349385

350386
Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
351-
return column_properties(path).encoding;
387+
return column_properties(path).encoding();
352388
}
353389

354390
Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
355-
return column_properties(path).codec;
391+
return column_properties(path).compression();
356392
}
357393

358394
bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
359-
return column_properties(path).dictionary_enabled;
395+
return column_properties(path).dictionary_enabled();
360396
}
361397

362398
bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
363-
return column_properties(path).statistics_enabled;
399+
return column_properties(path).statistics_enabled();
400+
}
401+
402+
size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
403+
return column_properties(path).max_statistics_size();
364404
}
365405

366406
private:

cpp/src/parquet/statistics.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#ifndef PARQUET_COLUMN_STATISTICS_H
1919
#define PARQUET_COLUMN_STATISTICS_H
2020

21+
#include <algorithm>
2122
#include <cstdint>
2223
#include <memory>
2324
#include <string>
@@ -52,6 +53,9 @@ class PARQUET_EXPORT EncodedStatistics {
5253
return has_min || has_max || has_null_count || has_distinct_count;
5354
}
5455

56+
// larger of the max_ and min_ stat values
57+
inline size_t max_stat_length() { return std::max(max_->length(), min_->length()); }
58+
5559
inline EncodedStatistics& set_max(const std::string& value) {
5660
*max_ = value;
5761
has_max = true;

cpp/src/parquet/test-specialization.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,14 @@ void inline InitValues<ByteArray>(int num_values, vector<ByteArray>& values,
5050
random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len);
5151
}
5252

53+
void inline InitWideByteArrayValues(int num_values, vector<ByteArray>& values,
54+
vector<uint8_t>& buffer, int min_len, int max_len) {
55+
int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
56+
size_t nbytes = num_values * num_bytes;
57+
buffer.resize(nbytes);
58+
random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
59+
}
60+
5361
template <>
5462
void inline InitValues<FLBA>(int num_values, vector<FLBA>& values,
5563
vector<uint8_t>& buffer) {

0 commit comments

Comments
 (0)