Skip to content

Commit 5904eea

Browse files
committed
ARROW-3324: [Python] Destroy temporary metadata builder classes more eagerly when building files to reduce memory usage
Destroy the RowGroupMetaDataBuilder after each row group is completed.
Author: Wes McKinney <wesm+git@apache.org>
Closes apache#3261 from tanyaschlusser/ARROW-3324 and squashes the following commits:
5f38767 <Wes McKinney> Refine case a bit
4f2bdcd <Wes McKinney> Destroy RowGroupMetadataBuilder object after completing a row group to reduce memory usage
1 parent abde663 commit 5904eea

4 files changed

Lines changed: 93 additions & 67 deletions

File tree

cpp/src/parquet/metadata-test.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,6 @@ TEST(Metadata, TestBuildAccess) {
5959

6060
auto f_builder = FileMetaDataBuilder::Make(&schema, props);
6161
auto rg1_builder = f_builder->AppendRowGroup();
62-
auto rg2_builder = f_builder->AppendRowGroup();
6362

6463
// Write the metadata
6564
// rowgroup1 metadata
@@ -75,6 +74,7 @@ TEST(Metadata, TestBuildAccess) {
7574
rg1_builder->Finish(1024);
7675

7776
// rowgroup2 metadata
77+
auto rg2_builder = f_builder->AppendRowGroup();
7878
col1_builder = rg2_builder->NextColumnChunk();
7979
col2_builder = rg2_builder->NextColumnChunk();
8080
// column metadata

cpp/src/parquet/metadata.cc

Lines changed: 26 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,6 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
115115
}
116116
possible_stats_ = nullptr;
117117
}
118-
~ColumnChunkMetaDataImpl() {}
119118

120119
// column chunk
121120
inline int64_t file_offset() const { return column_->file_offset; }
@@ -197,13 +196,13 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
197196
};
198197

199198
std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
200-
const uint8_t* metadata, const ColumnDescriptor* descr,
199+
const void* metadata, const ColumnDescriptor* descr,
201200
const ApplicationVersion* writer_version) {
202201
return std::unique_ptr<ColumnChunkMetaData>(
203202
new ColumnChunkMetaData(metadata, descr, writer_version));
204203
}
205204

206-
ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata,
205+
ColumnChunkMetaData::ColumnChunkMetaData(const void* metadata,
207206
const ColumnDescriptor* descr,
208207
const ApplicationVersion* writer_version)
209208
: impl_{std::unique_ptr<ColumnChunkMetaDataImpl>(new ColumnChunkMetaDataImpl(
@@ -272,7 +271,6 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
272271
const SchemaDescriptor* schema,
273272
const ApplicationVersion* writer_version)
274273
: row_group_(row_group), schema_(schema), writer_version_(writer_version) {}
275-
~RowGroupMetaDataImpl() {}
276274

277275
inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
278276

@@ -289,9 +287,8 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
289287
<< " columns, requested metadata for column: " << i;
290288
throw ParquetException(ss.str());
291289
}
292-
return ColumnChunkMetaData::Make(
293-
reinterpret_cast<const uint8_t*>(&row_group_->columns[i]), schema_->Column(i),
294-
writer_version_);
290+
return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
291+
writer_version_);
295292
}
296293

297294
private:
@@ -301,14 +298,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
301298
};
302299

303300
std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
304-
const uint8_t* metadata, const SchemaDescriptor* schema,
301+
const void* metadata, const SchemaDescriptor* schema,
305302
const ApplicationVersion* writer_version) {
306303
return std::unique_ptr<RowGroupMetaData>(
307304
new RowGroupMetaData(metadata, schema, writer_version));
308305
}
309306

310-
RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata,
311-
const SchemaDescriptor* schema,
307+
RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
312308
const ApplicationVersion* writer_version)
313309
: impl_{std::unique_ptr<RowGroupMetaDataImpl>(new RowGroupMetaDataImpl(
314310
reinterpret_cast<const format::RowGroup*>(metadata), schema, writer_version))} {
@@ -332,10 +328,11 @@ class FileMetaData::FileMetaDataImpl {
332328
public:
333329
FileMetaDataImpl() : metadata_len_(0) {}
334330

335-
explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len)
331+
explicit FileMetaDataImpl(const void* metadata, uint32_t* metadata_len)
336332
: metadata_len_(0) {
337333
metadata_.reset(new format::FileMetaData);
338-
DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
334+
DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
335+
metadata_.get());
339336
metadata_len_ = *metadata_len;
340337

341338
if (metadata_->__isset.created_by) {
@@ -348,7 +345,6 @@ class FileMetaData::FileMetaDataImpl {
348345
InitColumnOrders();
349346
InitKeyValueMetadata();
350347
}
351-
~FileMetaDataImpl() {}
352348

353349
inline uint32_t size() const { return metadata_len_; }
354350
inline int num_columns() const { return schema_.num_columns(); }
@@ -375,9 +371,7 @@ class FileMetaData::FileMetaDataImpl {
375371
<< " row groups, requested metadata for row group: " << i;
376372
throw ParquetException(ss.str());
377373
}
378-
return RowGroupMetaData::Make(
379-
reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]), &schema_,
380-
&writer_version_);
374+
return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_);
381375
}
382376

383377
const SchemaDescriptor* schema() const { return &schema_; }
@@ -429,13 +423,13 @@ class FileMetaData::FileMetaDataImpl {
429423
std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
430424
};
431425

432-
std::shared_ptr<FileMetaData> FileMetaData::Make(const uint8_t* metadata,
426+
std::shared_ptr<FileMetaData> FileMetaData::Make(const void* metadata,
433427
uint32_t* metadata_len) {
434428
// This FileMetaData ctor is private, not compatible with std::make_shared
435429
return std::shared_ptr<FileMetaData>(new FileMetaData(metadata, metadata_len));
436430
}
437431

438-
FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len)
432+
FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len)
439433
: impl_{std::unique_ptr<FileMetaDataImpl>(
440434
new FileMetaDataImpl(metadata, metadata_len))} {}
441435

@@ -606,11 +600,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
606600
Init(column_chunk);
607601
}
608602

609-
~ColumnChunkMetaDataBuilderImpl() {}
610-
611-
const uint8_t* contents() const {
612-
return reinterpret_cast<const uint8_t*>(column_chunk_);
613-
}
603+
const void* contents() const { return column_chunk_; }
614604

615605
// column chunk
616606
void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
@@ -699,7 +689,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
699689

700690
std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
701691
const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
702-
uint8_t* contents) {
692+
void* contents) {
703693
return std::unique_ptr<ColumnChunkMetaDataBuilder>(
704694
new ColumnChunkMetaDataBuilder(props, column, contents));
705695
}
@@ -717,14 +707,14 @@ ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
717707

718708
ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
719709
const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
720-
uint8_t* contents)
710+
void* contents)
721711
: impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
722712
new ColumnChunkMetaDataBuilderImpl(
723713
props, column, reinterpret_cast<format::ColumnChunk*>(contents)))} {}
724714

725715
ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {}
726716

727-
const uint8_t* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
717+
const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
728718

729719
void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
730720
impl_->set_file_path(path);
@@ -754,12 +744,11 @@ void ColumnChunkMetaDataBuilder::SetStatistics(bool is_signed,
754744
class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
755745
public:
756746
explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr<WriterProperties>& props,
757-
const SchemaDescriptor* schema, uint8_t* contents)
747+
const SchemaDescriptor* schema, void* contents)
758748
: properties_(props), schema_(schema), current_column_(0) {
759749
row_group_ = reinterpret_cast<format::RowGroup*>(contents);
760750
InitializeColumns(schema->num_columns());
761751
}
762-
~RowGroupMetaDataBuilderImpl() {}
763752

764753
ColumnChunkMetaDataBuilder* NextColumnChunk() {
765754
if (!(current_column_ < num_columns())) {
@@ -770,8 +759,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
770759
}
771760
auto column = schema_->Column(current_column_);
772761
auto column_builder = ColumnChunkMetaDataBuilder::Make(
773-
properties_, column,
774-
reinterpret_cast<uint8_t*>(&row_group_->columns[current_column_++]));
762+
properties_, column, &row_group_->columns[current_column_++]);
775763
auto column_builder_ptr = column_builder.get();
776764
column_builders_.push_back(std::move(column_builder));
777765
return column_builder_ptr;
@@ -820,14 +808,14 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
820808

821809
std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
822810
const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
823-
uint8_t* contents) {
811+
void* contents) {
824812
return std::unique_ptr<RowGroupMetaDataBuilder>(
825813
new RowGroupMetaDataBuilder(props, schema_, contents));
826814
}
827815

828816
RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(
829817
const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
830-
uint8_t* contents)
818+
void* contents)
831819
: impl_{std::unique_ptr<RowGroupMetaDataBuilderImpl>(
832820
new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {}
833821

@@ -861,16 +849,12 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
861849
: properties_(props), schema_(schema), key_value_metadata_(key_value_metadata) {
862850
metadata_.reset(new format::FileMetaData());
863851
}
864-
~FileMetaDataBuilderImpl() {}
865852

866853
RowGroupMetaDataBuilder* AppendRowGroup() {
867-
auto row_group = std::unique_ptr<format::RowGroup>(new format::RowGroup());
868-
auto row_group_builder = RowGroupMetaDataBuilder::Make(
869-
properties_, schema_, reinterpret_cast<uint8_t*>(row_group.get()));
870-
RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get();
871-
row_group_builders_.push_back(std::move(row_group_builder));
872-
row_groups_.push_back(std::move(row_group));
873-
return row_group_ptr;
854+
row_groups_.emplace_back(new format::RowGroup);
855+
current_row_group_builder_ =
856+
RowGroupMetaDataBuilder::Make(properties_, schema_, row_groups_.back().get());
857+
return current_row_group_builder_.get();
874858
}
875859

876860
std::unique_ptr<FileMetaData> Finish() {
@@ -939,7 +923,8 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl {
939923
private:
940924
const std::shared_ptr<WriterProperties> properties_;
941925
std::vector<std::unique_ptr<format::RowGroup>> row_groups_;
942-
std::vector<std::unique_ptr<RowGroupMetaDataBuilder>> row_group_builders_;
926+
927+
std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
943928
const SchemaDescriptor* schema_;
944929
std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
945930
};

cpp/src/parquet/metadata.h

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
9393
public:
9494
// API convenience to get a MetaData accessor
9595
static std::unique_ptr<ColumnChunkMetaData> Make(
96-
const uint8_t* metadata, const ColumnDescriptor* descr,
96+
const void* metadata, const ColumnDescriptor* descr,
9797
const ApplicationVersion* writer_version = NULLPTR);
9898

9999
~ColumnChunkMetaData();
@@ -119,7 +119,7 @@ class PARQUET_EXPORT ColumnChunkMetaData {
119119
int64_t total_uncompressed_size() const;
120120

121121
private:
122-
explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr,
122+
explicit ColumnChunkMetaData(const void* metadata, const ColumnDescriptor* descr,
123123
const ApplicationVersion* writer_version = NULLPTR);
124124
// PIMPL Idiom
125125
class ColumnChunkMetaDataImpl;
@@ -130,7 +130,7 @@ class PARQUET_EXPORT RowGroupMetaData {
130130
public:
131131
// API convenience to get a MetaData accessor
132132
static std::unique_ptr<RowGroupMetaData> Make(
133-
const uint8_t* metadata, const SchemaDescriptor* schema,
133+
const void* metadata, const SchemaDescriptor* schema,
134134
const ApplicationVersion* writer_version = NULLPTR);
135135

136136
~RowGroupMetaData();
@@ -144,7 +144,7 @@ class PARQUET_EXPORT RowGroupMetaData {
144144
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const;
145145

146146
private:
147-
explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema,
147+
explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
148148
const ApplicationVersion* writer_version = NULLPTR);
149149
// PIMPL Idiom
150150
class RowGroupMetaDataImpl;
@@ -156,7 +156,7 @@ class FileMetaDataBuilder;
156156
class PARQUET_EXPORT FileMetaData {
157157
public:
158158
// API convenience to get a MetaData accessor
159-
static std::shared_ptr<FileMetaData> Make(const uint8_t* serialized_metadata,
159+
static std::shared_ptr<FileMetaData> Make(const void* serialized_metadata,
160160
uint32_t* metadata_len);
161161

162162
~FileMetaData();
@@ -182,7 +182,7 @@ class PARQUET_EXPORT FileMetaData {
182182

183183
private:
184184
friend FileMetaDataBuilder;
185-
explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
185+
explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len);
186186

187187
// PIMPL Idiom
188188
FileMetaData();
@@ -199,7 +199,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
199199

200200
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
201201
const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
202-
uint8_t* contents);
202+
void* contents);
203203

204204
~ColumnChunkMetaDataBuilder();
205205

@@ -217,7 +217,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
217217
bool dictionary_fallback);
218218

219219
// The metadata contents, suitable for passing to ColumnChunkMetaData::Make
220-
const uint8_t* contents() const;
220+
const void* contents() const;
221221

222222
// For writing metadata at end of column chunk
223223
void WriteTo(OutputStream* sink);
@@ -226,7 +226,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
226226
explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
227227
const ColumnDescriptor* column);
228228
explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
229-
const ColumnDescriptor* column, uint8_t* contents);
229+
const ColumnDescriptor* column, void* contents);
230230
// PIMPL Idiom
231231
class ColumnChunkMetaDataBuilderImpl;
232232
std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
@@ -237,7 +237,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder {
237237
// API convenience to get a MetaData reader
238238
static std::unique_ptr<RowGroupMetaDataBuilder> Make(
239239
const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
240-
uint8_t* contents);
240+
void* contents);
241241

242242
~RowGroupMetaDataBuilder();
243243

@@ -253,7 +253,7 @@ class PARQUET_EXPORT RowGroupMetaDataBuilder {
253253

254254
private:
255255
explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
256-
const SchemaDescriptor* schema_, uint8_t* contents);
256+
const SchemaDescriptor* schema_, void* contents);
257257
// PIMPL Idiom
258258
class RowGroupMetaDataBuilderImpl;
259259
std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
@@ -268,9 +268,10 @@ class PARQUET_EXPORT FileMetaDataBuilder {
268268

269269
~FileMetaDataBuilder();
270270

271+
// The prior RowGroupMetaDataBuilder (if any) is destroyed
271272
RowGroupMetaDataBuilder* AppendRowGroup();
272273

273-
// commit the metadata
274+
// Complete the Thrift structure
274275
std::unique_ptr<FileMetaData> Finish();
275276

276277
private:

0 commit comments

Comments
 (0)