Skip to content

Commit 7a532ed

Browse files
committed
ARROW-8678: [C++/Python][Parquet] Remove old writer code path
The new code has been the default for 2 releases. Closes apache#8184 from emkornfield/ARROW-8678 Authored-by: Micah Kornfield <emkornfield@gmail.com> Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
1 parent ca12cd1 commit 7a532ed

6 files changed

Lines changed: 5 additions & 372 deletions

File tree

cpp/src/parquet/arrow/writer.cc

Lines changed: 2 additions & 354 deletions
Original file line numberDiff line numberDiff line change
@@ -98,249 +98,6 @@ bool HasNullableRoot(const SchemaManifest& schema_manifest,
9898
return nullable;
9999
}
100100

101-
class LevelBuilder {
102-
public:
103-
explicit LevelBuilder(MemoryPool* pool, const SchemaField* schema_field,
104-
const SchemaManifest* schema_manifest)
105-
: def_levels_(pool),
106-
rep_levels_(pool),
107-
schema_field_(schema_field),
108-
schema_manifest_(schema_manifest) {}
109-
110-
Status VisitInline(const Array& array);
111-
112-
template <typename T>
113-
::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
114-
const T& array) {
115-
array_offsets_.push_back(static_cast<int32_t>(array.offset()));
116-
valid_bitmaps_.push_back(array.null_bitmap_data());
117-
null_counts_.push_back(array.null_count());
118-
values_array_ = std::make_shared<T>(array.data());
119-
return Status::OK();
120-
}
121-
122-
Status Visit(const DictionaryArray& array) {
123-
// Only currently handle DictionaryArray where the dictionary is a
124-
// primitive type
125-
if (array.dict_type()->value_type()->num_fields() > 0) {
126-
return Status::NotImplemented(
127-
"Writing DictionaryArray with nested dictionary "
128-
"type not yet supported");
129-
}
130-
array_offsets_.push_back(static_cast<int32_t>(array.offset()));
131-
valid_bitmaps_.push_back(array.null_bitmap_data());
132-
null_counts_.push_back(array.null_count());
133-
values_array_ = std::make_shared<DictionaryArray>(array.data());
134-
return Status::OK();
135-
}
136-
137-
Status Visit(const ListArray& array) {
138-
array_offsets_.push_back(static_cast<int32_t>(array.offset()));
139-
valid_bitmaps_.push_back(array.null_bitmap_data());
140-
null_counts_.push_back(array.null_count());
141-
offsets_.push_back(array.raw_value_offsets());
142-
143-
// Min offset isn't always zero in the case of sliced Arrays.
144-
min_offset_idx_ = array.value_offset(min_offset_idx_);
145-
max_offset_idx_ = array.value_offset(max_offset_idx_);
146-
147-
return VisitInline(*array.values());
148-
}
149-
150-
Status Visit(const ExtensionArray& array) { return VisitInline(*array.storage()); }
151-
152-
#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
153-
Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
154-
return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
155-
" not supported yet"); \
156-
}
157-
158-
// See ARROW-1644
159-
NOT_IMPLEMENTED_VISIT(LargeList)
160-
NOT_IMPLEMENTED_VISIT(Map)
161-
NOT_IMPLEMENTED_VISIT(FixedSizeList)
162-
NOT_IMPLEMENTED_VISIT(Struct)
163-
NOT_IMPLEMENTED_VISIT(Union)
164-
165-
#undef NOT_IMPLEMENTED_VISIT
166-
167-
Status ExtractNullability() {
168-
// Walk upwards to extract nullability
169-
const SchemaField* current_field = schema_field_;
170-
while (current_field != nullptr) {
171-
nullable_.push_front(current_field->field->nullable());
172-
if (current_field->field->type()->num_fields() > 1) {
173-
return Status::NotImplemented(
174-
"Fields with more than one child are not supported.");
175-
} else {
176-
current_field = schema_manifest_->GetParent(current_field);
177-
}
178-
}
179-
return Status::OK();
180-
}
181-
182-
Status GenerateLevels(const Array& array, int64_t* values_offset, int64_t* num_values,
183-
int64_t* num_levels,
184-
const std::shared_ptr<ResizableBuffer>& def_levels_scratch,
185-
std::shared_ptr<Buffer>* def_levels_out,
186-
std::shared_ptr<Buffer>* rep_levels_out,
187-
std::shared_ptr<Array>* values_array) {
188-
// Work downwards to extract bitmaps and offsets
189-
min_offset_idx_ = 0;
190-
max_offset_idx_ = array.length();
191-
RETURN_NOT_OK(VisitInline(array));
192-
*num_values = max_offset_idx_ - min_offset_idx_;
193-
*values_offset = min_offset_idx_;
194-
*values_array = values_array_;
195-
196-
RETURN_NOT_OK(ExtractNullability());
197-
198-
// Generate the levels.
199-
if (nullable_.size() == 1) {
200-
// We have a PrimitiveArray
201-
*rep_levels_out = nullptr;
202-
if (nullable_[0]) {
203-
RETURN_NOT_OK(
204-
def_levels_scratch->Resize(array.length() * sizeof(int16_t), false));
205-
auto def_levels_ptr =
206-
reinterpret_cast<int16_t*>(def_levels_scratch->mutable_data());
207-
if (array.null_count() == 0) {
208-
std::fill(def_levels_ptr, def_levels_ptr + array.length(), 1);
209-
} else if (array.null_count() == array.length()) {
210-
std::fill(def_levels_ptr, def_levels_ptr + array.length(), 0);
211-
} else {
212-
::arrow::internal::BitmapReader valid_bits_reader(
213-
array.null_bitmap_data(), array.offset(), array.length());
214-
for (int i = 0; i < array.length(); i++) {
215-
def_levels_ptr[i] = valid_bits_reader.IsSet() ? 1 : 0;
216-
valid_bits_reader.Next();
217-
}
218-
}
219-
220-
*def_levels_out = def_levels_scratch;
221-
} else {
222-
*def_levels_out = nullptr;
223-
}
224-
*num_levels = array.length();
225-
} else {
226-
// Note it is hard to estimate memory consumption due to zero length
227-
// arrays otherwise we would preallocate. An upper bound on memory
228-
// is the sum of the length of each list array + number of elements
229-
// but this might be too loose of an upper bound so we choose to use
230-
// safe methods.
231-
RETURN_NOT_OK(rep_levels_.Append(0));
232-
RETURN_NOT_OK(HandleListEntries(0, 0, 0, array.length()));
233-
234-
RETURN_NOT_OK(def_levels_.Finish(def_levels_out));
235-
RETURN_NOT_OK(rep_levels_.Finish(rep_levels_out));
236-
*num_levels = (*rep_levels_out)->size() / sizeof(int16_t);
237-
}
238-
239-
return Status::OK();
240-
}
241-
242-
Status HandleList(int16_t def_level, int16_t rep_level, int64_t index) {
243-
if (nullable_[rep_level]) {
244-
if (null_counts_[rep_level] == 0 ||
245-
BitUtil::GetBit(valid_bitmaps_[rep_level], index + array_offsets_[rep_level])) {
246-
return HandleNonNullList(static_cast<int16_t>(def_level + 1), rep_level, index);
247-
} else {
248-
return def_levels_.Append(def_level);
249-
}
250-
} else {
251-
return HandleNonNullList(def_level, rep_level, index);
252-
}
253-
}
254-
255-
Status HandleNonNullList(int16_t def_level, int16_t rep_level, int64_t index) {
256-
const int32_t inner_offset = offsets_[rep_level][index];
257-
const int32_t inner_length = offsets_[rep_level][index + 1] - inner_offset;
258-
const int64_t recursion_level = rep_level + 1;
259-
if (inner_length == 0) {
260-
return def_levels_.Append(def_level);
261-
}
262-
if (recursion_level < static_cast<int64_t>(offsets_.size())) {
263-
return HandleListEntries(static_cast<int16_t>(def_level + 1),
264-
static_cast<int16_t>(rep_level + 1), inner_offset,
265-
inner_length);
266-
}
267-
// We have reached the leaf: primitive list, handle remaining nullables
268-
const bool nullable_level = nullable_[recursion_level];
269-
const int64_t level_null_count = null_counts_[recursion_level];
270-
const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level];
271-
272-
if (inner_length >= 1) {
273-
RETURN_NOT_OK(
274-
rep_levels_.Append(inner_length - 1, static_cast<int16_t>(rep_level + 1)));
275-
}
276-
277-
// Special case: this is a null array (all elements are null)
278-
if (level_null_count && level_valid_bitmap == nullptr) {
279-
return def_levels_.Append(inner_length, static_cast<int16_t>(def_level + 1));
280-
}
281-
for (int64_t i = 0; i < inner_length; i++) {
282-
if (nullable_level &&
283-
((level_null_count == 0) ||
284-
BitUtil::GetBit(level_valid_bitmap,
285-
inner_offset + i + array_offsets_[recursion_level]))) {
286-
// Non-null element in a null level
287-
RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 2)));
288-
} else {
289-
// This can be produced in two cases:
290-
// * elements are nullable and this one is null
291-
// (i.e. max_def_level = def_level + 2)
292-
// * elements are non-nullable (i.e. max_def_level = def_level + 1)
293-
RETURN_NOT_OK(def_levels_.Append(static_cast<int16_t>(def_level + 1)));
294-
}
295-
}
296-
return Status::OK();
297-
}
298-
299-
Status HandleListEntries(int16_t def_level, int16_t rep_level, int64_t offset,
300-
int64_t length) {
301-
for (int64_t i = 0; i < length; i++) {
302-
if (i > 0) {
303-
RETURN_NOT_OK(rep_levels_.Append(rep_level));
304-
}
305-
RETURN_NOT_OK(HandleList(def_level, rep_level, offset + i));
306-
}
307-
return Status::OK();
308-
}
309-
310-
private:
311-
Int16BufferBuilder def_levels_;
312-
Int16BufferBuilder rep_levels_;
313-
314-
const SchemaField* schema_field_;
315-
const SchemaManifest* schema_manifest_;
316-
317-
std::vector<int64_t> null_counts_;
318-
std::vector<const uint8_t*> valid_bitmaps_;
319-
std::vector<const int32_t*> offsets_;
320-
std::vector<int32_t> array_offsets_;
321-
std::deque<bool> nullable_;
322-
323-
int64_t min_offset_idx_;
324-
int64_t max_offset_idx_;
325-
std::shared_ptr<Array> values_array_;
326-
};
327-
328-
Status LevelBuilder::VisitInline(const Array& array) {
329-
return VisitArrayInline(array, this);
330-
}
331-
332-
Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) {
333-
if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) {
334-
if (type.num_fields() != 1) {
335-
return Status::Invalid("Nested column branch had multiple children: ", type);
336-
}
337-
return GetLeafType(*type.field(0)->type(), leaf_type);
338-
} else {
339-
*leaf_type = type.id();
340-
return Status::OK();
341-
}
342-
}
343-
344101
// Manages writing nested parquet columns with support for all nested types
345102
// supported by parquet.
346103
class ArrowColumnWriterV2 {
@@ -488,104 +245,6 @@ class ArrowColumnWriterV2 {
488245
RowGroupWriter* row_group_writer_;
489246
};
490247

491-
class ArrowColumnWriter {
492-
public:
493-
ArrowColumnWriter(ArrowWriteContext* ctx, ColumnWriter* column_writer,
494-
const SchemaField* schema_field,
495-
const SchemaManifest* schema_manifest)
496-
: ctx_(ctx),
497-
writer_(column_writer),
498-
schema_field_(schema_field),
499-
schema_manifest_(schema_manifest) {}
500-
501-
Status Write(const Array& data) {
502-
if (data.length() == 0) {
503-
// Write nothing when length is 0
504-
return Status::OK();
505-
}
506-
507-
::arrow::Type::type values_type;
508-
RETURN_NOT_OK(GetLeafType(*data.type(), &values_type));
509-
510-
std::shared_ptr<Array> _values_array;
511-
int64_t values_offset = 0;
512-
int64_t num_levels = 0;
513-
int64_t num_values = 0;
514-
LevelBuilder level_builder(ctx_->memory_pool, schema_field_, schema_manifest_);
515-
std::shared_ptr<Buffer> def_levels_buffer, rep_levels_buffer;
516-
RETURN_NOT_OK(level_builder.GenerateLevels(
517-
data, &values_offset, &num_values, &num_levels, ctx_->def_levels_buffer,
518-
&def_levels_buffer, &rep_levels_buffer, &_values_array));
519-
const int16_t* def_levels = nullptr;
520-
if (def_levels_buffer) {
521-
def_levels = reinterpret_cast<const int16_t*>(def_levels_buffer->data());
522-
}
523-
const int16_t* rep_levels = nullptr;
524-
if (rep_levels_buffer) {
525-
rep_levels = reinterpret_cast<const int16_t*>(rep_levels_buffer->data());
526-
}
527-
std::shared_ptr<Array> values_array = _values_array->Slice(values_offset, num_values);
528-
return writer_->WriteArrow(def_levels, rep_levels, num_levels, *values_array, ctx_);
529-
}
530-
531-
Status Write(const ChunkedArray& data, int64_t offset, const int64_t size) {
532-
if (data.length() == 0) {
533-
return Status::OK();
534-
}
535-
536-
int64_t absolute_position = 0;
537-
int chunk_index = 0;
538-
int64_t chunk_offset = 0;
539-
while (chunk_index < data.num_chunks() && absolute_position < offset) {
540-
const int64_t chunk_length = data.chunk(chunk_index)->length();
541-
if (absolute_position + chunk_length > offset) {
542-
// Relative offset into the chunk to reach the desired start offset for
543-
// writing
544-
chunk_offset = offset - absolute_position;
545-
break;
546-
} else {
547-
++chunk_index;
548-
absolute_position += chunk_length;
549-
}
550-
}
551-
552-
if (absolute_position >= data.length()) {
553-
return Status::Invalid("Cannot write data at offset past end of chunked array");
554-
}
555-
556-
int64_t values_written = 0;
557-
while (values_written < size) {
558-
const Array& chunk = *data.chunk(chunk_index);
559-
const int64_t available_values = chunk.length() - chunk_offset;
560-
const int64_t chunk_write_size = std::min(size - values_written, available_values);
561-
562-
// The chunk offset here will be 0 except for possibly the first chunk
563-
// because of the advancing logic above
564-
std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
565-
RETURN_NOT_OK(Write(*array_to_write));
566-
567-
if (chunk_write_size == available_values) {
568-
chunk_offset = 0;
569-
++chunk_index;
570-
}
571-
values_written += chunk_write_size;
572-
}
573-
574-
return Status::OK();
575-
}
576-
577-
Status Close() {
578-
PARQUET_CATCH_NOT_OK(writer_->Close());
579-
return Status::OK();
580-
}
581-
582-
private:
583-
ArrowWriteContext* ctx_;
584-
ColumnWriter* writer_;
585-
const SchemaField* schema_field_;
586-
const SchemaManifest* schema_manifest_;
587-
};
588-
589248
} // namespace
590249

591250
// ----------------------------------------------------------------------
@@ -637,19 +296,8 @@ class FileWriterImpl : public FileWriter {
637296

638297
Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
639298
int64_t size) override {
640-
if (arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
641-
ColumnWriter* column_writer;
642-
PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
643-
644-
const SchemaField* schema_field = nullptr;
645-
RETURN_NOT_OK(schema_manifest_.GetColumnField(row_group_writer_->current_column(),
646-
&schema_field));
647-
648-
ArrowColumnWriter arrow_writer(&column_write_context_, column_writer, schema_field,
649-
&schema_manifest_);
650-
RETURN_NOT_OK(arrow_writer.Write(*data, offset, size));
651-
return arrow_writer.Close();
652-
} else if (arrow_properties_->engine_version() == ArrowWriterProperties::V2) {
299+
if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
300+
arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
653301
ARROW_ASSIGN_OR_RAISE(
654302
std::unique_ptr<ArrowColumnWriterV2> writer,
655303
ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,

cpp/src/parquet/arrow/writer.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ ::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
9494
::arrow::io::OutputStream* sink);
9595

9696
/// \brief Write a Table to Parquet.
97-
///
98-
/// The table shall only consist of columns of primitive type or of primitive lists.
9997
::arrow::Status PARQUET_EXPORT
10098
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
10199
std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,

python/pyarrow/_dataset.pyx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,8 +1096,7 @@ cdef class ParquetFileFormat(FileFormat):
10961096
coerce_timestamps=write_options.get("coerce_timestamps", None),
10971097
allow_truncated_timestamps=write_options.get(
10981098
"allow_truncated_timestamps", False),
1099-
writer_engine_version=os.environ.get(
1100-
"ARROW_PARQUET_WRITER_ENGINE", "V2")
1099+
writer_engine_version="V2"
11011100
)
11021101
wrapped.get().writer_properties = properties
11031102
wrapped.get().arrow_writer_properties = arrow_properties

0 commit comments

Comments
 (0)