@@ -98,249 +98,6 @@ bool HasNullableRoot(const SchemaManifest& schema_manifest,
9898 return nullable;
9999}
100100
101- class LevelBuilder {
102- public:
103- explicit LevelBuilder (MemoryPool* pool, const SchemaField* schema_field,
104- const SchemaManifest* schema_manifest)
105- : def_levels_(pool),
106- rep_levels_(pool),
107- schema_field_(schema_field),
108- schema_manifest_(schema_manifest) {}
109-
110- Status VisitInline (const Array& array);
111-
112- template <typename T>
113- ::arrow::enable_if_t <std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit (
114- const T& array) {
115- array_offsets_.push_back (static_cast <int32_t >(array.offset ()));
116- valid_bitmaps_.push_back (array.null_bitmap_data ());
117- null_counts_.push_back (array.null_count ());
118- values_array_ = std::make_shared<T>(array.data ());
119- return Status::OK ();
120- }
121-
122- Status Visit (const DictionaryArray& array) {
123- // Only currently handle DictionaryArray where the dictionary is a
124- // primitive type
125- if (array.dict_type ()->value_type ()->num_fields () > 0 ) {
126- return Status::NotImplemented (
127- " Writing DictionaryArray with nested dictionary "
128- " type not yet supported" );
129- }
130- array_offsets_.push_back (static_cast <int32_t >(array.offset ()));
131- valid_bitmaps_.push_back (array.null_bitmap_data ());
132- null_counts_.push_back (array.null_count ());
133- values_array_ = std::make_shared<DictionaryArray>(array.data ());
134- return Status::OK ();
135- }
136-
137- Status Visit (const ListArray& array) {
138- array_offsets_.push_back (static_cast <int32_t >(array.offset ()));
139- valid_bitmaps_.push_back (array.null_bitmap_data ());
140- null_counts_.push_back (array.null_count ());
141- offsets_.push_back (array.raw_value_offsets ());
142-
143- // Min offset isn't always zero in the case of sliced Arrays.
144- min_offset_idx_ = array.value_offset (min_offset_idx_);
145- max_offset_idx_ = array.value_offset (max_offset_idx_);
146-
147- return VisitInline (*array.values ());
148- }
149-
150- Status Visit (const ExtensionArray& array) { return VisitInline (*array.storage ()); }
151-
152- #define NOT_IMPLEMENTED_VISIT (ArrowTypePrefix ) \
153- Status Visit (const ::arrow::ArrowTypePrefix##Array& array) { \
154- return Status::NotImplemented (" Level generation for " #ArrowTypePrefix \
155- " not supported yet" ); \
156- }
157-
158- // See ARROW-1644
159- NOT_IMPLEMENTED_VISIT (LargeList)
160- NOT_IMPLEMENTED_VISIT (Map)
161- NOT_IMPLEMENTED_VISIT (FixedSizeList)
162- NOT_IMPLEMENTED_VISIT (Struct)
163- NOT_IMPLEMENTED_VISIT (Union)
164-
165- #undef NOT_IMPLEMENTED_VISIT
166-
167- Status ExtractNullability () {
168- // Walk upwards to extract nullability
169- const SchemaField* current_field = schema_field_;
170- while (current_field != nullptr ) {
171- nullable_.push_front (current_field->field ->nullable ());
172- if (current_field->field ->type ()->num_fields () > 1 ) {
173- return Status::NotImplemented (
174- " Fields with more than one child are not supported." );
175- } else {
176- current_field = schema_manifest_->GetParent (current_field);
177- }
178- }
179- return Status::OK ();
180- }
181-
182- Status GenerateLevels (const Array& array, int64_t * values_offset, int64_t * num_values,
183- int64_t * num_levels,
184- const std::shared_ptr<ResizableBuffer>& def_levels_scratch,
185- std::shared_ptr<Buffer>* def_levels_out,
186- std::shared_ptr<Buffer>* rep_levels_out,
187- std::shared_ptr<Array>* values_array) {
188- // Work downwards to extract bitmaps and offsets
189- min_offset_idx_ = 0 ;
190- max_offset_idx_ = array.length ();
191- RETURN_NOT_OK (VisitInline (array));
192- *num_values = max_offset_idx_ - min_offset_idx_;
193- *values_offset = min_offset_idx_;
194- *values_array = values_array_;
195-
196- RETURN_NOT_OK (ExtractNullability ());
197-
198- // Generate the levels.
199- if (nullable_.size () == 1 ) {
200- // We have a PrimitiveArray
201- *rep_levels_out = nullptr ;
202- if (nullable_[0 ]) {
203- RETURN_NOT_OK (
204- def_levels_scratch->Resize (array.length () * sizeof (int16_t ), false ));
205- auto def_levels_ptr =
206- reinterpret_cast <int16_t *>(def_levels_scratch->mutable_data ());
207- if (array.null_count () == 0 ) {
208- std::fill (def_levels_ptr, def_levels_ptr + array.length (), 1 );
209- } else if (array.null_count () == array.length ()) {
210- std::fill (def_levels_ptr, def_levels_ptr + array.length (), 0 );
211- } else {
212- ::arrow::internal::BitmapReader valid_bits_reader (
213- array.null_bitmap_data (), array.offset (), array.length ());
214- for (int i = 0 ; i < array.length (); i++) {
215- def_levels_ptr[i] = valid_bits_reader.IsSet () ? 1 : 0 ;
216- valid_bits_reader.Next ();
217- }
218- }
219-
220- *def_levels_out = def_levels_scratch;
221- } else {
222- *def_levels_out = nullptr ;
223- }
224- *num_levels = array.length ();
225- } else {
226- // Note it is hard to estimate memory consumption due to zero length
227- // arrays otherwise we would preallocate. An upper boun on memory
228- // is the sum of the length of each list array + number of elements
229- // but this might be too loose of an upper bound so we choose to use
230- // safe methods.
231- RETURN_NOT_OK (rep_levels_.Append (0 ));
232- RETURN_NOT_OK (HandleListEntries (0 , 0 , 0 , array.length ()));
233-
234- RETURN_NOT_OK (def_levels_.Finish (def_levels_out));
235- RETURN_NOT_OK (rep_levels_.Finish (rep_levels_out));
236- *num_levels = (*rep_levels_out)->size () / sizeof (int16_t );
237- }
238-
239- return Status::OK ();
240- }
241-
242- Status HandleList (int16_t def_level, int16_t rep_level, int64_t index) {
243- if (nullable_[rep_level]) {
244- if (null_counts_[rep_level] == 0 ||
245- BitUtil::GetBit (valid_bitmaps_[rep_level], index + array_offsets_[rep_level])) {
246- return HandleNonNullList (static_cast <int16_t >(def_level + 1 ), rep_level, index);
247- } else {
248- return def_levels_.Append (def_level);
249- }
250- } else {
251- return HandleNonNullList (def_level, rep_level, index);
252- }
253- }
254-
255- Status HandleNonNullList (int16_t def_level, int16_t rep_level, int64_t index) {
256- const int32_t inner_offset = offsets_[rep_level][index];
257- const int32_t inner_length = offsets_[rep_level][index + 1 ] - inner_offset;
258- const int64_t recursion_level = rep_level + 1 ;
259- if (inner_length == 0 ) {
260- return def_levels_.Append (def_level);
261- }
262- if (recursion_level < static_cast <int64_t >(offsets_.size ())) {
263- return HandleListEntries (static_cast <int16_t >(def_level + 1 ),
264- static_cast <int16_t >(rep_level + 1 ), inner_offset,
265- inner_length);
266- }
267- // We have reached the leaf: primitive list, handle remaining nullables
268- const bool nullable_level = nullable_[recursion_level];
269- const int64_t level_null_count = null_counts_[recursion_level];
270- const uint8_t * level_valid_bitmap = valid_bitmaps_[recursion_level];
271-
272- if (inner_length >= 1 ) {
273- RETURN_NOT_OK (
274- rep_levels_.Append (inner_length - 1 , static_cast <int16_t >(rep_level + 1 )));
275- }
276-
277- // Special case: this is a null array (all elements are null)
278- if (level_null_count && level_valid_bitmap == nullptr ) {
279- return def_levels_.Append (inner_length, static_cast <int16_t >(def_level + 1 ));
280- }
281- for (int64_t i = 0 ; i < inner_length; i++) {
282- if (nullable_level &&
283- ((level_null_count == 0 ) ||
284- BitUtil::GetBit (level_valid_bitmap,
285- inner_offset + i + array_offsets_[recursion_level]))) {
286- // Non-null element in a null level
287- RETURN_NOT_OK (def_levels_.Append (static_cast <int16_t >(def_level + 2 )));
288- } else {
289- // This can be produced in two cases:
290- // * elements are nullable and this one is null
291- // (i.e. max_def_level = def_level + 2)
292- // * elements are non-nullable (i.e. max_def_level = def_level + 1)
293- RETURN_NOT_OK (def_levels_.Append (static_cast <int16_t >(def_level + 1 )));
294- }
295- }
296- return Status::OK ();
297- }
298-
299- Status HandleListEntries (int16_t def_level, int16_t rep_level, int64_t offset,
300- int64_t length) {
301- for (int64_t i = 0 ; i < length; i++) {
302- if (i > 0 ) {
303- RETURN_NOT_OK (rep_levels_.Append (rep_level));
304- }
305- RETURN_NOT_OK (HandleList (def_level, rep_level, offset + i));
306- }
307- return Status::OK ();
308- }
309-
310- private:
311- Int16BufferBuilder def_levels_;
312- Int16BufferBuilder rep_levels_;
313-
314- const SchemaField* schema_field_;
315- const SchemaManifest* schema_manifest_;
316-
317- std::vector<int64_t > null_counts_;
318- std::vector<const uint8_t *> valid_bitmaps_;
319- std::vector<const int32_t *> offsets_;
320- std::vector<int32_t > array_offsets_;
321- std::deque<bool > nullable_;
322-
323- int64_t min_offset_idx_;
324- int64_t max_offset_idx_;
325- std::shared_ptr<Array> values_array_;
326- };
327-
328- Status LevelBuilder::VisitInline (const Array& array) {
329- return VisitArrayInline (array, this );
330- }
331-
332- Status GetLeafType (const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) {
333- if (type.id () == ::arrow::Type::LIST || type.id () == ::arrow::Type::STRUCT ) {
334- if (type.num_fields () != 1 ) {
335- return Status::Invalid (" Nested column branch had multiple children: " , type);
336- }
337- return GetLeafType (*type.field (0 )->type (), leaf_type);
338- } else {
339- *leaf_type = type.id ();
340- return Status::OK ();
341- }
342- }
343-
344101// Manages writing nested parquet columns with support for all nested types
345102// supported by parquet.
346103class ArrowColumnWriterV2 {
@@ -488,104 +245,6 @@ class ArrowColumnWriterV2 {
488245 RowGroupWriter* row_group_writer_;
489246};
490247
491- class ArrowColumnWriter {
492- public:
493- ArrowColumnWriter (ArrowWriteContext* ctx, ColumnWriter* column_writer,
494- const SchemaField* schema_field,
495- const SchemaManifest* schema_manifest)
496- : ctx_(ctx),
497- writer_ (column_writer),
498- schema_field_(schema_field),
499- schema_manifest_(schema_manifest) {}
500-
501- Status Write (const Array& data) {
502- if (data.length () == 0 ) {
503- // Write nothing when length is 0
504- return Status::OK ();
505- }
506-
507- ::arrow::Type::type values_type;
508- RETURN_NOT_OK (GetLeafType (*data.type (), &values_type));
509-
510- std::shared_ptr<Array> _values_array;
511- int64_t values_offset = 0 ;
512- int64_t num_levels = 0 ;
513- int64_t num_values = 0 ;
514- LevelBuilder level_builder (ctx_->memory_pool , schema_field_, schema_manifest_);
515- std::shared_ptr<Buffer> def_levels_buffer, rep_levels_buffer;
516- RETURN_NOT_OK (level_builder.GenerateLevels (
517- data, &values_offset, &num_values, &num_levels, ctx_->def_levels_buffer ,
518- &def_levels_buffer, &rep_levels_buffer, &_values_array));
519- const int16_t * def_levels = nullptr ;
520- if (def_levels_buffer) {
521- def_levels = reinterpret_cast <const int16_t *>(def_levels_buffer->data ());
522- }
523- const int16_t * rep_levels = nullptr ;
524- if (rep_levels_buffer) {
525- rep_levels = reinterpret_cast <const int16_t *>(rep_levels_buffer->data ());
526- }
527- std::shared_ptr<Array> values_array = _values_array->Slice (values_offset, num_values);
528- return writer_->WriteArrow (def_levels, rep_levels, num_levels, *values_array, ctx_);
529- }
530-
531- Status Write (const ChunkedArray& data, int64_t offset, const int64_t size) {
532- if (data.length () == 0 ) {
533- return Status::OK ();
534- }
535-
536- int64_t absolute_position = 0 ;
537- int chunk_index = 0 ;
538- int64_t chunk_offset = 0 ;
539- while (chunk_index < data.num_chunks () && absolute_position < offset) {
540- const int64_t chunk_length = data.chunk (chunk_index)->length ();
541- if (absolute_position + chunk_length > offset) {
542- // Relative offset into the chunk to reach the desired start offset for
543- // writing
544- chunk_offset = offset - absolute_position;
545- break ;
546- } else {
547- ++chunk_index;
548- absolute_position += chunk_length;
549- }
550- }
551-
552- if (absolute_position >= data.length ()) {
553- return Status::Invalid (" Cannot write data at offset past end of chunked array" );
554- }
555-
556- int64_t values_written = 0 ;
557- while (values_written < size) {
558- const Array& chunk = *data.chunk (chunk_index);
559- const int64_t available_values = chunk.length () - chunk_offset;
560- const int64_t chunk_write_size = std::min (size - values_written, available_values);
561-
562- // The chunk offset here will be 0 except for possibly the first chunk
563- // because of the advancing logic above
564- std::shared_ptr<Array> array_to_write = chunk.Slice (chunk_offset, chunk_write_size);
565- RETURN_NOT_OK (Write (*array_to_write));
566-
567- if (chunk_write_size == available_values) {
568- chunk_offset = 0 ;
569- ++chunk_index;
570- }
571- values_written += chunk_write_size;
572- }
573-
574- return Status::OK ();
575- }
576-
577- Status Close () {
578- PARQUET_CATCH_NOT_OK (writer_->Close ());
579- return Status::OK ();
580- }
581-
582- private:
583- ArrowWriteContext* ctx_;
584- ColumnWriter* writer_;
585- const SchemaField* schema_field_;
586- const SchemaManifest* schema_manifest_;
587- };
588-
589248} // namespace
590249
591250// ----------------------------------------------------------------------
@@ -637,19 +296,8 @@ class FileWriterImpl : public FileWriter {
637296
638297 Status WriteColumnChunk (const std::shared_ptr<ChunkedArray>& data, int64_t offset,
639298 int64_t size) override {
640- if (arrow_properties_->engine_version () == ArrowWriterProperties::V1 ) {
641- ColumnWriter* column_writer;
642- PARQUET_CATCH_NOT_OK (column_writer = row_group_writer_->NextColumn ());
643-
644- const SchemaField* schema_field = nullptr ;
645- RETURN_NOT_OK (schema_manifest_.GetColumnField (row_group_writer_->current_column (),
646- &schema_field));
647-
648- ArrowColumnWriter arrow_writer (&column_write_context_, column_writer, schema_field,
649- &schema_manifest_);
650- RETURN_NOT_OK (arrow_writer.Write (*data, offset, size));
651- return arrow_writer.Close ();
652- } else if (arrow_properties_->engine_version () == ArrowWriterProperties::V2 ) {
299+ if (arrow_properties_->engine_version () == ArrowWriterProperties::V2 ||
300+ arrow_properties_->engine_version () == ArrowWriterProperties::V1 ) {
653301 ARROW_ASSIGN_OR_RAISE (
654302 std::unique_ptr<ArrowColumnWriterV2> writer,
655303 ArrowColumnWriterV2::Make (*data, offset, size, schema_manifest_,
0 commit comments