forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuilder_binary.h
More file actions
587 lines (486 loc) · 19.3 KB
/
Copy pathbuilder_binary.h
File metadata and controls
587 lines (486 loc) · 19.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/buffer-builder.h"
#include "arrow/status.h"
#include "arrow/type_traits.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h" // IWYU pragma: export
namespace arrow {
// Byte limit equal to the largest int32_t value minus one.
// NOTE(review): not referenced anywhere in this header; presumably used by
// code elsewhere as the data-size cap for 32-bit-offset binary arrays — confirm.
constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
// ----------------------------------------------------------------------
// Binary and String
template <typename TYPE>
class BaseBinaryBuilder : public ArrayBuilder {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
: ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {}
Status Append(const uint8_t* value, offset_type length) {
ARROW_RETURN_NOT_OK(Reserve(1));
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
UnsafeAppendToBitmap(true);
return Status::OK();
}
Status Append(const char* value, offset_type length) {
return Append(reinterpret_cast<const uint8_t*>(value), length);
}
Status Append(util::string_view value) {
return Append(value.data(), static_cast<offset_type>(value.size()));
}
Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
UnsafeAppendToBitmap(length, false);
return Status::OK();
}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(AppendNextOffset());
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(false);
return Status::OK();
}
/// \brief Append without checking capacity
///
/// Offsets and data should have been presized using Reserve() and
/// ReserveData(), respectively.
void UnsafeAppend(const uint8_t* value, offset_type length) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(value, length);
UnsafeAppendToBitmap(true);
}
void UnsafeAppend(const char* value, offset_type length) {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
}
void UnsafeAppend(const std::string& value) {
UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
}
void UnsafeAppend(util::string_view value) {
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}
void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}
/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const std::vector<std::string>& values,
const uint8_t* valid_bytes = NULLPTR) {
std::size_t total_length = std::accumulate(
values.begin(), values.end(), 0ULL,
[](uint64_t sum, const std::string& str) { return sum + str.size(); });
ARROW_RETURN_NOT_OK(Reserve(values.size()));
ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
if (valid_bytes != NULLPTR) {
for (std::size_t i = 0; i < values.size(); ++i) {
UnsafeAppendNextOffset();
if (valid_bytes[i]) {
value_data_builder_.UnsafeAppend(
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
}
}
} else {
for (std::size_t i = 0; i < values.size(); ++i) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(
reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
}
}
UnsafeAppendToBitmap(valid_bytes, values.size());
return Status::OK();
}
/// \brief Append a sequence of nul-terminated strings in one shot.
/// If one of the values is NULL, it is processed as a null
/// value even if the corresponding valid_bytes entry is 1.
///
/// \param[in] values a contiguous C array of nul-terminated char *
/// \param[in] length the number of values to append
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
/// indicates a valid (non-null) value
/// \return Status
Status AppendValues(const char** values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
std::size_t total_length = 0;
std::vector<std::size_t> value_lengths(length);
bool have_null_value = false;
for (int64_t i = 0; i < length; ++i) {
if (values[i] != NULLPTR) {
auto value_length = strlen(values[i]);
value_lengths[i] = value_length;
total_length += value_length;
} else {
have_null_value = true;
}
}
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(ReserveData(total_length));
if (valid_bytes) {
int64_t valid_bytes_offset = 0;
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (valid_bytes[i]) {
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
} else {
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
i - valid_bytes_offset);
UnsafeAppendToBitmap(false);
valid_bytes_offset = i + 1;
}
}
}
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
} else {
if (have_null_value) {
std::vector<uint8_t> valid_vector(length, 0);
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
if (values[i]) {
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
valid_vector[i] = 1;
}
}
UnsafeAppendToBitmap(valid_vector.data(), length);
} else {
for (int64_t i = 0; i < length; ++i) {
UnsafeAppendNextOffset();
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
value_lengths[i]);
}
UnsafeAppendToBitmap(NULLPTR, length);
}
}
return Status::OK();
}
void Reset() override {
ArrayBuilder::Reset();
offsets_builder_.Reset();
value_data_builder_.Reset();
}
Status Resize(int64_t capacity) override {
// XXX Why is this check necessary? There is no reason to disallow, say,
// binary arrays with more than 2**31 empty or null values.
if (capacity > memory_limit()) {
return Status::CapacityError("BinaryBuilder cannot reserve space for more than ",
memory_limit(), " child elements, got ", capacity);
}
ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
// One more than requested for offsets
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements) {
const int64_t size = value_data_length() + elements;
ARROW_RETURN_IF(size > memory_limit(),
Status::CapacityError("Cannot reserve capacity larger than ",
memory_limit(), " bytes"));
return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements)
: Status::OK();
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
// Write final offset (values length)
ARROW_RETURN_NOT_OK(AppendNextOffset());
// These buffers' padding zeroed by BufferBuilder
std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
*out = ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data},
null_count_, 0);
Reset();
return Status::OK();
}
/// \return data pointer of the value date builder
const uint8_t* value_data() const { return value_data_builder_.data(); }
/// \return size of values buffer so far
int64_t value_data_length() const { return value_data_builder_.length(); }
/// \return capacity of values buffer
int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
/// \return data pointer of the value date builder
const offset_type* offsets_data() const { return offsets_builder_.data(); }
/// Temporary access to a value.
///
/// This pointer becomes invalid on the next modifying operation.
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
const offset_type* offsets = offsets_builder_.data();
const auto offset = offsets[i];
if (i == (length_ - 1)) {
*out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
} else {
*out_length = offsets[i + 1] - offset;
}
return value_data_builder_.data() + offset;
}
offset_type offset(int64_t i) const { return offsets_data()[i]; }
/// Temporary access to a value.
///
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const {
offset_type value_length;
const uint8_t* value_data = GetValue(i, &value_length);
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
}
// Cannot make this a static attribute because of linking issues
static constexpr int64_t memory_limit() {
return std::numeric_limits<offset_type>::max() - 1;
}
protected:
TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
Status AppendOverflow(int64_t num_bytes) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", num_bytes);
}
Status AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
}
void UnsafeAppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
};
/// \class BinaryBuilder
/// \brief Builder for arrays of variable-length binary values
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
 public:
  /// \brief Construct a builder that uses the given memory pool
  explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array being built and return it as a BinaryArray
  Status Finish(std::shared_ptr<BinaryArray>* out) {
    return FinishTyped(out);
  }

 protected:
  // Inherit the base-class constructors; this declaration sits in the
  // protected section.
  using BaseBinaryBuilder::BaseBinaryBuilder;
};
/// \class StringBuilder
/// \brief Builder for arrays of UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 public:
  /// \brief Construct a builder that uses the given memory pool
  explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  // Also expose the constructors inherited from BinaryBuilder
  using BinaryBuilder::BinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array being built and return it as a StringArray
  Status Finish(std::shared_ptr<StringArray>* out) {
    return FinishTyped(out);
  }
};
/// \class LargeBinaryBuilder
/// \brief Builder for arrays of large variable-length binary values
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
 public:
  /// \brief Construct a builder that uses the given memory pool
  explicit LargeBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array being built and return it as a LargeBinaryArray
  Status Finish(std::shared_ptr<LargeBinaryArray>* out) {
    return FinishTyped(out);
  }

 protected:
  // Inherit the base-class constructors; this declaration sits in the
  // protected section.
  using BaseBinaryBuilder::BaseBinaryBuilder;
};
/// \class LargeStringBuilder
/// \brief Builder for arrays of large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
 public:
  /// \brief Construct a builder that uses the given memory pool
  explicit LargeStringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  // Also expose the constructors inherited from LargeBinaryBuilder
  using LargeBinaryBuilder::LargeBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array being built and return it as a LargeStringArray
  Status Finish(std::shared_ptr<LargeStringArray>* out) {
    return FinishTyped(out);
  }
};
// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder
/// \brief Builder for arrays whose values all have the same byte width
///
/// Every slot, null or not, occupies byte_width() bytes in the value buffer.
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                         MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  /// \brief Append one value, copying byte_width() bytes starting at `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append one value from a view
  ///
  /// The view's size is only validated in builds without NDEBUG (see
  /// UnsafeAppend(util::string_view)).
  Status Append(const util::string_view& view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        util::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  /// \brief Append one value without checking capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // Zero-width values contribute no bytes; skip the copy entirely
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  /// \brief Append one value from a view without checking capacity
  void UnsafeAppend(util::string_view value) {
#ifndef NDEBUG
    // Cast to the parameter type CheckValueSize() actually declares
    CheckValueSize(static_cast<int64_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  /// \brief Append a null without checking capacity
  ///
  /// The slot is zero-filled before the write position moves past it, so null
  /// slots have deterministic contents instead of whatever bytes were in the
  /// reserved memory.
  /// NOTE(review): assumes BufferBuilder::UnsafeAdvance() only moves the write
  /// position and does not itself zero the bytes — confirm.
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      std::memset(byte_builder_.mutable_data() + byte_builder_.length(), 0,
                  static_cast<std::size_t>(byte_width_));
    }
    byte_builder_.UnsafeAdvance(byte_width_);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the array being built and return it as a FixedSizeBinaryArray
  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the fixed width, in bytes, of every value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  util::string_view GetView(int64_t i) const;

  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

#ifndef NDEBUG
  void CheckValueSize(int64_t size);
#endif
};
// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (to the upper limit of 2GB)
namespace internal {
/// \brief Builds a sequence of binary array chunks, starting a new chunk
/// whenever a per-chunk limit on value bytes or element count is reached
class ARROW_EXPORT ChunkedBinaryBuilder {
 public:
  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
                                MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
                       MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

  virtual ~ChunkedBinaryBuilder() = default;

  /// \brief Append one value, moving on to a new chunk when a limit is hit
  ///
  /// \param[in] value pointer to the bytes to append
  /// \param[in] length number of bytes in value
  /// \return Status
  Status Append(const uint8_t* value, int32_t length) {
    if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
                            max_chunk_value_length_)) {
      if (builder_->value_data_length() == 0) {
        // The current item is larger than max_chunk_value_length_;
        // this chunk will be oversize and hold *only* this item
        ARROW_RETURN_NOT_OK(builder_->Append(value, length));
        return NextChunk();
      }
      // The current item would cause builder_->value_data_length() to exceed
      // max_chunk_value_length_, so finish this chunk and append the current
      // item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
      return Append(value, length);
    }

    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      // The current item would cause builder_->length() to exceed max_chunk_length_, so
      // finish this chunk and append the current item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
    }

    return builder_->Append(value, length);
  }

  /// \brief Append one value given as a string view
  ///
  /// A view whose size does not fit in int32_t is rejected with a
  /// CapacityError rather than having its length silently narrowed.
  Status Append(const util::string_view& value) {
    constexpr int32_t kMaxValueLength = std::numeric_limits<int32_t>::max();
    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(value.size()) >
                            static_cast<uint64_t>(kMaxValueLength))) {
      return Status::CapacityError("ChunkedBinaryBuilder cannot append a value of ",
                                   value.size(), " bytes, maximum is ",
                                   kMaxValueLength);
    }
    return Append(reinterpret_cast<const uint8_t*>(value.data()),
                  static_cast<int32_t>(value.size()));
  }

  /// \brief Append a null, starting a new chunk first if the current one
  /// already holds max_chunk_length_ elements
  Status AppendNull() {
    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->AppendNull();
  }

  Status Reserve(int64_t values);

  virtual Status Finish(ArrayVector* out);

 protected:
  Status NextChunk();

  // maximum total character data size per chunk
  int64_t max_chunk_value_length_;

  // maximum elements allowed per chunk
  int64_t max_chunk_length_ = kListMaximumElements;

  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
  // add to extra_capacity_ instead and wait to reserve until the next chunk
  int64_t extra_capacity_ = 0;

  std::unique_ptr<BinaryBuilder> builder_;
  std::vector<std::shared_ptr<Array>> chunks_;
};
/// \brief Variant of ChunkedBinaryBuilder that overrides Finish()
///
/// NOTE(review): presumably yields StringArray chunks rather than BinaryArray
/// chunks; Finish() is defined outside this header — confirm.
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
public:
// Reuse the ChunkedBinaryBuilder constructors unchanged
using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
// Overrides the virtual ChunkedBinaryBuilder::Finish
Status Finish(ArrayVector* out) override;
};
} // namespace internal
} // namespace arrow