Skip to content

Commit 096b877

Browse files
wesmxhochy
authored andcommitted
ARROW-1601: [C++] Do not read extra byte from validity bitmap, add internal::BitmapReader in lieu of macros
@xhochy since this is causing the crash reported in ARROW-1601 we may want to do a patch release 0.7.1 and parquet-cpp 1.3.1 Author: Wes McKinney <wes.mckinney@twosigma.com> Closes apache#1126 from wesm/ARROW-1601 and squashes the following commits: 6cec81c [Wes McKinney] Fix RleDecoder logic with BitmapReader ba58b8a [Wes McKinney] Fix test name fa47865 [Wes McKinney] Add BitmapReader class to replace the bitset macros
1 parent b41a4ee commit 096b877

5 files changed

Lines changed: 98 additions & 28 deletions

File tree

cpp/build-support/run_clang_format.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/python
1+
#!/usr/bin/env python
22
# Licensed to the Apache Software Foundation (ASF) under one
33
# or more contributor license agreements. See the NOTICE file
44
# distributed with this work for additional information
@@ -58,8 +58,9 @@
5858
# fi
5959

6060
try:
61-
subprocess.check_output([CLANG_FORMAT, '-i'] + files_to_format,
62-
stderr=subprocess.STDOUT)
61+
cmd = [CLANG_FORMAT, '-i'] + files_to_format
62+
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
6363
except Exception as e:
6464
print(e)
65+
print(' '.join(cmd))
6566
raise

cpp/src/arrow/compute/cast.cc

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -261,8 +261,9 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
261261
const FixedSizeBinaryArray& dictionary,
262262
ArrayData* output) {
263263
using index_c_type = typename IndexType::c_type;
264-
const uint8_t* valid_bits = indices.null_bitmap_data();
265-
INIT_BITSET(valid_bits, indices.offset());
264+
265+
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
266+
indices.length());
266267

267268
const index_c_type* in =
268269
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
@@ -271,11 +272,11 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
271272
int32_t byte_width =
272273
static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
273274
for (int64_t i = 0; i < indices.length(); ++i) {
274-
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
275+
if (valid_bits_reader.IsSet()) {
275276
const uint8_t* value = dictionary.Value(in[i]);
276277
memcpy(out + i * byte_width, value, byte_width);
277278
}
278-
READ_NEXT_BITSET(valid_bits);
279+
valid_bits_reader.Next();
279280
}
280281
}
281282

@@ -293,8 +294,7 @@ struct CastFunctor<
293294

294295
// Check if values and output type match
295296
DCHECK(values_type.Equals(*output->type))
296-
<< "Dictionary type: " << values_type
297-
<< " target type: " << (*output->type);
297+
<< "Dictionary type: " << values_type << " target type: " << (*output->type);
298298

299299
const Array& indices = *dict_array.indices();
300300
switch (indices.type()->id()) {
@@ -327,21 +327,21 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices,
327327
RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder));
328328
BinaryBuilder* binary_builder = static_cast<BinaryBuilder*>(builder.get());
329329

330-
const uint8_t* valid_bits = indices.null_bitmap_data();
331-
INIT_BITSET(valid_bits, indices.offset());
330+
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
331+
indices.length());
332332

333333
const index_c_type* in =
334334
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
335335
indices.offset();
336336
for (int64_t i = 0; i < indices.length(); ++i) {
337-
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
337+
if (valid_bits_reader.IsSet()) {
338338
int32_t length;
339339
const uint8_t* value = dictionary.GetValue(in[i], &length);
340340
RETURN_NOT_OK(binary_builder->Append(value, length));
341341
} else {
342342
RETURN_NOT_OK(binary_builder->AppendNull());
343343
}
344-
READ_NEXT_BITSET(valid_bits);
344+
valid_bits_reader.Next();
345345
}
346346

347347
std::shared_ptr<Array> plain_array;
@@ -366,8 +366,7 @@ struct CastFunctor<T, DictionaryType,
366366

367367
// Check if values and output type match
368368
DCHECK(values_type.Equals(*output->type))
369-
<< "Dictionary type: " << values_type
370-
<< " target type: " << (*output->type);
369+
<< "Dictionary type: " << values_type << " target type: " << (*output->type);
371370

372371
const Array& indices = *dict_array.indices();
373372
switch (indices.type()->id()) {
@@ -401,17 +400,17 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
401400
c_type* out) {
402401
using index_c_type = typename IndexType::c_type;
403402

404-
const uint8_t* valid_bits = indices.null_bitmap_data();
405-
INIT_BITSET(valid_bits, indices.offset());
403+
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
404+
indices.length());
406405

407406
const index_c_type* in =
408407
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
409408
indices.offset();
410409
for (int64_t i = 0; i < indices.length(); ++i) {
411-
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
410+
if (valid_bits_reader.IsSet()) {
412411
out[i] = dictionary[in[i]];
413412
}
414-
READ_NEXT_BITSET(valid_bits);
413+
valid_bits_reader.Next();
415414
}
416415
}
417416

@@ -429,8 +428,7 @@ struct CastFunctor<T, DictionaryType,
429428

430429
// Check if values and output type match
431430
DCHECK(values_type.Equals(*output->type))
432-
<< "Dictionary type: " << values_type
433-
<< " target type: " << (*output->type);
431+
<< "Dictionary type: " << values_type << " target type: " << (*output->type);
434432

435433
auto dictionary =
436434
reinterpret_cast<const c_type*>(type.dictionary()->data()->buffers[1]->data()) +

cpp/src/arrow/util/bit-util-test.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,27 @@ TEST(BitUtilTests, TestNextPower2) {
7272
ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1));
7373
}
7474

75+
TEST(BitmapReader, DoesNotReadOutOfBounds) {
76+
uint8_t bitmap[16] = {0};
77+
78+
const int length = 128;
79+
80+
internal::BitmapReader r1(bitmap, 0, length);
81+
82+
// If this were to read out of bounds, valgrind would tell us
83+
for (int i = 0; i < length; ++i) {
84+
ASSERT_TRUE(r1.IsNotSet());
85+
r1.Next();
86+
}
87+
88+
internal::BitmapReader r2(bitmap, 5, length - 5);
89+
90+
for (int i = 0; i < (length - 5); ++i) {
91+
ASSERT_TRUE(r2.IsNotSet());
92+
r2.Next();
93+
}
94+
}
95+
7596
static inline int64_t SlowCountBits(const uint8_t* data, int64_t bit_offset,
7697
int64_t length) {
7798
int64_t count = 0;

cpp/src/arrow/util/bit-util.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,53 @@
4848
#endif
4949

5050
namespace arrow {
51+
namespace internal {
52+
53+
class BitmapReader {
54+
public:
55+
BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
56+
: bitmap_(bitmap), position_(0), length_(length) {
57+
byte_offset_ = start_offset / 8;
58+
bit_offset_ = start_offset % 8;
59+
current_byte_ = bitmap[byte_offset_];
60+
}
61+
62+
#if defined(_MSC_VER)
63+
// MSVC is finicky about this cast
64+
bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
65+
#else
66+
bool IsSet() const { return current_byte_ & (1 << bit_offset_); }
67+
#endif
68+
69+
bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
70+
71+
void Next() {
72+
++bit_offset_;
73+
++position_;
74+
if (bit_offset_ == 8) {
75+
bit_offset_ = 0;
76+
++byte_offset_;
77+
if (ARROW_PREDICT_TRUE(position_ < length_)) {
78+
current_byte_ = bitmap_[byte_offset_];
79+
}
80+
}
81+
}
82+
83+
private:
84+
const uint8_t* bitmap_;
85+
int64_t position_;
86+
int64_t length_;
87+
88+
uint8_t current_byte_;
89+
int64_t byte_offset_;
90+
int64_t bit_offset_;
91+
};
92+
93+
} // namespace internal
94+
95+
#ifndef ARROW_NO_DEPRECATED_API
96+
97+
// \deprecated Since > 0.7.0
5198

5299
#define INIT_BITSET(valid_bits_vector, valid_bits_index) \
53100
int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
@@ -62,6 +109,8 @@ namespace arrow {
62109
bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \
63110
}
64111

112+
#endif
113+
65114
// TODO(wesm): The source from Impala was depending on boost::make_unsigned
66115
//
67116
// We add a partial stub implementation here

cpp/src/arrow/util/rle-encoding.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -352,11 +352,12 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
352352
DCHECK_GE(bit_width_, 0);
353353
int values_read = 0;
354354
int remaining_nulls = null_count;
355-
INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));
355+
356+
internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size);
356357

357358
while (values_read < batch_size) {
358-
bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0;
359-
READ_NEXT_BITSET(valid_bits);
359+
bool is_valid = bit_reader.IsSet();
360+
bit_reader.Next();
360361

361362
if (is_valid) {
362363
if ((repeat_count_ == 0) && (literal_count_ == 0)) {
@@ -369,14 +370,14 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
369370
repeat_count_--;
370371

371372
while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
372-
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
373+
if (bit_reader.IsSet()) {
373374
repeat_count_--;
374375
} else {
375376
remaining_nulls--;
376377
}
377378
repeat_batch++;
378379

379-
READ_NEXT_BITSET(valid_bits);
380+
bit_reader.Next();
380381
}
381382
std::fill(values + values_read, values + values_read + repeat_batch, value);
382383
values_read += repeat_batch;
@@ -397,15 +398,15 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
397398

398399
// Read the first bitset to the end
399400
while (literals_read < literal_batch) {
400-
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
401+
if (bit_reader.IsSet()) {
401402
values[values_read + literals_read + skipped] =
402403
dictionary[indices[literals_read]];
403404
literals_read++;
404405
} else {
405406
skipped++;
406407
}
407408

408-
READ_NEXT_BITSET(valid_bits);
409+
bit_reader.Next();
409410
}
410411
literal_count_ -= literal_batch;
411412
values_read += literal_batch + skipped;

0 commit comments

Comments
 (0)