Skip to content

Commit ea271b3

Browse files
bkietz authored and pitrou committed
ARROW-1280: [C++] add fixed size list type
- FixedSizeListType, parameterized by the list size (similar to FixedSizeBinaryType), constructed with `fixed_size_list(<value field or type>, list_size)` - FixedSizeListArray, including Equal and pretty print impls - FixedSizeListBuilder, works with MakeBuilder Author: Benjamin Kietzman <bengilgit@gmail.com> Closes apache#4278 from bkietz/1280-Implement-Fixed-Size-List-type and squashes the following commits: 920a251 <Benjamin Kietzman> correct ListArray equality comparison ca02bda <Benjamin Kietzman> add test and missing impl for ipc/json+fixed_size_list 17b55c5 <Benjamin Kietzman> Fix underline in Layout.rst ddb46fd <Benjamin Kietzman> Add pretty-print test for fixed size list 3256051 <Benjamin Kietzman> Add ArrayFromJSON impl for fixed size list 59ce758 <Benjamin Kietzman> fixing some typos/copypasta dae1275 <Benjamin Kietzman> removing unused offset buffer from constructor f9258dd <Benjamin Kietzman> add fixed size list type
1 parent 3939252 commit ea271b3

31 files changed

Lines changed: 984 additions & 16 deletions

cpp/src/arrow/array-list-test.cc

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,14 @@ TEST_F(TestListArray, Equality) {
124124
ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
125125
}
126126

127+
// Two list arrays that differ only in their last list must compare unequal
// when both are sliced so that only that last list remains.
TEST_F(TestListArray, ValuesEquality) {
  const auto list_of_int32 = list(int32());
  const auto lhs = ArrayFromJSON(list_of_int32, "[[1, 2], [3], [0]]");
  const auto rhs = ArrayFromJSON(list_of_int32, "[[1, 2], [3], [100000]]");

  // Skip the two leading lists, which are identical on both sides.
  const auto skip = 2;
  const auto lhs_tail = lhs->Slice(skip);
  const auto rhs_tail = rhs->Slice(skip);
  EXPECT_FALSE(lhs_tail->Equals(rhs_tail));
}
134+
127135
// Empty test body: no resize behaviour is exercised here.
TEST_F(TestListArray, TestResize) {}
128136

129137
TEST_F(TestListArray, TestFromArrays) {
@@ -332,4 +340,210 @@ TEST_F(TestListArray, TestBuilderPreserveFieleName) {
332340
ASSERT_EQ("counts", type.value_field()->name());
333341
}
334342

343+
// ----------------------------------------------------------------------
344+
// FixedSizeList tests
345+
346+
class TestFixedSizeListArray : public TestBuilder {
347+
public:
348+
void SetUp() {
349+
TestBuilder::SetUp();
350+
351+
value_type_ = int32();
352+
type_ = fixed_size_list(value_type_, list_size());
353+
354+
std::unique_ptr<ArrayBuilder> tmp;
355+
ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
356+
builder_.reset(checked_cast<FixedSizeListBuilder*>(tmp.release()));
357+
}
358+
359+
void Done() {
360+
std::shared_ptr<Array> out;
361+
FinishAndCheckPadding(builder_.get(), &out);
362+
result_ = std::dynamic_pointer_cast<FixedSizeListArray>(out);
363+
}
364+
365+
protected:
366+
static constexpr int32_t list_size() { return 2; }
367+
std::shared_ptr<DataType> value_type_;
368+
369+
std::shared_ptr<FixedSizeListBuilder> builder_;
370+
std::shared_ptr<FixedSizeListArray> result_;
371+
};
372+
373+
// Builds two identical arrays and a third, different one from the same
// builder, then checks whole-array and range equality between them.
TEST_F(TestFixedSizeListArray, Equality) {
  auto* int_builder = checked_cast<Int32Builder*>(builder_->value_builder());

  const std::vector<int32_t> matching = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};
  const std::vector<int32_t> differing = {1, 2, 2, 2, 3, 4, 5, 2};
  const auto matching_lists = matching.size() / list_size();
  const auto differing_lists = differing.size() / list_size();

  std::shared_ptr<Array> reference;
  std::shared_ptr<Array> twin;
  std::shared_ptr<Array> other;

  // First array built from the matching values.
  ASSERT_OK(builder_->AppendValues(matching_lists));
  ASSERT_OK(int_builder->AppendValues(matching.data(), matching.size()));
  ASSERT_OK(builder_->Finish(&reference));

  // Second array built from exactly the same values.
  ASSERT_OK(builder_->AppendValues(matching_lists));
  ASSERT_OK(int_builder->AppendValues(matching.data(), matching.size()));
  ASSERT_OK(builder_->Finish(&twin));

  // Third array built from the differing values.
  ASSERT_OK(builder_->AppendValues(differing_lists));
  ASSERT_OK(int_builder->AppendValues(differing.data(), differing.size()));
  ASSERT_OK(builder_->Finish(&other));

  // Whole-array comparisons.
  AssertArraysEqual(*reference, *reference);
  AssertArraysEqual(*reference, *twin);
  EXPECT_FALSE(twin->Equals(other));
  EXPECT_FALSE(other->Equals(twin));

  // Range comparisons: only the ranges holding the same lists compare equal.
  EXPECT_TRUE(reference->RangeEquals(0, 1, 0, other));
  EXPECT_FALSE(reference->RangeEquals(0, 2, 0, other));
  EXPECT_FALSE(reference->RangeEquals(1, 2, 1, other));
  EXPECT_TRUE(reference->RangeEquals(1, 3, 2, other));
}
407+
408+
// Appending two null lists one at a time must yield two null slots whose
// offsets are still list_size() apart, with 2 * list_size() child values.
TEST_F(TestFixedSizeListArray, TestAppendNull) {
  for (int appended = 0; appended < 2; ++appended) {
    ASSERT_OK(builder_->AppendNull());
  }

  Done();

  ASSERT_OK(ValidateArray(*result_));
  for (int64_t slot = 0; slot < 2; ++slot) {
    ASSERT_TRUE(result_->IsNull(slot));
  }

  for (int32_t slot = 0; slot < 2; ++slot) {
    ASSERT_EQ(list_size() * slot, result_->value_offset(slot));
  }

  ASSERT_EQ(list_size() * 2, result_->values()->length());
}
424+
425+
// Appending three null lists in one call must yield three null slots with
// regularly spaced offsets and 3 * list_size() child values.
TEST_F(TestFixedSizeListArray, TestAppendNulls) {
  ASSERT_OK(builder_->AppendNulls(3));

  Done();

  ASSERT_OK(ValidateArray(*result_));
  ASSERT_EQ(result_->length(), 3);
  ASSERT_EQ(result_->null_count(), 3);
  for (int64_t slot = 0; slot < 3; ++slot) {
    ASSERT_TRUE(result_->IsNull(slot));
  }

  for (int32_t slot = 0; slot < 3; ++slot) {
    ASSERT_EQ(list_size() * slot, result_->value_offset(slot));
  }

  ASSERT_EQ(list_size() * 3, result_->values()->length());
}
444+
445+
void ValidateBasicFixedSizeListArray(const FixedSizeListArray* result,
446+
const std::vector<int32_t>& values,
447+
const std::vector<uint8_t>& is_valid) {
448+
ASSERT_OK(ValidateArray(*result));
449+
ASSERT_EQ(1, result->null_count());
450+
ASSERT_LE(result->values()->null_count(), 2);
451+
452+
ASSERT_EQ(3, result->length());
453+
for (int32_t i = 0; i < 3; ++i) {
454+
ASSERT_EQ(i * result->value_length(), result->value_offset(i));
455+
}
456+
457+
for (int i = 0; i < result->length(); ++i) {
458+
ASSERT_EQ(is_valid[i] == 0, result->IsNull(i));
459+
}
460+
461+
ASSERT_EQ(result->length() * result->value_length(), result->values()->length());
462+
auto varr = std::dynamic_pointer_cast<Int32Array>(result->values());
463+
464+
for (size_t i = 0; i < values.size(); ++i) {
465+
if (is_valid[i / result->value_length()] == 0) {
466+
continue;
467+
}
468+
ASSERT_EQ(values[i], varr->Value(i));
469+
}
470+
}
471+
472+
// Builds three lists one at a time (the middle one null) and validates the
// finished array against the expected values and validity flags.
TEST_F(TestFixedSizeListArray, TestBasics) {
  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
  std::vector<uint8_t> is_valid = {1, 0, 1};

  auto* int_builder = checked_cast<Int32Builder*>(builder_->value_builder());

  const size_t num_lists = values.size() / list_size();
  for (size_t list_index = 0; list_index < num_lists; ++list_index) {
    if (is_valid[list_index] != 0) {
      ASSERT_OK(builder_->Append());
      // Each list owns the list_size() consecutive entries starting here.
      const size_t first = list_index * list_size();
      for (int32_t j = 0; j < list_size(); ++j) {
        ASSERT_OK(int_builder->Append(values[first + j]));
      }
    } else {
      // No child values are appended by the test for a null list.
      ASSERT_OK(builder_->AppendNull());
    }
  }

  Done();
  ValidateBasicFixedSizeListArray(result_.get(), values, is_valid);
}
494+
495+
// Declares all three lists in a single AppendValues call, then fills in the
// child values one by one and validates the finished array.
TEST_F(TestFixedSizeListArray, BulkAppend) {
  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
  std::vector<uint8_t> is_valid = {1, 0, 1};

  auto* int_builder = checked_cast<Int32Builder*>(builder_->value_builder());

  const auto num_lists = values.size() / list_size();
  ASSERT_OK(builder_->AppendValues(num_lists, is_valid.data()));
  for (size_t i = 0; i < values.size(); ++i) {
    ASSERT_OK(int_builder->Append(values[i]));
  }

  Done();
  ValidateBasicFixedSizeListArray(result_.get(), values, is_valid);
}
508+
509+
// Declares three lists but appends the child values twice over, so the
// finished array must be rejected by ValidateArray.
TEST_F(TestFixedSizeListArray, BulkAppendInvalid) {
  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5};
  std::vector<uint8_t> is_valid = {1, 0, 1};

  auto* int_builder = checked_cast<Int32Builder*>(builder_->value_builder());

  ASSERT_OK(builder_->AppendValues(values.size() / list_size(), is_valid.data()));

  // Two full passes over the values: twice as many as the lists can hold.
  for (int pass = 0; pass < 2; ++pass) {
    for (size_t i = 0; i < values.size(); ++i) {
      ASSERT_OK(int_builder->Append(values[i]));
    }
  }

  Done();
  ASSERT_RAISES(Invalid, ValidateArray(*result_));
}
526+
527+
// Finishing a builder to which nothing was appended must still produce an
// array that validates.
TEST_F(TestFixedSizeListArray, TestZeroLength) {
  // All buffers are null
  Done();

  const FixedSizeListArray& empty_array = *result_;
  ASSERT_OK(ValidateArray(empty_array));
}
532+
533+
// A builder created from a list type whose value field is named "counts"
// must produce an array whose type keeps that field name.
TEST_F(TestFixedSizeListArray, TestBuilderPreserveFieleName) {
  const auto named_field = field("counts", int32());
  const auto named_list_type = fixed_size_list(named_field, list_size());

  std::unique_ptr<ArrayBuilder> generic_builder;
  ASSERT_OK(MakeBuilder(pool_, named_list_type, &generic_builder));
  auto* concrete_builder =
      checked_cast<FixedSizeListBuilder*>(generic_builder.release());
  builder_.reset(concrete_builder);

  ASSERT_OK(builder_->AppendValues(4));

  std::shared_ptr<Array> finished;
  ASSERT_OK(builder_->Finish(&finished));

  const std::shared_ptr<DataType> finished_type = finished->type();
  const auto& list_type = checked_cast<FixedSizeListType&>(*finished_type);
  ASSERT_EQ("counts", list_type.value_field()->name());
}
548+
335549
} // namespace arrow

cpp/src/arrow/array.cc

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,44 @@ std::shared_ptr<DataType> ListArray::value_type() const {
295295

296296
// Returns the values_ member.
std::shared_ptr<Array> ListArray::values() const { return values_; }
297297

298+
// ----------------------------------------------------------------------
299+
// FixedSizeListArray
300+
301+
// Constructs the array from existing ArrayData; all member setup is
// delegated to SetData().
FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<ArrayData>& data) {
  this->SetData(data);
}
304+
305+
// Constructs the array from its parts: the null bitmap becomes the only
// buffer and the data of `values` becomes the only child.
FixedSizeListArray::FixedSizeListArray(const std::shared_ptr<DataType>& type,
                                       int64_t length,
                                       const std::shared_ptr<Array>& values,
                                       const std::shared_ptr<Buffer>& null_bitmap,
                                       int64_t null_count, int64_t offset) {
  const std::shared_ptr<ArrayData> list_data =
      ArrayData::Make(type, length, {null_bitmap}, null_count, offset);
  list_data->child_data.push_back(values->data());
  SetData(list_data);
}
314+
315+
// Installs `data` as this array's contents and refreshes the cached
// list_size_ and values_ members from it.
//
// In debug builds this checks that `data` has the FIXED_SIZE_LIST type id,
// has exactly one child, and that the child's type equals the list type's
// value type.
void FixedSizeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
  DCHECK_EQ(data->type->id(), Type::FIXED_SIZE_LIST);
  this->Array::SetData(data);

  // Verify the child count before child_data[0] is read, so that malformed
  // data trips the check rather than an out-of-range vector access.
  DCHECK_EQ(data_->child_data.size(), 1);
  DCHECK(list_type()->value_type()->Equals(data->child_data[0]->type));
  list_size_ = list_type()->list_size();

  values_ = MakeArray(data_->child_data[0]);
}
325+
326+
// Returns this array's data type, downcast to FixedSizeListType.
const FixedSizeListType* FixedSizeListArray::list_type() const {
  const auto* untyped = data_->type.get();
  return checked_cast<const FixedSizeListType*>(untyped);
}
329+
330+
std::shared_ptr<DataType> FixedSizeListArray::value_type() const {
331+
return list_type()->value_type();
332+
}
333+
334+
// Returns the child values array cached by SetData().
std::shared_ptr<Array> FixedSizeListArray::values() const {
  return values_;
}
335+
298336
// ----------------------------------------------------------------------
299337
// String and binary
300338

@@ -839,6 +877,22 @@ struct ValidateVisitor {
839877
return ValidateOffsets(array);
840878
}
841879

880+
// Validates a FixedSizeListArray: the length must be non-negative, the
// values array must be present, and the values length must equal the array
// length multiplied by the list size.
Status Visit(const FixedSizeListArray& array) {
  const int64_t num_lists = array.length();
  if (num_lists < 0) {
    return Status::Invalid("Length was negative");
  }

  const std::shared_ptr<Array> child = array.values();
  if (!child) {
    return Status::Invalid("values was null");
  }

  const int64_t expected_values = num_lists * array.value_length();
  if (child->length() == expected_values) {
    return Status::OK();
  }

  return Status::Invalid(
      "Values Length (", child->length(), ") was not equal to the length (", num_lists,
      ") multiplied by the list size (", array.value_length(), ")");
}
895+
842896
Status Visit(const StructArray& array) {
843897
if (array.length() < 0) {
844898
return Status::Invalid("Length was negative");

cpp/src/arrow/array.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,43 @@ class ARROW_EXPORT ListArray : public Array {
522522
std::shared_ptr<Array> values_;
523523
};
524524

525+
// ----------------------------------------------------------------------
526+
// FixedSizeListArray
527+
528+
/// Concrete Array class for fixed size list data
529+
class ARROW_EXPORT FixedSizeListArray : public Array {
530+
public:
531+
using TypeClass = FixedSizeListType;
532+
533+
explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
534+
535+
FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
536+
const std::shared_ptr<Array>& values,
537+
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
538+
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
539+
540+
const FixedSizeListType* list_type() const;
541+
542+
/// \brief Return array object containing the list's values
543+
std::shared_ptr<Array> values() const;
544+
545+
std::shared_ptr<DataType> value_type() const;
546+
547+
// Neither of these functions will perform boundschecking
548+
int32_t value_offset(int64_t i) const {
549+
i += data_->offset;
550+
return static_cast<int32_t>(list_size_ * i);
551+
}
552+
int32_t value_length(int64_t i = 0) const { return list_size_; }
553+
554+
protected:
555+
void SetData(const std::shared_ptr<ArrayData>& data);
556+
int32_t list_size_;
557+
558+
private:
559+
std::shared_ptr<Array> values_;
560+
};
561+
525562
// ----------------------------------------------------------------------
526563
// Binary and String
527564

0 commit comments

Comments
 (0)