Skip to content

Commit 43caa2a

Browse files
committed
ARROW-12065: [C++][Python] Fix segfault reading JSON file
A segfault would occur when a field is inferred as null in a first block and then as list in a second block. Also re-enable `chunked_builder_test.cc`, which wasn't compiled. Closes apache#9783 from pitrou/ARROW-12065-json-segfault Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 61d7cd3 commit 43caa2a

10 files changed

Lines changed: 153 additions & 91 deletions

File tree

cpp/src/arrow/json/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
add_arrow_test(test
1919
SOURCES
20+
chunked_builder_test.cc
2021
chunker_test.cc
2122
converter_test.cc
2223
parser_test.cc

cpp/src/arrow/json/chunked_builder.cc

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,13 @@
2727
#include "arrow/buffer.h"
2828
#include "arrow/json/converter.h"
2929
#include "arrow/table.h"
30+
#include "arrow/util/checked_cast.h"
3031
#include "arrow/util/logging.h"
3132
#include "arrow/util/task_group.h"
3233

3334
namespace arrow {
3435

36+
using internal::checked_cast;
3537
using internal::TaskGroup;
3638

3739
namespace json {
@@ -199,15 +201,6 @@ class ChunkedListArrayBuilder : public ChunkedArrayBuilder {
199201
const std::shared_ptr<Array>& unconverted) override {
200202
std::unique_lock<std::mutex> lock(mutex_);
201203

202-
auto list_array = static_cast<const ListArray*>(unconverted.get());
203-
204-
if (null_bitmap_chunks_.size() <= static_cast<size_t>(block_index)) {
205-
null_bitmap_chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr);
206-
offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr);
207-
}
208-
null_bitmap_chunks_[block_index] = unconverted->null_bitmap();
209-
offset_chunks_[block_index] = list_array->value_offsets();
210-
211204
if (unconverted->type_id() == Type::NA) {
212205
auto st = InsertNull(block_index, unconverted->length());
213206
if (!st.ok()) {
@@ -217,8 +210,17 @@ class ChunkedListArrayBuilder : public ChunkedArrayBuilder {
217210
}
218211

219212
DCHECK_EQ(unconverted->type_id(), Type::LIST);
220-
value_builder_->Insert(block_index, list_array->list_type()->value_field(),
221-
list_array->values());
213+
const auto& list_array = checked_cast<const ListArray&>(*unconverted);
214+
215+
if (null_bitmap_chunks_.size() <= static_cast<size_t>(block_index)) {
216+
null_bitmap_chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr);
217+
offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr);
218+
}
219+
null_bitmap_chunks_[block_index] = unconverted->null_bitmap();
220+
offset_chunks_[block_index] = list_array.value_offsets();
221+
222+
value_builder_->Insert(block_index, list_array.list_type()->value_field(),
223+
list_array.values());
222224
}
223225

224226
Status Finish(std::shared_ptr<ChunkedArray>* out) override {
@@ -305,17 +307,17 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder {
305307
return;
306308
}
307309

308-
auto struct_array = std::static_pointer_cast<StructArray>(unconverted);
310+
const auto& struct_array = checked_cast<const StructArray&>(*unconverted);
309311
if (promotion_graph_ == nullptr) {
310312
// If unexpected fields are ignored or result in an error then all parsers will emit
311313
// columns exclusively in the ordering specified in ParseOptions::explicit_schema,
312314
// so child_builders_ is immutable and no associative lookup is necessary.
313315
for (int i = 0; i < unconverted->num_fields(); ++i) {
314316
child_builders_[i]->Insert(block_index, unconverted->type()->field(i),
315-
struct_array->field(i));
317+
struct_array.field(i));
316318
}
317319
} else {
318-
auto st = InsertChildren(block_index, struct_array.get());
320+
auto st = InsertChildren(block_index, struct_array);
319321
if (!st.ok()) {
320322
return task_group_->Append([st] { return st; });
321323
}
@@ -383,10 +385,10 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder {
383385
// Insert children associatively by name; the unconverted block may have unexpected or
384386
// differently ordered fields
385387
// call from Insert() only, with mutex_ locked
386-
Status InsertChildren(int64_t block_index, const StructArray* unconverted) {
387-
const auto& fields = unconverted->type()->fields();
388+
Status InsertChildren(int64_t block_index, const StructArray& unconverted) {
389+
const auto& fields = unconverted.type()->fields();
388390

389-
for (int i = 0; i < unconverted->num_fields(); ++i) {
391+
for (int i = 0; i < unconverted.num_fields(); ++i) {
390392
auto it = name_to_index_.find(fields[i]->name());
391393

392394
if (it == name_to_index_.end()) {
@@ -405,9 +407,9 @@ class ChunkedStructArrayBuilder : public ChunkedArrayBuilder {
405407
child_builders_.emplace_back(std::move(child_builder));
406408
}
407409

408-
auto unconverted_field = unconverted->type()->field(i);
410+
auto unconverted_field = unconverted.type()->field(i);
409411
child_builders_[it->second]->Insert(block_index, unconverted_field,
410-
unconverted->field(i));
412+
unconverted.field(i));
411413

412414
child_absent_[block_index].resize(child_builders_.size(), true);
413415
child_absent_[block_index][it->second] = false;
@@ -444,12 +446,12 @@ Status MakeChunkedArrayBuilder(const std::shared_ptr<TaskGroup>& task_group,
444446
return Status::OK();
445447
}
446448
if (type->id() == Type::LIST) {
447-
auto list_type = static_cast<const ListType*>(type.get());
449+
const auto& list_type = checked_cast<const ListType&>(*type);
448450
std::shared_ptr<ChunkedArrayBuilder> value_builder;
449451
RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph,
450-
list_type->value_type(), &value_builder));
452+
list_type.value_type(), &value_builder));
451453
*out = std::make_shared<ChunkedListArrayBuilder>(
452-
task_group, pool, std::move(value_builder), list_type->value_field());
454+
task_group, pool, std::move(value_builder), list_type.value_field());
453455
return Status::OK();
454456
}
455457
std::shared_ptr<Converter> converter;

cpp/src/arrow/json/chunked_builder_test.cc

Lines changed: 56 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ using internal::checked_cast;
4040
using internal::GetCpuThreadPool;
4141
using internal::TaskGroup;
4242

43-
void AssertBuilding(const std::unique_ptr<ChunkedArrayBuilder>& builder,
43+
void AssertBuilding(const std::shared_ptr<ChunkedArrayBuilder>& builder,
4444
const std::vector<std::string>& chunks,
4545
std::shared_ptr<ChunkedArray>* out) {
4646
ArrayVector unconverted;
@@ -67,9 +67,8 @@ std::shared_ptr<ChunkedArray> ExtractField(const std::string& name,
6767
for (auto& chunk : chunks) {
6868
chunk = checked_cast<const StructArray&>(*chunk).GetFieldByName(name);
6969
}
70-
auto struct_type = static_cast<const StructType*>(columns.type().get());
71-
return std::make_shared<ChunkedArray>(chunks,
72-
struct_type->GetFieldByName(name)->type());
70+
const auto& struct_type = checked_cast<const StructType&>(*columns.type());
71+
return std::make_shared<ChunkedArray>(chunks, struct_type.GetFieldByName(name)->type());
7372
}
7473

7574
void AssertFieldEqual(const std::vector<std::string>& path,
@@ -83,27 +82,9 @@ void AssertFieldEqual(const std::vector<std::string>& path,
8382
AssertChunkedEqual(expected, *actual);
8483
}
8584

86-
template <typename T>
87-
std::string RowsOfOneColumn(string_view name, std::initializer_list<T> values,
88-
decltype(std::to_string(*values.begin()))* = nullptr) {
89-
std::stringstream ss;
90-
for (auto value : values) {
91-
ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n";
92-
}
93-
return ss.str();
94-
}
95-
96-
std::string RowsOfOneColumn(string_view name, std::initializer_list<std::string> values) {
97-
std::stringstream ss;
98-
for (auto value : values) {
99-
ss << R"({")" << name << R"(":)" << value << "}\n";
100-
}
101-
return ss.str();
102-
}
103-
10485
TEST(ChunkedArrayBuilder, Empty) {
10586
auto tg = TaskGroup::MakeSerial();
106-
std::unique_ptr<ChunkedArrayBuilder> builder;
87+
std::shared_ptr<ChunkedArrayBuilder> builder;
10788
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr,
10889
struct_({field("a", int32())}), &builder));
10990

@@ -116,7 +97,7 @@ TEST(ChunkedArrayBuilder, Empty) {
11697

11798
TEST(ChunkedArrayBuilder, Basics) {
11899
auto tg = TaskGroup::MakeSerial();
119-
std::unique_ptr<ChunkedArrayBuilder> builder;
100+
std::shared_ptr<ChunkedArrayBuilder> builder;
120101
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr,
121102
struct_({field("a", int32())}), &builder));
122103

@@ -130,7 +111,7 @@ TEST(ChunkedArrayBuilder, Basics) {
130111

131112
TEST(ChunkedArrayBuilder, Insert) {
132113
auto tg = TaskGroup::MakeSerial();
133-
std::unique_ptr<ChunkedArrayBuilder> builder;
114+
std::shared_ptr<ChunkedArrayBuilder> builder;
134115
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr,
135116
struct_({field("a", int32())}), &builder));
136117

@@ -151,7 +132,7 @@ TEST(ChunkedArrayBuilder, Insert) {
151132

152133
TEST(ChunkedArrayBuilder, MultipleChunks) {
153134
auto tg = TaskGroup::MakeSerial();
154-
std::unique_ptr<ChunkedArrayBuilder> builder;
135+
std::shared_ptr<ChunkedArrayBuilder> builder;
155136
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr,
156137
struct_({field("a", int32())}), &builder));
157138

@@ -170,7 +151,7 @@ TEST(ChunkedArrayBuilder, MultipleChunks) {
170151

171152
TEST(ChunkedArrayBuilder, MultipleChunksParallel) {
172153
auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
173-
std::unique_ptr<ChunkedArrayBuilder> builder;
154+
std::shared_ptr<ChunkedArrayBuilder> builder;
174155
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), nullptr,
175156
struct_({field("a", int32())}), &builder));
176157

@@ -194,7 +175,7 @@ TEST(ChunkedArrayBuilder, MultipleChunksParallel) {
194175

195176
TEST(InferringChunkedArrayBuilder, Empty) {
196177
auto tg = TaskGroup::MakeSerial();
197-
std::unique_ptr<ChunkedArrayBuilder> builder;
178+
std::shared_ptr<ChunkedArrayBuilder> builder;
198179
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
199180
struct_({}), &builder));
200181

@@ -207,7 +188,7 @@ TEST(InferringChunkedArrayBuilder, Empty) {
207188

208189
TEST(InferringChunkedArrayBuilder, SingleChunkNull) {
209190
auto tg = TaskGroup::MakeSerial();
210-
std::unique_ptr<ChunkedArrayBuilder> builder;
191+
std::shared_ptr<ChunkedArrayBuilder> builder;
211192
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
212193
struct_({}), &builder));
213194

@@ -224,7 +205,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkNull) {
224205

225206
TEST(InferringChunkedArrayBuilder, MultipleChunkNull) {
226207
auto tg = TaskGroup::MakeSerial();
227-
std::unique_ptr<ChunkedArrayBuilder> builder;
208+
std::shared_ptr<ChunkedArrayBuilder> builder;
228209
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
229210
struct_({}), &builder));
230211

@@ -244,7 +225,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkNull) {
244225

245226
TEST(InferringChunkedArrayBuilder, SingleChunkInteger) {
246227
auto tg = TaskGroup::MakeSerial();
247-
std::unique_ptr<ChunkedArrayBuilder> builder;
228+
std::shared_ptr<ChunkedArrayBuilder> builder;
248229
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
249230
struct_({}), &builder));
250231

@@ -264,7 +245,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkInteger) {
264245

265246
TEST(InferringChunkedArrayBuilder, MultipleChunkInteger) {
266247
auto tg = TaskGroup::MakeSerial();
267-
std::unique_ptr<ChunkedArrayBuilder> builder;
248+
std::shared_ptr<ChunkedArrayBuilder> builder;
268249
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
269250
struct_({}), &builder));
270251

@@ -285,7 +266,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkInteger) {
285266

286267
TEST(InferringChunkedArrayBuilder, SingleChunkDouble) {
287268
auto tg = TaskGroup::MakeSerial();
288-
std::unique_ptr<ChunkedArrayBuilder> builder;
269+
std::shared_ptr<ChunkedArrayBuilder> builder;
289270
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
290271
struct_({}), &builder));
291272

@@ -305,7 +286,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkDouble) {
305286

306287
TEST(InferringChunkedArrayBuilder, MultipleChunkDouble) {
307288
auto tg = TaskGroup::MakeSerial();
308-
std::unique_ptr<ChunkedArrayBuilder> builder;
289+
std::shared_ptr<ChunkedArrayBuilder> builder;
309290
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
310291
struct_({}), &builder));
311292

@@ -327,7 +308,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkDouble) {
327308

328309
TEST(InferringChunkedArrayBuilder, SingleChunkTimestamp) {
329310
auto tg = TaskGroup::MakeSerial();
330-
std::unique_ptr<ChunkedArrayBuilder> builder;
311+
std::shared_ptr<ChunkedArrayBuilder> builder;
331312
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
332313
struct_({}), &builder));
333314

@@ -348,7 +329,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkTimestamp) {
348329

349330
TEST(InferringChunkedArrayBuilder, MultipleChunkTimestamp) {
350331
auto tg = TaskGroup::MakeSerial();
351-
std::unique_ptr<ChunkedArrayBuilder> builder;
332+
std::shared_ptr<ChunkedArrayBuilder> builder;
352333
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
353334
struct_({}), &builder));
354335

@@ -371,7 +352,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkTimestamp) {
371352

372353
TEST(InferringChunkedArrayBuilder, SingleChunkString) {
373354
auto tg = TaskGroup::MakeSerial();
374-
std::unique_ptr<ChunkedArrayBuilder> builder;
355+
std::shared_ptr<ChunkedArrayBuilder> builder;
375356
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
376357
struct_({}), &builder));
377358

@@ -392,7 +373,7 @@ TEST(InferringChunkedArrayBuilder, SingleChunkString) {
392373

393374
TEST(InferringChunkedArrayBuilder, MultipleChunkString) {
394375
auto tg = TaskGroup::MakeSerial();
395-
std::unique_ptr<ChunkedArrayBuilder> builder;
376+
std::shared_ptr<ChunkedArrayBuilder> builder;
396377
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
397378
struct_({}), &builder));
398379

@@ -415,7 +396,7 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkString) {
415396

416397
TEST(InferringChunkedArrayBuilder, MultipleChunkIntegerParallel) {
417398
auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
418-
std::unique_ptr<ChunkedArrayBuilder> builder;
399+
std::shared_ptr<ChunkedArrayBuilder> builder;
419400
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
420401
struct_({}), &builder));
421402

@@ -433,5 +414,41 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkIntegerParallel) {
433414
AssertFieldEqual({"a"}, actual, *expected);
434415
}
435416

417+
TEST(InferringChunkedArrayBuilder, SingleChunkList) {
418+
auto tg = TaskGroup::MakeSerial();
419+
std::shared_ptr<ChunkedArrayBuilder> builder;
420+
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
421+
struct_({}), &builder));
422+
423+
std::shared_ptr<ChunkedArray> actual;
424+
AssertBuilding(builder,
425+
{
426+
std::string("{}\n") + "{\"a\": []}\n" + "{\"a\": [1, 2]}\n",
427+
},
428+
&actual);
429+
430+
auto expected = ChunkedArrayFromJSON(list(int64()), {"[null, [], [1, 2]]"});
431+
AssertFieldEqual({"a"}, actual, *expected);
432+
}
433+
434+
TEST(InferringChunkedArrayBuilder, MultipleChunkList) {
435+
auto tg = TaskGroup::MakeSerial();
436+
std::shared_ptr<ChunkedArrayBuilder> builder;
437+
ASSERT_OK(MakeChunkedArrayBuilder(tg, default_memory_pool(), GetPromotionGraph(),
438+
struct_({}), &builder));
439+
440+
std::shared_ptr<ChunkedArray> actual;
441+
AssertBuilding(builder,
442+
{
443+
"{}\n",
444+
"{\"a\": []}\n",
445+
"{\"a\": [1, 2]}\n",
446+
},
447+
&actual);
448+
449+
auto expected = ChunkedArrayFromJSON(list(int64()), {"[null]", "[[]]", "[[1, 2]]"});
450+
AssertFieldEqual({"a"}, actual, *expected);
451+
}
452+
436453
} // namespace json
437454
} // namespace arrow

cpp/src/arrow/json/converter.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,17 @@ namespace {
4848

4949
const DictionaryArray& GetDictionaryArray(const std::shared_ptr<Array>& in) {
5050
DCHECK_EQ(in->type_id(), Type::DICTIONARY);
51-
auto dict_type = static_cast<const DictionaryType*>(in->type().get());
51+
auto dict_type = checked_cast<const DictionaryType*>(in->type().get());
5252
DCHECK_EQ(dict_type->index_type()->id(), Type::INT32);
5353
DCHECK_EQ(dict_type->value_type()->id(), Type::STRING);
54-
return static_cast<const DictionaryArray&>(*in);
54+
return checked_cast<const DictionaryArray&>(*in);
5555
}
5656

5757
template <typename ValidVisitor, typename NullVisitor>
5858
Status VisitDictionaryEntries(const DictionaryArray& dict_array,
5959
ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
60-
const StringArray& dict = static_cast<const StringArray&>(*dict_array.dictionary());
61-
const Int32Array& indices = static_cast<const Int32Array&>(*dict_array.indices());
60+
const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
61+
const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
6262
for (int64_t i = 0; i < indices.length(); ++i) {
6363
if (indices.IsValid(i)) {
6464
RETURN_NOT_OK(visit_valid(dict.GetView(indices.GetView(i))));
@@ -281,8 +281,8 @@ const PromotionGraph* GetPromotionGraph() {
281281
return timestamp(TimeUnit::SECOND);
282282

283283
case Kind::kArray: {
284-
auto type = static_cast<const ListType*>(unexpected_field->type().get());
285-
auto value_field = type->value_field();
284+
const auto& type = checked_cast<const ListType&>(*unexpected_field->type());
285+
auto value_field = type.value_field();
286286
return list(value_field->WithType(Infer(value_field)));
287287
}
288288
case Kind::kObject: {

0 commit comments

Comments
 (0)