Skip to content

Commit e1883ae

Browse files
benibus and pitrou authored
ARROW-18184: [C++] Improve JSON parser benchmarks (apache#14552)
See [ARROW-18184](https://issues.apache.org/jira/browse/ARROW-18184). For context, this is based on [pull/14100](apache#14100). This change enhances the capabilities of the existing random JSON generators and adds benchmark coverage for ordered/unordered fields, missing fields, and inferred schemas over a range of field counts.

Lead-authored-by: benibus <bpharks@gmx.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Co-authored-by: Ben Harkins <60872452+benibus@users.noreply.github.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent b2fd881 commit e1883ae

2 files changed

Lines changed: 158 additions & 45 deletions

File tree

cpp/src/arrow/json/parser_benchmark.cc

Lines changed: 98 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
#include "benchmark/benchmark.h"
1919

20-
#include <string>
20+
#include <unordered_set>
2121

2222
#include "arrow/json/chunker.h"
2323
#include "arrow/json/options.h"
@@ -30,29 +30,62 @@
3030
namespace arrow {
3131
namespace json {
3232

33-
std::shared_ptr<Schema> TestSchema() {
34-
return schema({field("int", int32()), field("str", utf8())});
35-
}
36-
37-
constexpr int seed = 0x432432;
33+
constexpr int kSeed = 0x432432;
3834

39-
std::string TestJsonData(int num_rows, bool pretty = false) {
40-
std::default_random_engine engine(seed);
35+
// Builds a newline-delimited JSON document containing `num_rows` rows.
//
// Each row is written by Generate(input, engine, &writer, options) into its own
// StringBuffer. When `pretty` is true the row text is passed through
// PrettyPrint() before being appended; a "\n" follows every row either way.
// The random engine is constructed from kSeed on every call.
template <typename Input>
36+
std::string GenerateTestData(const Input& input, int num_rows,
37+
const GenerateOptions& options, bool pretty = false) {
38+
std::default_random_engine engine(kSeed);
4139
std::string json;
4240
for (int i = 0; i < num_rows; ++i) {
4341
// A fresh buffer/writer pair per row, so each row is serialized on its own.
StringBuffer sb;
4442
Writer writer(sb);
45-
ABORT_NOT_OK(Generate(TestSchema(), engine, &writer));
43+
ABORT_NOT_OK(Generate(input, engine, &writer, options));
4644
json += pretty ? PrettyPrint(sb.GetString()) : sb.GetString();
4745
json += "\n";
4846
}
49-
5047
return json;
5148
}
5249

53-
static void BenchmarkJSONChunking(benchmark::State& state,
50+
// Convenience overload: same as the four-argument GenerateTestData, called
// with GenerateOptions::Defaults() as the generation options.
template <typename Input>
51+
std::string GenerateTestData(const Input& input, int num_rows, bool pretty = false) {
52+
return GenerateTestData(input, num_rows, GenerateOptions::Defaults(), pretty);
53+
}
54+
55+
// Generates `num_fields` fields with distinct random names.
//
// Name lengths are drawn from a Poisson distribution with mean
// `mean_name_length` (zero-length draws are rejected), and each character is
// drawn uniformly from the code range 32..126. Each field's type is picked
// uniformly from {boolean, int64, float64, utf8}. The engine is seeded with kSeed.
FieldVector GenerateTestFields(int num_fields, int mean_name_length) {
56+
const std::shared_ptr<DataType> types[] = {boolean(), int64(), float64(), utf8()};
57+
58+
std::default_random_engine engine(kSeed);
59+
60+
std::poisson_distribution<int> length_dist(mean_name_length);
61+
std::uniform_int_distribution<uint16_t> char_dist(32, 126);
62+
std::uniform_int_distribution<size_t> type_dist(0, std::size(types) - 1);
63+
64+
// A set is used so that duplicate names are dropped on insertion.
std::unordered_set<std::string> names;
65+
names.reserve(num_fields);
66+
67+
// Keep drawing until the requested number of distinct names has been collected.
while (static_cast<int>(names.size()) < num_fields) {
68+
auto length = length_dist(engine);
69+
// Skip empty names.
if (!length) continue;
70+
std::string name(length, '\0');
71+
for (auto& ch : name) ch = static_cast<char>(char_dist(engine));
72+
names.emplace(std::move(name));
73+
}
74+
75+
FieldVector fields;
76+
fields.reserve(num_fields);
77+
// Fields are emitted in the set's iteration order.
// NOTE(review): unordered_set iteration order is not specified by the standard,
// so the resulting field order may differ between standard libraries - confirm
// that this is acceptable for comparing benchmark results across platforms.
for (const auto& name : names) {
78+
fields.push_back(field(name, types[type_dist(engine)]));
79+
}
80+
81+
return fields;
82+
}
83+
84+
// Fixed two-field layout: an int32 field named "int" and a utf8 field named "str".
FieldVector TestFields() { return {field("int", int32()), field("str", utf8())}; }
85+
86+
static void BenchmarkJSONChunking(benchmark::State& state, // NOLINT non-const reference
5487
const std::shared_ptr<Buffer>& json,
55-
ParseOptions options) { // NOLINT non-const reference
88+
ParseOptions options) {
5689
auto chunker = MakeChunker(options);
5790

5891
for (auto _ : state) {
@@ -61,6 +94,7 @@ static void BenchmarkJSONChunking(benchmark::State& state,
6194
}
6295

6396
state.SetBytesProcessed(state.iterations() * json->size());
97+
state.counters["json_size"] = static_cast<double>(json->size());
6498
}
6599

66100
static void ChunkJSONPrettyPrinted(
@@ -69,9 +103,9 @@ static void ChunkJSONPrettyPrinted(
69103

70104
auto options = ParseOptions::Defaults();
71105
options.newlines_in_values = true;
72-
options.explicit_schema = TestSchema();
106+
options.explicit_schema = schema(TestFields());
73107

74-
auto json = TestJsonData(num_rows, /* pretty */ true);
108+
auto json = GenerateTestData(options.explicit_schema, num_rows, /*pretty=*/true);
75109
BenchmarkJSONChunking(state, std::make_shared<Buffer>(json), options);
76110
}
77111

@@ -81,15 +115,15 @@ static void ChunkJSONLineDelimited(
81115

82116
auto options = ParseOptions::Defaults();
83117
options.newlines_in_values = false;
84-
options.explicit_schema = TestSchema();
118+
options.explicit_schema = schema(TestFields());
85119

86-
auto json = TestJsonData(num_rows);
120+
auto json = GenerateTestData(options.explicit_schema, num_rows);
87121
BenchmarkJSONChunking(state, std::make_shared<Buffer>(json), options);
88122
state.SetBytesProcessed(0);
89123
}
90124

91125
static void BenchmarkJSONParsing(benchmark::State& state, // NOLINT non-const reference
92-
const std::shared_ptr<Buffer>& json, int32_t num_rows,
126+
const std::shared_ptr<Buffer>& json,
93127
ParseOptions options) {
94128
for (auto _ : state) {
95129
std::unique_ptr<BlockParser> parser;
@@ -100,22 +134,23 @@ static void BenchmarkJSONParsing(benchmark::State& state, // NOLINT non-const r
100134
ABORT_NOT_OK(parser->Finish(&parsed));
101135
}
102136
state.SetBytesProcessed(state.iterations() * json->size());
137+
state.counters["json_size"] = static_cast<double>(json->size());
103138
}
104139

105140
// Benchmark: parses 5000 generated rows with an explicit schema built from
// TestFields(); unexpected fields are configured to be an error.
static void ParseJSONBlockWithSchema(
106141
benchmark::State& state) { // NOLINT non-const reference
107142
const int32_t num_rows = 5000;
108143
auto options = ParseOptions::Defaults();
109144
options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
110-
options.explicit_schema = TestSchema();
145+
options.explicit_schema = schema(TestFields());
111146

112-
auto json = TestJsonData(num_rows);
113-
BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), num_rows, options);
147+
// The same schema object drives both data generation and parsing.
auto json = GenerateTestData(options.explicit_schema, num_rows);
148+
BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), options);
114149
}
115150

116151
static void BenchmarkJSONReading(benchmark::State& state, // NOLINT non-const reference
117-
const std::string& json, int32_t num_rows,
118-
ReadOptions read_options, ParseOptions parse_options) {
152+
const std::string& json, ReadOptions read_options,
153+
ParseOptions parse_options) {
119154
for (auto _ : state) {
120155
std::shared_ptr<io::InputStream> input;
121156
ABORT_NOT_OK(MakeStream(json, &input));
@@ -127,20 +162,22 @@ static void BenchmarkJSONReading(benchmark::State& state, // NOLINT non-const r
127162
}
128163

129164
state.SetBytesProcessed(state.iterations() * json.size());
165+
state.counters["json_size"] = static_cast<double>(json.size());
130166
}
131167

132168
// Shared driver for the single-/multi-threaded read benchmarks.
//
// Generates 500000 rows matching the TestFields() schema and hands them to
// BenchmarkJSONReading. `use_threads` is copied into ReadOptions::use_threads;
// unexpected fields are configured to be an error.
static void BenchmarkReadJSONBlockWithSchema(
133-
benchmark::State& state, bool use_threads) { // NOLINT non-const reference
169+
benchmark::State& state, // NOLINT non-const reference
170+
bool use_threads) {
134171
const int32_t num_rows = 500000;
135172
auto read_options = ReadOptions::Defaults();
136173
read_options.use_threads = use_threads;
137174

138175
auto parse_options = ParseOptions::Defaults();
139176
parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
140-
parse_options.explicit_schema = TestSchema();
177+
parse_options.explicit_schema = schema(TestFields());
141178

142-
auto json = TestJsonData(num_rows);
143-
BenchmarkJSONReading(state, json, num_rows, read_options, parse_options);
179+
// The same schema object drives both data generation and reading.
auto json = GenerateTestData(parse_options.explicit_schema, num_rows);
180+
BenchmarkJSONReading(state, json, read_options, parse_options);
144181
}
145182

146183
static void ReadJSONBlockWithSchemaSingleThread(
@@ -153,12 +190,47 @@ static void ReadJSONBlockWithSchemaMultiThread(
153190
BenchmarkReadJSONBlockWithSchema(state, true);
154191
}
155192

193+
// Benchmark: parses generated rows while varying field order, schema presence,
// sparsity and field count.
//
// Benchmark arguments, read through state.range():
//   0: ordered     - non-zero keeps fields in declaration order; zero randomizes it
//   1: with_schema - non-zero passes an explicit schema and errors on unexpected fields
//   2: sparsity    - percentage, divided by 100 here and used as 1 - field_probability
//   3: num_fields  - number of fields passed to GenerateTestFields
static void ParseJSONFields(benchmark::State& state) { // NOLINT non-const reference
194+
const bool ordered = !!state.range(0);
195+
const bool with_schema = !!state.range(1);
196+
const double sparsity = state.range(2) / 100.0;
197+
const auto num_fields = static_cast<int>(state.range(3));
198+
199+
// This would generate approximately 400 kB of JSON data
// NOTE(review): (1.0 - sparsity) is a divisor, so a sparsity argument of 100
// would divide by zero - the registered values appear to stay below that; confirm
// before adding new ones.
200+
int32_t num_rows = static_cast<int32_t>(2e4 / (1.0 - sparsity) / num_fields);
201+
// ... however, we want enough rows to make setup/finish overhead negligible
202+
num_rows = std::max<int32_t>(num_rows, 200);
203+
// ... and also we want to avoid an "Exceeded maximum rows" error.
204+
num_rows = std::min<int32_t>(num_rows, kMaxParserNumRows);
205+
// In the end, we will empirically generate between 400 kB and 4 MB of JSON data.
206+
207+
// Field names are generated with a mean length of 10 characters.
auto fields = GenerateTestFields(num_fields, 10);
208+
209+
auto parse_options = ParseOptions::Defaults();
210+
// Without a schema the parse options are left at their defaults.
if (with_schema) {
211+
parse_options.explicit_schema = schema(fields);
212+
parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
213+
}
214+
215+
auto gen_options = GenerateOptions::Defaults();
216+
gen_options.field_probability = 1.0 - sparsity;
217+
gen_options.randomize_field_order = !ordered;
218+
219+
auto json = GenerateTestData(fields, num_rows, gen_options);
220+
BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), parse_options);
221+
}
222+
156223
BENCHMARK(ChunkJSONPrettyPrinted);
157224
BENCHMARK(ChunkJSONLineDelimited);
158225
BENCHMARK(ParseJSONBlockWithSchema);
159226

160227
BENCHMARK(ReadJSONBlockWithSchemaSingleThread);
161228
BENCHMARK(ReadJSONBlockWithSchemaMultiThread)->UseRealTime();
162229

230+
// Registers ParseJSONFields; the four argument lists below correspond, in order,
// to state.range(0)..state.range(3) as read inside ParseJSONFields.
BENCHMARK(ParseJSONFields)
231+
// NOTE: "sparsity" is the percentage of missing fields
232+
->ArgNames({"ordered", "schema", "sparsity", "num_fields"})
233+
->ArgsProduct({{1, 0}, {1, 0}, {0, 10, 90}, {10, 100, 1000}});
234+
163235
} // namespace json
164236
} // namespace arrow

cpp/src/arrow/json/test_common.h

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,34 @@ using rj::StringBuffer;
5454
using std::string_view;
5555
using Writer = rj::Writer<StringBuffer>;
5656

57+
// Knobs controlling the random JSON generators in this header.
struct GenerateOptions {
58+
// Probability of a field being written
59+
double field_probability = 1.0;
60+
// Probability of a value being null
61+
double null_probability = 0.2;
62+
// Whether to randomize the order of written fields
63+
bool randomize_field_order = false;
64+
65+
// Returns an instance with every member at its in-class default.
static constexpr GenerateOptions Defaults() { return GenerateOptions{}; }
66+
};
67+
5768
// Converts a boolean success flag into a Status: Status::OK() when `ok` is true,
// otherwise Status::Invalid with an empty message.
inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); }
5869

5970
// Forward declaration of the DataType overload of Generate.
// `options` defaults to GenerateOptions::Defaults() when omitted.
template <typename Engine>
60-
inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
61-
Writer* writer);
71+
inline static Status Generate(
72+
const std::shared_ptr<DataType>& type, Engine& e, Writer* writer,
73+
const GenerateOptions& options = GenerateOptions::Defaults());
6274

6375
// Forward declaration of the field-vector overload of Generate.
// `options` defaults to GenerateOptions::Defaults() when omitted.
template <typename Engine>
64-
inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
65-
Engine& e, Writer* writer);
76+
inline static Status Generate(
77+
const std::vector<std::shared_ptr<Field>>& fields, Engine& e, Writer* writer,
78+
const GenerateOptions& options = GenerateOptions::Defaults());
6679

6780
// Schema overload of Generate: forwards the schema's fields, together with the
// engine, writer and options, to the field-vector overload.
template <typename Engine>
68-
inline static Status Generate(const std::shared_ptr<Schema>& schm, Engine& e,
69-
Writer* writer) {
70-
return Generate(schm->fields(), e, writer);
81+
inline static Status Generate(
82+
const std::shared_ptr<Schema>& schm, Engine& e, Writer* writer,
83+
const GenerateOptions& options = GenerateOptions::Defaults()) {
84+
return Generate(schm->fields(), e, writer, options);
7185
}
7286

7387
template <typename Engine>
@@ -99,7 +113,7 @@ struct GenerateImpl {
99113
template <typename T>
100114
enable_if_base_binary<T, Status> Visit(const T&) {
101115
auto size = std::poisson_distribution<>{4}(e);
102-
std::uniform_int_distribution<uint16_t> gen_char(32, 127); // FIXME generate UTF8
116+
std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME generate UTF8
103117
std::string s(size, '\0');
104118
for (char& ch : s) ch = static_cast<char>(gen_char(e));
105119
return OK(writer.String(s.c_str()));
@@ -109,11 +123,13 @@ struct GenerateImpl {
109123
enable_if_list_like<T, Status> Visit(const T& t) {
110124
auto size = std::poisson_distribution<>{4}(e);
111125
writer.StartArray();
112-
for (int i = 0; i < size; ++i) RETURN_NOT_OK(Generate(t.value_type(), e, &writer));
126+
for (int i = 0; i < size; ++i) {
127+
RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options));
128+
}
113129
return OK(writer.EndArray(size));
114130
}
115131

116-
Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer); }
132+
// Struct values: delegate to the field-vector Generate overload with the struct's
// fields, forwarding this visitor's engine, writer and options.
Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); }
117133

118134
// Day-time interval values: returns the result of NotImplemented(t)
// (presumably an error Status - NotImplemented is defined outside this view).
Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
119135

@@ -135,29 +151,54 @@ struct GenerateImpl {
135151

136152
Engine& e;
137153
rj::Writer<rj::StringBuffer>& writer;
154+
const GenerateOptions& options;
138155
};
139156

140157
// Writes one random JSON value for `type` to `writer`.
//
// With probability options.null_probability a JSON null is written and the
// function returns Status::OK(); otherwise a GenerateImpl visitor is dispatched
// on the type through VisitTypeInline and its Status is returned.
template <typename Engine>
141158
inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
142-
Writer* writer) {
159+
Writer* writer, const GenerateOptions& options) {
143-
if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
144-
// one out of 5 chance of null, anywhere
160+
// The null decision is made before the type is inspected.
if (std::bernoulli_distribution(options.null_probability)(e)) {
145161
writer->Null();
146162
return Status::OK();
147163
}
148-
GenerateImpl<Engine> visitor = {e, *writer};
164+
// The visitor holds the options by reference (see GenerateImpl::options).
GenerateImpl<Engine> visitor = {e, *writer, options};
149165
return VisitTypeInline(*type, &visitor);
150166
}
151167

152168
// Writes one JSON object whose members are drawn from `fields`.
//
// Each field is included independently with probability
// options.field_probability. When options.randomize_field_order is set, the
// selected fields are shuffled before being written; otherwise they are written
// in the order they appear in `fields`. The object is closed with the number of
// members actually written.
template <typename Engine>
153169
inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
154-
Engine& e, Writer* writer) {
170+
Engine& e, Writer* writer, const GenerateOptions& options) {
155171
RETURN_NOT_OK(OK(writer->StartObject()));
156-
for (const auto& f : fields) {
157-
writer->Key(f->name().c_str());
158-
RETURN_NOT_OK(Generate(f->type(), e, writer));
172+
173+
// Counts the members emitted so far; passed to EndObject at the end.
int num_fields = 0;
174+
// Writes a single key/value pair and returns the Status of the value generation.
// NOTE(review): the return value of writer->Key() is not checked here.
auto write_field = [&](const Field& f) {
175+
++num_fields;
176+
writer->Key(f.name().c_str());
177+
return Generate(f.type(), e, writer, options);
178+
};
179+
180+
std::bernoulli_distribution bool_dist(options.field_probability);
181+
if (options.randomize_field_order) {
182+
// First select which fields to keep (by index), then shuffle the selection.
std::vector<size_t> indices;
183+
// Capacity hint only: the expected number of selected fields.
indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
184+
for (size_t i = 0; i < fields.size(); ++i) {
185+
if (bool_dist(e)) {
186+
indices.push_back(i);
187+
}
188+
}
189+
std::shuffle(indices.begin(), indices.end(), e);
190+
for (auto i : indices) {
191+
RETURN_NOT_OK(write_field(*fields[i]));
192+
}
193+
} else {
194+
// Declaration order: select and write each field in a single pass.
for (const auto& f : fields) {
195+
if (bool_dist(e)) {
196+
RETURN_NOT_OK(write_field(*f));
197+
}
198+
}
159199
}
160-
return OK(writer->EndObject(static_cast<int>(fields.size())));
200+
201+
return OK(writer->EndObject(num_fields));
161202
}
162203

163204
inline static Status MakeStream(string_view src_str,

0 commit comments

Comments
 (0)