1717
1818#include " benchmark/benchmark.h"
1919
20- #include < string >
20+ #include < unordered_set >
2121
2222#include " arrow/json/chunker.h"
2323#include " arrow/json/options.h"
3030namespace arrow {
3131namespace json {
3232
33- std::shared_ptr<Schema> TestSchema () {
34- return schema ({field (" int" , int32 ()), field (" str" , utf8 ())});
35- }
36-
37- constexpr int seed = 0x432432 ;
// Fixed seed used to construct the std::default_random_engine instances in this
// file (test data generation and test field generation).
33+ constexpr int kSeed = 0x432432 ;
3834
39- std::string TestJsonData (int num_rows, bool pretty = false ) {
40- std::default_random_engine engine (seed);
35+ template <typename Input>
36+ std::string GenerateTestData (const Input& input, int num_rows,
37+ const GenerateOptions& options, bool pretty = false ) {
38+ std::default_random_engine engine (kSeed );
4139 std::string json;
4240 for (int i = 0 ; i < num_rows; ++i) {
4341 StringBuffer sb;
4442 Writer writer (sb);
45- ABORT_NOT_OK (Generate (TestSchema () , engine, &writer));
43+ ABORT_NOT_OK (Generate (input , engine, &writer, options ));
4644 json += pretty ? PrettyPrint (sb.GetString ()) : sb.GetString ();
4745 json += " \n " ;
4846 }
49-
5047 return json;
5148}
5249
53- static void BenchmarkJSONChunking (benchmark::State& state,
50+ template <typename Input>
51+ std::string GenerateTestData (const Input& input, int num_rows, bool pretty = false ) {
52+ return GenerateTestData (input, num_rows, GenerateOptions::Defaults (), pretty);
53+ }
54+
55+ FieldVector GenerateTestFields (int num_fields, int mean_name_length) {
56+ const std::shared_ptr<DataType> types[] = {boolean (), int64 (), float64 (), utf8 ()};
57+
58+ std::default_random_engine engine (kSeed );
59+
60+ std::poisson_distribution<int > length_dist (mean_name_length);
61+ std::uniform_int_distribution<uint16_t > char_dist (32 , 126 );
62+ std::uniform_int_distribution<size_t > type_dist (0 , std::size (types) - 1 );
63+
64+ std::unordered_set<std::string> names;
65+ names.reserve (num_fields);
66+
67+ while (static_cast <int >(names.size ()) < num_fields) {
68+ auto length = length_dist (engine);
69+ if (!length) continue ;
70+ std::string name (length, ' \0 ' );
71+ for (auto & ch : name) ch = static_cast <char >(char_dist (engine));
72+ names.emplace (std::move (name));
73+ }
74+
75+ FieldVector fields;
76+ fields.reserve (num_fields);
77+ for (const auto & name : names) {
78+ fields.push_back (field (name, types[type_dist (engine)]));
79+ }
80+
81+ return fields;
82+ }
83+
84+ FieldVector TestFields () { return {field (" int" , int32 ()), field (" str" , utf8 ())}; }
85+
86+ static void BenchmarkJSONChunking (benchmark::State& state, // NOLINT non-const reference
5487 const std::shared_ptr<Buffer>& json,
55- ParseOptions options) { // NOLINT non-const reference
88+ ParseOptions options) {
5689 auto chunker = MakeChunker (options);
5790
5891 for (auto _ : state) {
@@ -61,6 +94,7 @@ static void BenchmarkJSONChunking(benchmark::State& state,
6194 }
6295
6396 state.SetBytesProcessed (state.iterations () * json->size ());
97+ state.counters [" json_size" ] = static_cast <double >(json->size ());
6498}
6599
66100static void ChunkJSONPrettyPrinted (
@@ -69,9 +103,9 @@ static void ChunkJSONPrettyPrinted(
69103
70104 auto options = ParseOptions::Defaults ();
71105 options.newlines_in_values = true ;
72- options.explicit_schema = TestSchema ( );
106+ options.explicit_schema = schema ( TestFields () );
73107
74- auto json = TestJsonData ( num_rows, /* pretty */ true );
108+ auto json = GenerateTestData (options. explicit_schema , num_rows, /* pretty= */ true );
75109 BenchmarkJSONChunking (state, std::make_shared<Buffer>(json), options);
76110}
77111
@@ -81,15 +115,15 @@ static void ChunkJSONLineDelimited(
81115
82116 auto options = ParseOptions::Defaults ();
83117 options.newlines_in_values = false ;
84- options.explicit_schema = TestSchema ( );
118+ options.explicit_schema = schema ( TestFields () );
85119
86- auto json = TestJsonData ( num_rows);
120+ auto json = GenerateTestData (options. explicit_schema , num_rows);
87121 BenchmarkJSONChunking (state, std::make_shared<Buffer>(json), options);
88122 state.SetBytesProcessed (0 );
89123}
90124
91125static void BenchmarkJSONParsing (benchmark::State& state, // NOLINT non-const reference
92- const std::shared_ptr<Buffer>& json, int32_t num_rows,
126+ const std::shared_ptr<Buffer>& json,
93127 ParseOptions options) {
94128 for (auto _ : state) {
95129 std::unique_ptr<BlockParser> parser;
@@ -100,22 +134,23 @@ static void BenchmarkJSONParsing(benchmark::State& state, // NOLINT non-const r
100134 ABORT_NOT_OK (parser->Finish (&parsed));
101135 }
102136 state.SetBytesProcessed (state.iterations () * json->size ());
137+ state.counters [" json_size" ] = static_cast <double >(json->size ());
103138}
104139
105140static void ParseJSONBlockWithSchema (
106141 benchmark::State& state) { // NOLINT non-const reference
107142 const int32_t num_rows = 5000 ;
108143 auto options = ParseOptions::Defaults ();
109144 options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
110- options.explicit_schema = TestSchema ( );
145+ options.explicit_schema = schema ( TestFields () );
111146
112- auto json = TestJsonData ( num_rows);
113- BenchmarkJSONParsing (state, std::make_shared<Buffer>(json), num_rows, options);
147+ auto json = GenerateTestData (options. explicit_schema , num_rows);
148+ BenchmarkJSONParsing (state, std::make_shared<Buffer>(json), options);
114149}
115150
116151static void BenchmarkJSONReading (benchmark::State& state, // NOLINT non-const reference
117- const std::string& json, int32_t num_rows ,
118- ReadOptions read_options, ParseOptions parse_options) {
152+ const std::string& json, ReadOptions read_options ,
153+ ParseOptions parse_options) {
119154 for (auto _ : state) {
120155 std::shared_ptr<io::InputStream> input;
121156 ABORT_NOT_OK (MakeStream (json, &input));
@@ -127,20 +162,22 @@ static void BenchmarkJSONReading(benchmark::State& state, // NOLINT non-const r
127162 }
128163
129164 state.SetBytesProcessed (state.iterations () * json.size ());
165+ state.counters [" json_size" ] = static_cast <double >(json.size ());
130166}
131167
132168static void BenchmarkReadJSONBlockWithSchema (
133- benchmark::State& state, bool use_threads) { // NOLINT non-const reference
169+ benchmark::State& state, // NOLINT non-const reference
170+ bool use_threads) {
134171 const int32_t num_rows = 500000 ;
135172 auto read_options = ReadOptions::Defaults ();
136173 read_options.use_threads = use_threads;
137174
138175 auto parse_options = ParseOptions::Defaults ();
139176 parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
140- parse_options.explicit_schema = TestSchema ( );
177+ parse_options.explicit_schema = schema ( TestFields () );
141178
142- auto json = TestJsonData ( num_rows);
143- BenchmarkJSONReading (state, json, num_rows, read_options, parse_options);
179+ auto json = GenerateTestData (parse_options. explicit_schema , num_rows);
180+ BenchmarkJSONReading (state, json, read_options, parse_options);
144181}
145182
146183static void ReadJSONBlockWithSchemaSingleThread (
@@ -153,12 +190,47 @@ static void ReadJSONBlockWithSchemaMultiThread(
153190 BenchmarkReadJSONBlockWithSchema (state, true );
154191}
155192
193+ static void ParseJSONFields (benchmark::State& state) { // NOLINT non-const reference
194+ const bool ordered = !!state.range (0 );
195+ const bool with_schema = !!state.range (1 );
196+ const double sparsity = state.range (2 ) / 100.0 ;
197+ const auto num_fields = static_cast <int >(state.range (3 ));
198+
199+ // This would generate approximately 400 kB of JSON data
200+ int32_t num_rows = static_cast <int32_t >(2e4 / (1.0 - sparsity) / num_fields);
201+ // ... however, we want enough rows to make setup/finish overhead negligible
202+ num_rows = std::max<int32_t >(num_rows, 200 );
203+ // ... and also we want to avoid an "Exceeded maximum rows" error.
204+ num_rows = std::min<int32_t >(num_rows, kMaxParserNumRows );
205+ // In the end, we will empirically generate between 400 kB and 4 MB of JSON data.
206+
207+ auto fields = GenerateTestFields (num_fields, 10 );
208+
209+ auto parse_options = ParseOptions::Defaults ();
210+ if (with_schema) {
211+ parse_options.explicit_schema = schema (fields);
212+ parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
213+ }
214+
215+ auto gen_options = GenerateOptions::Defaults ();
216+ gen_options.field_probability = 1.0 - sparsity;
217+ gen_options.randomize_field_order = !ordered;
218+
219+ auto json = GenerateTestData (fields, num_rows, gen_options);
220+ BenchmarkJSONParsing (state, std::make_shared<Buffer>(json), parse_options);
221+ }
222+
// Register the chunking and block-parsing benchmarks.
156223BENCHMARK (ChunkJSONPrettyPrinted);
157224BENCHMARK (ChunkJSONLineDelimited);
158225BENCHMARK (ParseJSONBlockWithSchema);
159226
// Register the table-reading benchmarks; the multi-threaded variant is
// registered with UseRealTime().
160227BENCHMARK (ReadJSONBlockWithSchemaSingleThread);
161228BENCHMARK (ReadJSONBlockWithSchemaMultiThread)->UseRealTime ();
162229
// Register ParseJSONFields over the product of the argument value lists below;
// the argument order matches the state.range() indices read by ParseJSONFields.
230+ BENCHMARK (ParseJSONFields)
231+ // NOTE: "sparsity" is the percentage of missing fields
232+ ->ArgNames ({" ordered" , " schema" , " sparsity" , " num_fields" })
233+ ->ArgsProduct({{1 , 0 }, {1 , 0 }, {0 , 10 , 90 }, {10 , 100 , 1000 }});
234+
163235} // namespace json
164236} // namespace arrow
0 commit comments