forked from simdjson/simdjson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.h
More file actions
429 lines (393 loc) · 18.3 KB
/
parser.h
File metadata and controls
429 lines (393 loc) · 18.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
#ifndef SIMDJSON_DOM_PARSER_H
#define SIMDJSON_DOM_PARSER_H
#include "simdjson/common_defs.h"
#include "simdjson/dom/document.h"
#include "simdjson/error.h"
#include "simdjson/internal/dom_parser_implementation.h"
#include "simdjson/internal/tape_ref.h"
#include "simdjson/minify.h"
#include "simdjson/padded_string.h"
#include "simdjson/portability.h"
#include <memory>
#include <ostream>
#include <string>
namespace simdjson {
namespace dom {
class document_stream;
class element;
/** The default batch size for parser.parse_many() and parser.load_many() */
static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
/**
* A persistent document parser.
*
* The parser is designed to be reused, holding the internal buffers necessary to do parsing,
* as well as memory for a single document. The parsed document is overwritten on each parse.
*
* This class cannot be copied, only moved, to avoid unintended allocations.
*
* @note This is not thread safe: one parser cannot produce two documents at the same time!
*/
class parser {
public:
/**
* Create a JSON parser.
*
* The new parser will have zero capacity.
*
* @param max_capacity The maximum document length the parser can automatically handle. The parser
* will allocate more capacity on an as needed basis (when it sees documents too big to handle)
* up to this amount. The parser still starts with zero capacity no matter what this number is:
* to allocate an initial capacity, call allocate() after constructing the parser.
* Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process).
*/
really_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept;
/**
* Take another parser's buffers and state.
*
* @param other The parser to take. Its capacity is zeroed.
*/
really_inline parser(parser &&other) noexcept;
parser(const parser &) = delete; ///< @private Disallow copying
/**
* Take another parser's buffers and state.
*
* @param other The parser to take. Its capacity is zeroed.
*/
really_inline parser &operator=(parser &&other) noexcept;
parser &operator=(const parser &) = delete; ///< @private Disallow copying
/** Deallocate the JSON parser. */
~parser()=default;
/**
* Load a JSON document from a file and return a reference to it.
*
* dom::parser parser;
* const element doc = parser.load("jsonexamples/twitter.json");
*
* ### IMPORTANT: Document Lifetime
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* ### Parser Capacity
*
* If the parser's current capacity is less than the file length, it will allocate enough capacity
* to handle it (up to max_capacity).
*
* @param path The path to load.
* @return The document, or an error:
* - IO_ERROR if there was an error opening or reading the file.
* - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
* - CAPACITY if the parser does not have enough capacity and len > max_capacity.
* - other json errors if parsing fails.
*/
inline simdjson_result<element> load(const std::string &path) & noexcept;
inline simdjson_result<element> load(const std::string &path) && = delete ;
/**
* Parse a JSON document and return a temporary reference to it.
*
* dom::parser parser;
* element doc = parser.parse(buf, len);
*
* ### IMPORTANT: Document Lifetime
*
* The JSON document still lives in the parser: this is the most efficient way to parse JSON
* documents because it reuses the same buffers, but you *must* use the document before you
* destroy the parser or call parse() again.
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* If realloc_if_needed is true, it is assumed that the buffer does *not* have enough padding,
* and it is copied into an enlarged temporary buffer before parsing.
*
* ### Parser Capacity
*
* If the parser's current capacity is less than len, it will allocate enough capacity
* to handle it (up to max_capacity).
*
* @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless
* realloc_if_needed is true.
* @param len The length of the JSON.
* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.
* @return The document, or an error:
* - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,
* and memory allocation fails.
* - CAPACITY if the parser does not have enough capacity and len > max_capacity.
* - other json errors if parsing fails.
*/
inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept;
inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete;
/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
really_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept;
really_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete;
/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
really_inline simdjson_result<element> parse(const std::string &s) & noexcept;
really_inline simdjson_result<element> parse(const std::string &s) && =delete;
/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */
really_inline simdjson_result<element> parse(const padded_string &s) & noexcept;
really_inline simdjson_result<element> parse(const padded_string &s) && =delete;
/** @private We do not want to allow implicit conversion from C string to std::string. */
really_inline simdjson_result<element> parse(const char *buf) noexcept = delete;
/**
* Load a file containing many JSON documents.
*
* dom::parser parser;
* for (const element doc : parser.load_many(path)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The file must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
* Setting batch_size to excessively large or excesively small values may impact negatively the
* performance.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* dom::parser parser;
* dom::document_stream docs;
* auto error = parser.load_many(path).get(docs);
* if (error) { cerr << error << endl; exit(1); }
* for (auto doc : docs) {
* std::string_view title;
* if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); }
* cout << title << endl;
* }
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser's current capacity is less than batch_size, it will allocate enough capacity
* to handle it (up to max_capacity).
*
* @param path File name pointing at the concatenated JSON to parse.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests.
* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
* - IO_ERROR if there was an error opening or reading the file.
* - MEMALLOC if the parser does not have enough capacity and memory allocation fails.
* - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
* - other json errors if parsing fails.
*/
inline simdjson_result<document_stream> load_many(const std::string &path, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
/**
* Parse a buffer containing many JSON documents.
*
* dom::parser parser;
* for (element doc : parser.parse_many(buf, len)) {
* cout << std::string(doc["title"]) << endl;
* }
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
* buffer, separated by whitespace. It effectively parses until it has a fully valid document,
* then starts parsing the next document at that point. (It does this with more parallelism and
* lookahead than you might think, though.)
*
* documents that consist of an object or array may omit the whitespace between them, concatenating
* with no separator. documents that consist of a single primitive (i.e. documents that are not
* arrays or objects) MUST be separated with whitespace.
*
* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
* Setting batch_size to excessively large or excesively small values may impact negatively the
* performance.
*
* ### Error Handling
*
* All errors are returned during iteration: if there is a global error such as memory allocation,
* it will be yielded as the first result. Iteration always stops after the first error.
*
* As with all other simdjson methods, non-exception error handling is readily available through
* the same interface, requiring you to check the error before using the document:
*
* dom::parser parser;
* dom::document_stream docs;
* auto error = parser.load_many(path).get(docs);
* if (error) { cerr << error << endl; exit(1); }
* for (auto doc : docs) {
* std::string_view title;
* if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); }
* cout << title << endl;
* }
*
* ### REQUIRED: Buffer Padding
*
* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what
* those bytes are initialized to, as long as they are allocated.
*
* ### Threads
*
* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the
* hood to do some lookahead.
*
* ### Parser Capacity
*
* If the parser's current capacity is less than batch_size, it will allocate enough capacity
* to handle it (up to max_capacity).
*
* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.
* @param len The length of the concatenated JSON.
* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet
* spot is cache-related: small enough to fit in cache, yet big enough to
* parse as many documents as possible in one tight loop.
* Defaults to 10MB, which has been a reasonable sweet spot in our tests.
* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:
* - MEMALLOC if the parser does not have enough capacity and memory allocation fails
* - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.
* - other json errors if parsing fails.
*/
inline simdjson_result<document_stream> parse_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
inline simdjson_result<document_stream> parse_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
inline simdjson_result<document_stream> parse_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */
inline simdjson_result<document_stream> parse_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;
/** @private We do not want to allow implicit conversion from C string to std::string. */
simdjson_result<document_stream> parse_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete;
/**
* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
* and `max_depth` depth.
*
* @param capacity The new capacity.
* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
* @return The error, if there is one.
*/
WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;
/**
* @private deprecated because it returns bool instead of error_code, which is our standard for
* failures. Use allocate() instead.
*
* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
* and `max_depth` depth.
*
* @param capacity The new capacity.
* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
* @return true if successful, false if allocation failed.
*/
[[deprecated("Use allocate() instead.")]]
WARN_UNUSED inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;
/**
* The largest document this parser can support without reallocating.
*
* @return Current capacity, in bytes.
*/
really_inline size_t capacity() const noexcept;
/**
* The largest document this parser can automatically support.
*
* The parser may reallocate internal buffers as needed up to this amount.
*
* @return Maximum capacity, in bytes.
*/
really_inline size_t max_capacity() const noexcept;
/**
* The maximum level of nested object and arrays supported by this parser.
*
* @return Maximum depth, in bytes.
*/
really_inline size_t max_depth() const noexcept;
/**
* Set max_capacity. This is the largest document this parser can automatically support.
*
* The parser may reallocate internal buffers as needed up to this amount as documents are passed
* to it.
*
* This call will not allocate or deallocate, even if capacity is currently above max_capacity.
*
* @param max_capacity The new maximum capacity, in bytes.
*/
really_inline void set_max_capacity(size_t max_capacity) noexcept;
/** @private Use the new DOM API instead */
class Iterator;
/** @private Use simdjson_error instead */
using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error;
/** @private [for benchmarking access] The implementation to use */
std::unique_ptr<internal::dom_parser_implementation> implementation{};
/** @private Use `if (parser.parse(...).error())` instead */
bool valid{false};
/** @private Use `parser.parse(...).error()` instead */
error_code error{UNINITIALIZED};
/** @private Use `parser.parse(...).value()` instead */
document doc{};
/** @private returns true if the document parsed was valid */
[[deprecated("Use the result of parser.parse() instead")]]
inline bool is_valid() const noexcept;
/**
* @private return an error code corresponding to the last parsing attempt, see
* simdjson.h will return UNITIALIZED if no parsing was attempted
*/
[[deprecated("Use the result of parser.parse() instead")]]
inline int get_error_code() const noexcept;
/** @private return the string equivalent of "get_error_code" */
[[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]]
inline std::string get_error_message() const noexcept;
/** @private */
[[deprecated("Use cout << on the result of parser.parse() instead")]]
inline bool print_json(std::ostream &os) const noexcept;
/** @private Private and deprecated: use `parser.parse(...).doc.dump_raw_tape()` instead */
inline bool dump_raw_tape(std::ostream &os) const noexcept;
private:
/**
* The maximum document length this parser will automatically support.
*
* The parser will not be automatically allocated above this amount.
*/
size_t _max_capacity;
/**
* The loaded buffer (reused each time load() is called)
*/
std::unique_ptr<char[]> loaded_bytes;
/** Capacity of loaded_bytes buffer. */
size_t _loaded_bytes_capacity{0};
// all nodes are stored on the doc.tape using a 64-bit word.
//
// strings, double and ints are stored as
// a 64-bit word with a pointer to the actual value
//
//
//
// for objects or arrays, store [ or { at the beginning and } and ] at the
// end. For the openings ([ or {), we annotate them with a reference to the
// location on the doc.tape of the end, and for then closings (} and ]), we
// annotate them with a reference to the location of the opening
//
//
/**
* Ensure we have enough capacity to handle at least desired_capacity bytes,
* and auto-allocate if not.
*/
inline error_code ensure_capacity(size_t desired_capacity) noexcept;
/** Read the file into loaded_bytes */
inline simdjson_result<size_t> read_file(const std::string &path) noexcept;
friend class parser::Iterator;
friend class document_stream;
}; // class parser
} // namespace dom
} // namespace simdjson
#endif // SIMDJSON_DOM_PARSER_H