simdjson/include/simdjson/dom/parser.h at master · JavaScriptExpert/simdjson

History

429 lines (393 loc) · 18.3 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

#ifndef SIMDJSON_DOM_PARSER_H

#define SIMDJSON_DOM_PARSER_H

#include "simdjson/common_defs.h"

#include "simdjson/dom/document.h"

#include "simdjson/error.h"

#include "simdjson/internal/dom_parser_implementation.h"

#include "simdjson/internal/tape_ref.h"

#include "simdjson/minify.h"

#include "simdjson/padded_string.h"

#include "simdjson/portability.h"

#include <memory>

#include <ostream>

#include <string>

namespace simdjson {

namespace dom {

// Forward declarations: both types are only named in the signatures and
// friend declarations of the parser class declared in this header.
class element;
class document_stream;

/**
 * Batch size, in bytes, used by parser.parse_many() and parser.load_many()
 * when the caller does not pass one explicitly (one million bytes, i.e. 1MB).
 */
static constexpr size_t DEFAULT_BATCH_SIZE = 1000 * 1000;

/**

* A persistent document parser.

* The parser is designed to be reused, holding the internal buffers necessary to do parsing,

* as well as memory for a single document. The parsed document is overwritten on each parse.

* This class cannot be copied, only moved, to avoid unintended allocations.

* @note This is not thread safe: one parser cannot produce two documents at the same time!

class parser {

public:

/**

* Create a JSON parser.

* The new parser will have zero capacity.

* @param max_capacity The maximum document length the parser can automatically handle. The parser

* will allocate more capacity on an as needed basis (when it sees documents too big to handle)

* up to this amount. The parser still starts with zero capacity no matter what this number is:

* to allocate an initial capacity, call allocate() after constructing the parser.

* Defaults to SIMDJSON_MAXSIZE_BYTES (the largest single document simdjson can process).

really_inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept;

/**

* Take another parser's buffers and state.

* @param other The parser to take. Its capacity is zeroed.

really_inline parser(parser &&other) noexcept;

parser(const parser &) = delete; ///< @private Disallow copying

/**

* Take another parser's buffers and state.

* @param other The parser to take. Its capacity is zeroed.

really_inline parser &operator=(parser &&other) noexcept;

parser &operator=(const parser &) = delete; ///< @private Disallow copying

/** Deallocate the JSON parser. */

~parser()=default;

/**

* Load a JSON document from a file and return a reference to it.

* dom::parser parser;

* const element doc = parser.load("jsonexamples/twitter.json");

* ### IMPORTANT: Document Lifetime

* The JSON document still lives in the parser: this is the most efficient way to parse JSON

* documents because it reuses the same buffers, but you *must* use the document before you

* destroy the parser or call parse() again.

* ### Parser Capacity

* If the parser's current capacity is less than the file length, it will allocate enough capacity

* to handle it (up to max_capacity).

* @param path The path to load.

* @return The document, or an error:

* - IO_ERROR if there was an error opening or reading the file.

* - MEMALLOC if the parser does not have enough capacity and memory allocation fails.

* - CAPACITY if the parser does not have enough capacity and len > max_capacity.

* - other json errors if parsing fails.

inline simdjson_result<element> load(const std::string &path) & noexcept;

inline simdjson_result<element> load(const std::string &path) && = delete ;

/**

* Parse a JSON document and return a temporary reference to it.

* dom::parser parser;

* element doc = parser.parse(buf, len);

* ### IMPORTANT: Document Lifetime

* The JSON document still lives in the parser: this is the most efficient way to parse JSON

* documents because it reuses the same buffers, but you *must* use the document before you

* destroy the parser or call parse() again.

* ### REQUIRED: Buffer Padding

* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what

* those bytes are initialized to, as long as they are allocated.

* If realloc_if_needed is true, it is assumed that the buffer does *not* have enough padding,

* and it is copied into an enlarged temporary buffer before parsing.

* ### Parser Capacity

* If the parser's current capacity is less than len, it will allocate enough capacity

* to handle it (up to max_capacity).

* @param buf The JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes, unless

* realloc_if_needed is true.

* @param len The length of the JSON.

* @param realloc_if_needed Whether to reallocate and enlarge the JSON buffer to add padding.

* @return The document, or an error:

* - MEMALLOC if realloc_if_needed is true or the parser does not have enough capacity,

* and memory allocation fails.

* - CAPACITY if the parser does not have enough capacity and len > max_capacity.

* - other json errors if parsing fails.

inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) & noexcept;

inline simdjson_result<element> parse(const uint8_t *buf, size_t len, bool realloc_if_needed = true) && =delete;

/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */

really_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) & noexcept;

really_inline simdjson_result<element> parse(const char *buf, size_t len, bool realloc_if_needed = true) && =delete;

/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */

really_inline simdjson_result<element> parse(const std::string &s) & noexcept;

really_inline simdjson_result<element> parse(const std::string &s) && =delete;

/** @overload parse(const uint8_t *buf, size_t len, bool realloc_if_needed) */

really_inline simdjson_result<element> parse(const padded_string &s) & noexcept;

really_inline simdjson_result<element> parse(const padded_string &s) && =delete;

/** @private We do not want to allow implicit conversion from C string to std::string. */

really_inline simdjson_result<element> parse(const char *buf) noexcept = delete;

/**

* Load a file containing many JSON documents.

* dom::parser parser;

* for (const element doc : parser.load_many(path)) {

* cout << std::string(doc["title"]) << endl;

* }

* ### Format

* The file must contain a series of one or more JSON documents, concatenated into a single

* buffer, separated by whitespace. It effectively parses until it has a fully valid document,

* then starts parsing the next document at that point. (It does this with more parallelism and

* lookahead than you might think, though.)

* documents that consist of an object or array may omit the whitespace between them, concatenating

* with no separator. documents that consist of a single primitive (i.e. documents that are not

* arrays or objects) MUST be separated with whitespace.

* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.

* Setting batch_size to excessively large or excesively small values may impact negatively the

* performance.

* ### Error Handling

* All errors are returned during iteration: if there is a global error such as memory allocation,

* it will be yielded as the first result. Iteration always stops after the first error.

* As with all other simdjson methods, non-exception error handling is readily available through

* the same interface, requiring you to check the error before using the document:

* dom::parser parser;

* dom::document_stream docs;

* auto error = parser.load_many(path).get(docs);

* if (error) { cerr << error << endl; exit(1); }

* for (auto doc : docs) {

* std::string_view title;

* if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); }

* cout << title << endl;

* }

* ### Threads

* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the

* hood to do some lookahead.

* ### Parser Capacity

* If the parser's current capacity is less than batch_size, it will allocate enough capacity

* to handle it (up to max_capacity).

* @param path File name pointing at the concatenated JSON to parse.

* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet

* spot is cache-related: small enough to fit in cache, yet big enough to

* parse as many documents as possible in one tight loop.

* Defaults to 1MB (as simdjson::dom::DEFAULT_BATCH_SIZE), which has been a reasonable sweet spot in our tests.

* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:

* - IO_ERROR if there was an error opening or reading the file.

* - MEMALLOC if the parser does not have enough capacity and memory allocation fails.

* - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.

* - other json errors if parsing fails.

inline simdjson_result<document_stream> load_many(const std::string &path, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;

/**

* Parse a buffer containing many JSON documents.

* dom::parser parser;

* for (element doc : parser.parse_many(buf, len)) {

* cout << std::string(doc["title"]) << endl;

* }

* ### Format

* The buffer must contain a series of one or more JSON documents, concatenated into a single

* buffer, separated by whitespace. It effectively parses until it has a fully valid document,

* then starts parsing the next document at that point. (It does this with more parallelism and

* lookahead than you might think, though.)

* documents that consist of an object or array may omit the whitespace between them, concatenating

* with no separator. documents that consist of a single primitive (i.e. documents that are not

* arrays or objects) MUST be separated with whitespace.

* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.

* Setting batch_size to excessively large or excesively small values may impact negatively the

* performance.

* ### Error Handling

* All errors are returned during iteration: if there is a global error such as memory allocation,

* it will be yielded as the first result. Iteration always stops after the first error.

* As with all other simdjson methods, non-exception error handling is readily available through

* the same interface, requiring you to check the error before using the document:

* dom::parser parser;

* dom::document_stream docs;

* auto error = parser.load_many(path).get(docs);

* if (error) { cerr << error << endl; exit(1); }

* for (auto doc : docs) {

* std::string_view title;

* if ((error = doc["title"].get(title)) { cerr << error << endl; exit(1); }

* cout << title << endl;

* }

* ### REQUIRED: Buffer Padding

* The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what

* those bytes are initialized to, as long as they are allocated.

* ### Threads

* When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the

* hood to do some lookahead.

* ### Parser Capacity

* If the parser's current capacity is less than batch_size, it will allocate enough capacity

* to handle it (up to max_capacity).

* @param buf The concatenated JSON to parse. Must have at least len + SIMDJSON_PADDING allocated bytes.

* @param len The length of the concatenated JSON.

* @param batch_size The batch size to use. MUST be larger than the largest document. The sweet

* spot is cache-related: small enough to fit in cache, yet big enough to

* parse as many documents as possible in one tight loop.

* Defaults to 10MB, which has been a reasonable sweet spot in our tests.

* @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors:

* - MEMALLOC if the parser does not have enough capacity and memory allocation fails

* - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity.

* - other json errors if parsing fails.

inline simdjson_result<document_stream> parse_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;

/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */

inline simdjson_result<document_stream> parse_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;

/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */

inline simdjson_result<document_stream> parse_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;

/** @overload parse_many(const uint8_t *buf, size_t len, size_t batch_size) */

inline simdjson_result<document_stream> parse_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept;

/** @private We do not want to allow implicit conversion from C string to std::string. */

simdjson_result<document_stream> parse_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete;

/**

* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length

* and `max_depth` depth.

* @param capacity The new capacity.

* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.

* @return The error, if there is one.

WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;

/**

* @private deprecated because it returns bool instead of error_code, which is our standard for

* failures. Use allocate() instead.

* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length

* and `max_depth` depth.

* @param capacity The new capacity.

* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.

* @return true if successful, false if allocation failed.

[[deprecated("Use allocate() instead.")]]

WARN_UNUSED inline bool allocate_capacity(size_t capacity, size_t max_depth = DEFAULT_MAX_DEPTH) noexcept;

/**

* The largest document this parser can support without reallocating.

* @return Current capacity, in bytes.

really_inline size_t capacity() const noexcept;

/**

* The largest document this parser can automatically support.

* The parser may reallocate internal buffers as needed up to this amount.

* @return Maximum capacity, in bytes.

really_inline size_t max_capacity() const noexcept;

/**

* The maximum level of nested object and arrays supported by this parser.

* @return Maximum depth, in bytes.

really_inline size_t max_depth() const noexcept;

/**

* Set max_capacity. This is the largest document this parser can automatically support.

* The parser may reallocate internal buffers as needed up to this amount as documents are passed

* to it.

* This call will not allocate or deallocate, even if capacity is currently above max_capacity.

* @param max_capacity The new maximum capacity, in bytes.

really_inline void set_max_capacity(size_t max_capacity) noexcept;

/** @private Use the new DOM API instead */

class Iterator;

/** @private Use simdjson_error instead */

using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error;

/** @private [for benchmarking access] The implementation to use */

std::unique_ptr<internal::dom_parser_implementation> implementation{};

/** @private Use `if (parser.parse(...).error())` instead */

bool valid{false};

/** @private Use `parser.parse(...).error()` instead */

error_code error{UNINITIALIZED};

/** @private Use `parser.parse(...).value()` instead */

document doc{};

/** @private returns true if the document parsed was valid */

[[deprecated("Use the result of parser.parse() instead")]]

inline bool is_valid() const noexcept;

/**

* @private return an error code corresponding to the last parsing attempt, see

* simdjson.h will return UNITIALIZED if no parsing was attempted

[[deprecated("Use the result of parser.parse() instead")]]

inline int get_error_code() const noexcept;

/** @private return the string equivalent of "get_error_code" */

[[deprecated("Use error_message() on the result of parser.parse() instead, or cout << error")]]

inline std::string get_error_message() const noexcept;

/** @private */

[[deprecated("Use cout << on the result of parser.parse() instead")]]

inline bool print_json(std::ostream &os) const noexcept;

/** @private Private and deprecated: use `parser.parse(...).doc.dump_raw_tape()` instead */

inline bool dump_raw_tape(std::ostream &os) const noexcept;

private:

/**

* The maximum document length this parser will automatically support.

* The parser will not be automatically allocated above this amount.

size_t _max_capacity;

/**

* The loaded buffer (reused each time load() is called)

std::unique_ptr<char[]> loaded_bytes;

/** Capacity of loaded_bytes buffer. */

size_t _loaded_bytes_capacity{0};

// all nodes are stored on the doc.tape using a 64-bit word.

// strings, double and ints are stored as

// a 64-bit word with a pointer to the actual value

// for objects or arrays, store [ or { at the beginning and } and ] at the

// end. For the openings ([ or {), we annotate them with a reference to the

// location on the doc.tape of the end, and for then closings (} and ]), we

// annotate them with a reference to the location of the opening

/**

* Ensure we have enough capacity to handle at least desired_capacity bytes,

* and auto-allocate if not.

inline error_code ensure_capacity(size_t desired_capacity) noexcept;

/** Read the file into loaded_bytes */

inline simdjson_result<size_t> read_file(const std::string &path) noexcept;

friend class parser::Iterator;

friend class document_stream;

}; // class parser

} // namespace dom

} // namespace simdjson

#endif // SIMDJSON_DOM_PARSER_H

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

parser.h

Latest commit

History

parser.h

File metadata and controls