@@ -73,8 +73,6 @@ class json_structural_indexer {
7373 really_inline void step (const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept ;
7474 really_inline void next (simd::simd8x64<uint8_t > in, json_block block, size_t idx);
7575 really_inline error_code finish (dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76- static really_inline uint32_t find_next_document_index (dom_parser_implementation &parser);
77- static really_inline size_t trim_partial_utf8 (const uint8_t *buf, size_t len);
7876
7977 json_scanner scanner{};
8078 utf8_checker checker{};
@@ -98,7 +96,7 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu
9896// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
9997// to finish: utf-8 checks and generating the output from the last iteration.
10098//
101- // The reason we run 2 inputs at a time, is steps 2 and 3 are// still* not enough to soak up all
99+ // The reason we run 2 inputs at a time, is steps 2 and 3 are * still* not enough to soak up all
102100// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
103101// workout.
104102//
@@ -162,13 +160,6 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
162160 }
163161
164162 parser.n_structural_indexes = uint32_t (indexer.tail - parser.structural_indexes .get ());
165- // a valid JSON file cannot have zero structural indexes - we should have found something
166- if (unlikely (parser.n_structural_indexes == 0u )) {
167- return EMPTY;
168- }
169- if (unlikely (parser.structural_indexes [parser.n_structural_indexes - 1 ] > len)) {
170- return UNEXPECTED_ERROR;
171- }
172163 /* **
173164 * This is related to https://github.com/simdjson/simdjson/issues/906
174165 * Basically, we want to make sure that if the parsing continues beyond the last (valid)
@@ -186,102 +177,22 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
186177 parser.structural_indexes [parser.n_structural_indexes ] = uint32_t (len);
187178 parser.structural_indexes [parser.n_structural_indexes + 1 ] = uint32_t (len);
188179 parser.structural_indexes [parser.n_structural_indexes + 2 ] = 0 ;
180+ parser.next_structural_index = 0 ;
181+ // a valid JSON file cannot have zero structural indexes - we should have found something
182+ if (unlikely (parser.n_structural_indexes == 0u )) {
183+ return EMPTY;
184+ }
185+ if (unlikely (parser.structural_indexes [parser.n_structural_indexes - 1 ] > len)) {
186+ return UNEXPECTED_ERROR;
187+ }
189188 if (partial) {
190189 auto new_structural_indexes = find_next_document_index (parser);
191190 if (new_structural_indexes == 0 && parser.n_structural_indexes > 0 ) {
192191 return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
193192 }
194193 parser.n_structural_indexes = new_structural_indexes;
195194 }
196- parser.next_structural_index = 0 ;
197195 return checker.errors ();
198196}
199197
200- /* *
201- * This algorithm is used to quickly identify the last structural position that
202- * makes up a complete document.
203- *
204- * It does this by going backwards and finding the last *document boundary* (a
205- * place where one value follows another without a comma between them). If the
206- * last document (the characters after the boundary) has an equal number of
207- * start and end brackets, it is considered complete.
208- *
209- * Simply put, we iterate over the structural characters, starting from
210- * the end. We consider that we found the end of a JSON document when the
211- * first element of the pair is NOT one of these characters: '{' '[' ';' ','
212- * and when the second element is NOT one of these characters: '}' '}' ';' ','.
213- *
214- * This simple comparison works most of the time, but it does not cover cases
215- * where the batch's structural indexes contain a perfect amount of documents.
216- * In such a case, we do not have access to the structural index which follows
217- * the last document, therefore, we do not have access to the second element in
218- * the pair, and means that we cannot identify the last document. To fix this
219- * issue, we keep a count of the open and closed curly/square braces we found
220- * while searching for the pair. When we find a pair AND the count of open and
221- * closed curly/square braces is the same, we know that we just passed a
222- * complete
223- * document, therefore the last json buffer location is the end of the batch
224- */
225- really_inline uint32_t json_structural_indexer::find_next_document_index (dom_parser_implementation &parser) {
226- // TODO don't count separately, just figure out depth
227- auto arr_cnt = 0 ;
228- auto obj_cnt = 0 ;
229- for (auto i = parser.n_structural_indexes - 1 ; i > 0 ; i--) {
230- auto idxb = parser.structural_indexes [i];
231- switch (parser.buf [idxb]) {
232- case ' :' :
233- case ' ,' :
234- continue ;
235- case ' }' :
236- obj_cnt--;
237- continue ;
238- case ' ]' :
239- arr_cnt--;
240- continue ;
241- case ' {' :
242- obj_cnt++;
243- break ;
244- case ' [' :
245- arr_cnt++;
246- break ;
247- }
248- auto idxa = parser.structural_indexes [i - 1 ];
249- switch (parser.buf [idxa]) {
250- case ' {' :
251- case ' [' :
252- case ' :' :
253- case ' ,' :
254- continue ;
255- }
256- // Last document is complete, so the next document will appear after!
257- if (!arr_cnt && !obj_cnt) {
258- return parser.n_structural_indexes ;
259- }
260- // Last document is incomplete; mark the document at i + 1 as the next one
261- return i;
262- }
263- return 0 ;
264- }
265-
266- // Skip the last character if it is partial
267- really_inline size_t json_structural_indexer::trim_partial_utf8 (const uint8_t *buf, size_t len) {
268- if (unlikely (len < 3 )) {
269- switch (len) {
270- case 2 :
271- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
272- if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 2 bytes left
273- return len;
274- case 1 :
275- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
276- return len;
277- case 0 :
278- return len;
279- }
280- }
281- if (buf[len-1 ] >= 0b11000000 ) { return len-1 ; } // 2-, 3- and 4-byte characters with only 1 byte left
282- if (buf[len-2 ] >= 0b11100000 ) { return len-2 ; } // 3- and 4-byte characters with only 1 byte left
283- if (buf[len-3 ] >= 0b11110000 ) { return len-3 ; } // 4-byte characters with only 3 bytes left
284- return len;
285- }
286-
287198} // namespace stage1
0 commit comments