Skip to content

Commit 0dbda65

Browse files
committed
Fix fallback implementation
1 parent fe01da0 commit 0dbda65

File tree

6 files changed

+140
-113
lines changed

6 files changed

+140
-113
lines changed

src/arm64/dom_parser_implementation.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
8282
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
8383
}
8484

85+
#include "generic/stage1/find_next_document_index.h"
8586
#include "generic/stage1/utf8_lookup2_algorithm.h"
8687
#include "generic/stage1/json_structural_indexer.h"
8788
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

src/fallback/dom_parser_implementation.cpp

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,17 @@ namespace simdjson {
99
namespace fallback {
1010
namespace stage1 {
1111

12+
#include "generic/stage1/find_next_document_index.h"
13+
1214
class structural_scanner {
1315
public:
1416

15-
really_inline structural_scanner(dom_parser_implementation &_parser, bool _streaming)
17+
really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
1618
: buf{_parser.buf},
1719
next_structural_index{_parser.structural_indexes.get()},
1820
parser{_parser},
1921
len{static_cast<uint32_t>(_parser.len)},
20-
streaming{_streaming} {
22+
partial{_partial} {
2123
}
2224

2325
really_inline void add_structural() {
@@ -41,7 +43,12 @@ really_inline void validate_utf8_character() {
4143
// 2-byte
4244
if ((buf[idx] & 0b00100000) == 0) {
4345
// missing continuation
44-
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
46+
if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
47+
if (idx+1 > len && partial) { idx = len; return; }
48+
error = UTF8_ERROR;
49+
idx++;
50+
return;
51+
}
4552
// overlong: 1100000_ 10______
4653
if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
4754
idx += 2;
@@ -51,7 +58,12 @@ really_inline void validate_utf8_character() {
5158
// 3-byte
5259
if ((buf[idx] & 0b00010000) == 0) {
5360
// missing continuation
54-
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
61+
if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
62+
if (idx+2 > len && partial) { idx = len; return; }
63+
error = UTF8_ERROR;
64+
idx++;
65+
return;
66+
}
5567
// overlong: 11100000 100_____ ________
5668
if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
5769
// surrogates: U+D800-U+DFFF 11101101 101_____
@@ -62,7 +74,12 @@ really_inline void validate_utf8_character() {
6274

6375
// 4-byte
6476
// missing continuation
65-
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
77+
if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
78+
if (idx+2 > len && partial) { idx = len; return; }
79+
error = UTF8_ERROR;
80+
idx++;
81+
return;
82+
}
6683
// overlong: 11110000 1000____ ________ ________
6784
if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
6885
// too large: > U+10FFFF:
@@ -87,7 +104,7 @@ really_inline void validate_string() {
87104
idx++;
88105
}
89106
}
90-
if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
107+
if (idx >= len && !partial) { error = UNCLOSED_STRING; }
91108
}
92109

93110
really_inline bool is_whitespace_or_operator(uint8_t c) {
@@ -128,16 +145,26 @@ really_inline error_code scan() {
128145
break;
129146
}
130147
}
131-
if (unlikely(next_structural_index == parser.structural_indexes.get())) {
132-
return EMPTY;
133-
}
134148
*next_structural_index = len;
135-
next_structural_index++;
136149
// We pad beyond.
137150
// https://github.com/simdjson/simdjson/issues/906
138-
next_structural_index[0] = len;
139-
next_structural_index[1] = 0;
151+
next_structural_index[1] = len;
152+
next_structural_index[2] = 0;
140153
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
154+
parser.next_structural_index = 0;
155+
156+
if (unlikely(parser.n_structural_indexes == 0)) {
157+
return EMPTY;
158+
}
159+
160+
if (partial) {
161+
auto new_structural_indexes = find_next_document_index(parser);
162+
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
163+
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
164+
}
165+
parser.n_structural_indexes = new_structural_indexes;
166+
}
167+
141168
return error;
142169
}
143170

@@ -148,16 +175,16 @@ really_inline error_code scan() {
148175
uint32_t len;
149176
uint32_t idx{0};
150177
error_code error{SUCCESS};
151-
bool streaming;
178+
bool partial;
152179
}; // structural_scanner
153180

154181
} // namespace stage1
155182

156183

157-
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
184+
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
158185
this->buf = _buf;
159186
this->len = _len;
160-
stage1::structural_scanner scanner(*this, streaming);
187+
stage1::structural_scanner scanner(*this, partial);
161188
return scanner.scan();
162189
}
163190

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/**
2+
* This algorithm is used to quickly identify the last structural position that
3+
* makes up a complete document.
4+
*
5+
* It does this by going backwards and finding the last *document boundary* (a
6+
* place where one value follows another without a comma between them). If the
7+
* last document (the characters after the boundary) has an equal number of
8+
* start and end brackets, it is considered complete.
9+
*
10+
* Simply put, we iterate over the structural characters, starting from
11+
* the end. We consider that we found the end of a JSON document when the
12+
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
13+
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
14+
*
15+
* This simple comparison works most of the time, but it does not cover cases
16+
* where the batch's structural indexes contain a perfect amount of documents.
17+
* In such a case, we do not have access to the structural index which follows
18+
* the last document, therefore, we do not have access to the second element in
19+
* the pair, and that means we cannot identify the last document. To fix this
20+
* issue, we keep a count of the open and closed curly/square braces we found
21+
* while searching for the pair. When we find a pair AND the count of open and
22+
* closed curly/square braces is the same, we know that we just passed a
23+
* complete document, therefore the last json buffer location is the end of the
24+
* batch.
25+
*/
26+
// Return value, as implemented below:
//  - parser.n_structural_indexes when a document boundary is found and the
//    brackets counted after it are balanced;
//  - the index i of the structural that starts the trailing (unbalanced)
//    document otherwise;
//  - 0 when no boundary is found at all.
// NOTE(review): the loop starts at n_structural_indexes - 1; the callers visible
// alongside this function return EMPTY for a zero count before calling it —
// confirm any other caller does the same.
really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
27+
// TODO don't count separately, just figure out depth
28+
auto arr_cnt = 0;
29+
auto obj_cnt = 0;
30+
// Walk the structural indexes from last to first, looking at pairs (i - 1, i).
// Index 0 is only ever inspected as the left element of a pair.
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
31+
auto idxb = parser.structural_indexes[i];
32+
// Right element of the pair: ':' ',' '}' ']' cannot begin a document, so move on
// (closing brackets are counted first). Opening brackets are counted and, like
// any other character, fall through to the check of the left element.
switch (parser.buf[idxb]) {
33+
case ':':
34+
case ',':
35+
continue;
36+
case '}':
37+
obj_cnt--;
38+
continue;
39+
case ']':
40+
arr_cnt--;
41+
continue;
42+
case '{':
43+
obj_cnt++;
44+
break;
45+
case '[':
46+
arr_cnt++;
47+
break;
48+
}
49+
auto idxa = parser.structural_indexes[i - 1];
50+
// Left element of the pair: after '{' '[' ':' ',' the right element is still
// part of the same document, so this pair is not a boundary.
switch (parser.buf[idxa]) {
51+
case '{':
52+
case '[':
53+
case ':':
54+
case ',':
55+
continue;
56+
}
57+
// Last document is complete, so the next document will appear after!
58+
if (!arr_cnt && !obj_cnt) {
59+
return parser.n_structural_indexes;
60+
}
61+
// Last document is incomplete; mark the document at i + 1 as the next one
62+
return i;
63+
}
64+
// No boundary between two values was found in the scanned range.
return 0;
65+
}
66+
67+
// Skip the last character if it is partial
68+
// Returns len shortened so that buf[0..result) does not end in the middle of a
// multi-byte UTF-8 character; returns len unchanged when no truncated character
// is detected in the last three bytes.
// Only the lead byte is inspected: >= 0b11000000 starts a 2-, 3- or 4-byte
// character, >= 0b11100000 a 3- or 4-byte one, >= 0b11110000 a 4-byte one.
really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
69+
// Short buffers are handled separately so that buf[len-2] / buf[len-3] are never
// read before the start of the buffer.
if (unlikely(len < 3)) {
70+
switch (len) {
71+
case 2:
72+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
73+
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
74+
return len;
75+
case 1:
76+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
77+
return len;
78+
case 0:
79+
return len;
80+
}
81+
}
82+
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
83+
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
84+
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
85+
return len;
86+
}

src/generic/stage1/json_structural_indexer.h

Lines changed: 9 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ class json_structural_indexer {
7373
really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
7474
really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
7575
really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
76-
static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser);
77-
static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len);
7876

7977
json_scanner scanner{};
8078
utf8_checker checker{};
@@ -98,7 +96,7 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu
9896
// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
9997
// to finish: utf-8 checks and generating the output from the last iteration.
10098
//
101-
// The reason we run 2 inputs at a time, is steps 2 and 3 are//still* not enough to soak up all
99+
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
102100
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
103101
// workout.
104102
//
@@ -162,13 +160,6 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
162160
}
163161

164162
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
165-
// a valid JSON file cannot have zero structural indexes - we should have found something
166-
if (unlikely(parser.n_structural_indexes == 0u)) {
167-
return EMPTY;
168-
}
169-
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
170-
return UNEXPECTED_ERROR;
171-
}
172163
/***
173164
* This is related to https://github.com/simdjson/simdjson/issues/906
174165
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
@@ -186,102 +177,22 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
186177
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
187178
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
188179
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
180+
parser.next_structural_index = 0;
181+
// a valid JSON file cannot have zero structural indexes - we should have found something
182+
if (unlikely(parser.n_structural_indexes == 0u)) {
183+
return EMPTY;
184+
}
185+
if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
186+
return UNEXPECTED_ERROR;
187+
}
189188
if (partial) {
190189
auto new_structural_indexes = find_next_document_index(parser);
191190
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
192191
return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
193192
}
194193
parser.n_structural_indexes = new_structural_indexes;
195194
}
196-
parser.next_structural_index = 0;
197195
return checker.errors();
198196
}
199197

200-
/**
201-
* This algorithm is used to quickly identify the last structural position that
202-
* makes up a complete document.
203-
*
204-
* It does this by going backwards and finding the last *document boundary* (a
205-
* place where one value follows another without a comma between them). If the
206-
* last document (the characters after the boundary) has an equal number of
207-
* start and end brackets, it is considered complete.
208-
*
209-
* Simply put, we iterate over the structural characters, starting from
210-
* the end. We consider that we found the end of a JSON document when the
211-
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
212-
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
213-
*
214-
* This simple comparison works most of the time, but it does not cover cases
215-
* where the batch's structural indexes contain a perfect amount of documents.
216-
* In such a case, we do not have access to the structural index which follows
217-
* the last document, therefore, we do not have access to the second element in
218-
* the pair, and means that we cannot identify the last document. To fix this
219-
* issue, we keep a count of the open and closed curly/square braces we found
220-
* while searching for the pair. When we find a pair AND the count of open and
221-
* closed curly/square braces is the same, we know that we just passed a
222-
* complete
223-
* document, therefore the last json buffer location is the end of the batch
224-
*/
225-
// Return value, as implemented below:
//  - parser.n_structural_indexes when a document boundary is found and the
//    brackets counted after it are balanced;
//  - the index i of the structural that starts the trailing (unbalanced)
//    document otherwise;
//  - 0 when no boundary is found at all.
really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) {
226-
// TODO don't count separately, just figure out depth
227-
auto arr_cnt = 0;
228-
auto obj_cnt = 0;
229-
// Walk the structural indexes from last to first, looking at pairs (i - 1, i).
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
230-
auto idxb = parser.structural_indexes[i];
231-
// Right element of the pair: ':' ',' '}' ']' cannot begin a document, so move on
// (closing brackets are counted first). Opening brackets are counted and, like
// any other character, fall through to the check of the left element.
switch (parser.buf[idxb]) {
232-
case ':':
233-
case ',':
234-
continue;
235-
case '}':
236-
obj_cnt--;
237-
continue;
238-
case ']':
239-
arr_cnt--;
240-
continue;
241-
case '{':
242-
obj_cnt++;
243-
break;
244-
case '[':
245-
arr_cnt++;
246-
break;
247-
}
248-
auto idxa = parser.structural_indexes[i - 1];
249-
// Left element of the pair: after '{' '[' ':' ',' the right element is still
// part of the same document, so this pair is not a boundary.
switch (parser.buf[idxa]) {
250-
case '{':
251-
case '[':
252-
case ':':
253-
case ',':
254-
continue;
255-
}
256-
// Last document is complete, so the next document will appear after!
257-
if (!arr_cnt && !obj_cnt) {
258-
return parser.n_structural_indexes;
259-
}
260-
// Last document is incomplete; mark the document at i + 1 as the next one
261-
return i;
262-
}
263-
// No boundary between two values was found in the scanned range.
return 0;
264-
}
265-
266-
// Skip the last character if it is partial
267-
// Returns len shortened so that buf[0..result) does not end in the middle of a
// multi-byte UTF-8 character; returns len unchanged when no truncated character
// is detected in the last three bytes.
really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) {
268-
// Short buffers are handled separately so that buf[len-2] / buf[len-3] are never
// read before the start of the buffer.
if (unlikely(len < 3)) {
269-
switch (len) {
270-
case 2:
271-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
272-
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
273-
return len;
274-
case 1:
275-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
276-
return len;
277-
case 0:
278-
return len;
279-
}
280-
}
281-
if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
282-
if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
283-
if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
284-
return len;
285-
}
286-
287198
} // namespace stage1

src/haswell/dom_parser_implementation.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
7070
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
7171
}
7272

73+
#include "generic/stage1/find_next_document_index.h"
7374
#include "generic/stage1/utf8_lookup2_algorithm.h"
7475
#include "generic/stage1/json_structural_indexer.h"
7576
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

src/westmere/dom_parser_implementation.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
7171
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
7272
}
7373

74+
#include "generic/stage1/find_next_document_index.h"
7475
#include "generic/stage1/utf8_lookup2_algorithm.h"
7576
#include "generic/stage1/json_structural_indexer.h"
7677
WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

0 commit comments

Comments
 (0)