Skip to content

Commit 779ce18

Browse files
committed
Getting ready to document the tape format.
1 parent c47e734 commit 779ce18

6 files changed

Lines changed: 34 additions & 13 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ To simplify the engineering, we make some assumptions.
100100
## Features
101101

102102
- The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
103-
- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808). Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson stores integers larger than 2147483648 as floating-point numbers.)
103+
- We parse integers and floating-point numbers as separate types, which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent an integer exactly as a signed 64-bit value, we reject the JSON document.
104104
- We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation.)
105105
- We fully validate the numbers. (Parsers like gason and ultrajson will accept `[0e+]` as valid JSON.)
106106
- We validate string content for unescaped characters. (Parsers like fastjson and ultrajson accept unescaped line breaks and tabs in strings.)

include/simdjson/numberparsing.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ static really_inline bool parse_number(const u8 *const buf,
483483
#endif
484484
}
485485
} else {
486-
if (unlikely(digitcount >= 19)) { // this is uncommon!!!
486+
if (unlikely(digitcount >= 18)) { // this is uncommon!!!
487487
return parse_large_integer(buf, pj, offset,
488488
found_minus);
489489
}

include/simdjson/parsedjson.h

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,19 +217,23 @@ struct ParsedJson {
217217
bool dump_raw_tape(std::ostream &os) {
218218
if(!isvalid) return false;
219219
size_t tapeidx = 0;
220-
u64 tape_val = tape[tapeidx++];
220+
u64 tape_val = tape[tapeidx];
221221
u8 type = (tape_val >> 56);
222+
os << tapeidx << " : " << type;
223+
tapeidx++;
222224
size_t howmany = 0;
223225
if (type == 'r') {
224226
howmany = tape_val & JSONVALUEMASK;
225227
} else {
226228
printf("Error: no starting root node?");
227229
return false;
228230
}
231+
os << "\t// pointing to " << howmany <<" (right after last node)\n";
232+
u64 payload;
229233
for (; tapeidx < howmany; tapeidx++) {
230234
os << tapeidx << " : ";
231235
tape_val = tape[tapeidx];
232-
u64 payload = tape_val & JSONVALUEMASK;
236+
payload = tape_val & JSONVALUEMASK;
233237
type = (tape_val >> 56);
234238
switch (type) {
235239
case '"': // we have a string
@@ -261,16 +265,16 @@ struct ParsedJson {
261265
os << "false\n";
262266
break;
263267
case '{': // we have an object
264-
os << "{\t// pointing to next tape location " << payload << "\n";
268+
os << "{\t// pointing to next tape location " << payload << " (first node after the scope) \n";
265269
break;
266270
case '}': // we end an object
267-
os << "}\t// pointing to previous tape location " << payload << "\n";
271+
os << "}\t// pointing to previous tape location " << payload << " (start of the scope) \n";
268272
break;
269273
case '[': // we start an array
270-
os << "[\t// pointing to next tape location " << payload << "\n";
274+
os << "[\t// pointing to next tape location " << payload << " (first node after the scope) \n";
271275
break;
272276
case ']': // we end an array
273-
os << "]\t// pointing to previous tape location " << payload << "\n";
277+
os << "]\t// pointing to previous tape location " << payload << " (start of the scope) \n";
274278
break;
275279
case 'r': // we start and end with the root node
276280
printf("end of root\n");
@@ -279,6 +283,10 @@ struct ParsedJson {
279283
return false;
280284
}
281285
}
286+
tape_val = tape[tapeidx];
287+
payload = tape_val & JSONVALUEMASK;
288+
type = (tape_val >> 56);
289+
os << tapeidx << " : "<< type <<"\t// pointing to " << payload <<" (start root)\n";
282290
return true;
283291
}
284292

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
9223372036854775808

jsonchecker/pass11.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
4611686018427387904

src/stage34_unified.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,25 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
103103
if (depth > pj.depthcapacity) {
104104
goto fail;
105105
}
106+
106107
UPDATE_CHAR();
107108
switch (c) {
108109
case '{':
110+
pj.containing_scope_offset[depth] = pj.get_current_loc();
111+
pj.ret_address[depth] = &&start_continue;
112+
depth++;
113+
if (depth > pj.depthcapacity) {
114+
goto fail;
115+
}
109116
pj.write_tape(0, c); // strangely, moving this to object_begin slows things down
110117
goto object_begin;
111118
case '[':
119+
pj.containing_scope_offset[depth] = pj.get_current_loc();
120+
pj.ret_address[depth] = &&start_continue;
121+
depth++;
122+
if (depth > pj.depthcapacity) {
123+
goto fail;
124+
}
112125
pj.write_tape(0, c);
113126
goto array_begin;
114127
#define SIMDJSON_ALLOWANYTHINGINROOT
@@ -216,11 +229,6 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
216229
default:
217230
goto fail;
218231
}
219-
#ifdef SIMDJSON_ALLOWANYTHINGINROOT
220-
depth--; // for fall-through cases (e.g., documents containing just a string)
221-
pj.annotate_previousloc(pj.containing_scope_offset[depth],
222-
pj.get_current_loc());
223-
#endif // ALLOWANYTHINGINROOT
224232
start_continue:
225233
DEBUG_PRINTF("in start_object_close\n");
226234
// the string might not be NULL terminated.
@@ -465,6 +473,7 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
465473

466474
succeed:
467475
DEBUG_PRINTF("in succeed, depth = %d \n", depth);
476+
depth --;
468477
if(depth != 0) {
469478
printf("internal bug\n");
470479
abort();
@@ -473,6 +482,8 @@ bool unified_machine(const u8 *buf, size_t len, ParsedJson &pj) {
473482
printf("internal bug\n");
474483
abort();
475484
}
485+
pj.annotate_previousloc(pj.containing_scope_offset[depth],
486+
pj.get_current_loc());
476487
pj.write_tape(pj.containing_scope_offset[depth], 'r'); // r is root
477488

478489

0 commit comments

Comments
 (0)