Skip to content

Commit bcabdfc

Browse files
ioioioiolemire
authored andcommitted
Json pointer (simdjson#220)
* json pointer support * Addition of tests for the json pointer * Adding a new tool for the JSON Pointer support, and some documentation.
1 parent cb44b3b commit bcabdfc

File tree

8 files changed

+381
-7
lines changed

8 files changed

+381
-7
lines changed

Makefile

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ endif # ifeq ($(DEBUG),1)
5757
endif # ifeq ($(SANITIZE),1)
5858
endif # ifeq ($(MEMSANITIZE),1)
5959

60-
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel
61-
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck
60+
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel jsonpointer
61+
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
6262
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
6363
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
6464

@@ -91,20 +91,22 @@ benchmark:
9191
bash ./scripts/parser.sh
9292
bash ./scripts/parseandstat.sh
9393

94-
test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json
94+
test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
9595
./basictests
9696
./numberparsingcheck
9797
./stringparsingcheck
9898
./jsoncheck
99+
./pointercheck
99100
./scripts/testjson2json.sh
100101
./scripts/issue150.sh
101102
@echo "It looks like the code is good!"
102103

103-
quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json
104+
quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
104105
./basictests
105106
./numberparsingcheck
106107
./stringparsingcheck
107108
./jsoncheck
109+
./pointercheck
108110
./scripts/testjson2json.sh
109111
./scripts/issue150.sh
110112

@@ -149,6 +151,8 @@ numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
149151
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
150152
$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
151153

154+
pointercheck:tests/pointercheck.cpp $(HEADERS) $(LIBFILES)
155+
$(CXX) $(CXXFLAGS) -o pointercheck tests/pointercheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp -I. $(LIBFLAGS)
152156

153157
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) submodules $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
154158
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
@@ -159,6 +163,9 @@ minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIB
159163
json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
160164
$(CXX) $(CXXFLAGS) -o json2json $ tools/json2json.cpp $(LIBFILES) -I.
161165

166+
jsonpointer: tools/jsonpointer.cpp $(HEADERS) $(LIBFILES)
167+
$(CXX) $(CXXFLAGS) -o jsonpointer $ tools/jsonpointer.cpp $(LIBFILES) -I.
168+
162169
jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
163170
$(CXX) $(CXXFLAGS) -o jsonstats $ tools/jsonstats.cpp $(LIBFILES) -I.
164171

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi
6767

6868
## Code usage and example
6969

70-
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to pre-allocated `ParsedJson` object (which can be reused multiple time). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').
70+
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths, for example. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').
7171

7272
```C
7373
#include "simdjson/jsonparser.h"
@@ -313,6 +313,7 @@ If you find the version of `simdjson` shipped with `vcpkg` is out-of-date, feel
313313
- `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output.
314314
- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`.
315315
- `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters.
316+
- `jsonpointer mydoc.json <jsonpath> <jsonpath> ... <jsonpath>` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document.
316317
317318
## Scope
318319
@@ -347,6 +348,20 @@ The parser works in two stages:
347348
- Stage 1. (Find marks) Identifies quickly structure elements, strings, and so forth. We validate UTF-8 encoding at that stage.
348349
- Stage 2. (Structure building) Involves constructing a "tree" of sort (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage.
349350
351+
## JSON Pointer
352+
353+
We can navigate the parsed JSON using JSON Pointers as per the [RFC6901 standard](https://tools.ietf.org/html/rfc6901).
354+
355+
You can build a tool (jsonpointer) to parse a JSON document and then issue an array of JSON Pointer queries:
356+
357+
```
358+
make jsonpointer
359+
./jsonpointer jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2
360+
./jsonpointer jsonexamples/twitter.json /statuses/0/id /statuses/1/id /statuses/2/id /statuses/3/id /statuses/4/id /statuses/5/id
361+
```
362+
363+
In C++, given a `ParsedJson`, we can create an iterator (`ParsedJson::iterator`) and move it to a node with its `move_to` method, passing a `std::string` representing the JSON Pointer query.
364+
350365
## Navigating the parsed document
351366
352367
Here is a code sample to dump back the parsed JSON to a string:

include/simdjson/parsedjson.h

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,47 @@ struct ParsedJson {
231231
// this is equivalent but much faster than calling "next()".
232232
inline void move_to_value();
233233

234+
// when at [, go one level deep, and advance to the given index.
235+
// if successful, we are left pointing at the value,
236+
// if not, we are still pointing at the array ([)
237+
inline bool move_to_index(uint32_t index);
238+
239+
// Moves the iterator to the value corresponding to the json pointer.
240+
// Always search from the root of the document.
241+
// if successful, we are left pointing at the value,
242+
// if not, we are still pointing at the same value as before the call.
243+
// The json pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
244+
// However, the standard says "If a referenced member name is not unique in an object,
245+
// the member that is referenced is undefined, and evaluation fails".
246+
// Here we just return the first corresponding value.
247+
// The length parameter is the length of the jsonpointer string ('pointer').
248+
bool move_to(const char * pointer, uint32_t length);
249+
250+
// Moves the iterator to the value corresponding to the json pointer.
251+
// Always search from the root of the document.
252+
// if successful, we are left pointing at the value,
253+
// if not, we are still pointing at the same value as before the call.
254+
// The json pointer implementation follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
255+
// However, the standard says "If a referenced member name is not unique in an object,
256+
// the member that is referenced is undefined, and evaluation fails".
257+
// Here we just return the first corresponding value.
258+
inline bool move_to(const std::string & pointer) {
259+
return move_to(pointer.c_str(), pointer.length());
260+
}
261+
262+
263+
264+
private:
265+
266+
// Almost the same as move_to(), except it searches from the current position.
267+
// The pointer's syntax is identical, though that case is not handled by the rfc6901 standard.
268+
// The '/' is still required at the beginning.
269+
// However, contrary to move_to(), the URI Fragment Identifier Representation is not supported here.
270+
// Also, in case of failure, we are left pointing at the closest value it could reach.
271+
// For these reasons it is private. It exists because it is used by move_to().
272+
bool relative_move_to(const char * pointer, uint32_t length);
273+
public:
274+
234275
// throughout return true if we can do the navigation, false
235276
// otherwise
236277

@@ -264,6 +305,10 @@ struct ParsedJson {
264305
// a scope is a series of nodes at the same level
265306
inline void to_start_scope();
266307

308+
inline void rewind() {
309+
while(up());
310+
}
311+
267312
// void to_end_scope(); // move us to
268313
// the start of our current scope; always succeeds
269314

@@ -419,8 +464,24 @@ bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) {
419464
return false;
420465
}
421466

467+
bool ParsedJson::iterator::move_to_index(uint32_t index) {
468+
assert(is_array());
469+
if (down()) {
470+
uint32_t i = 0;
471+
for (; i < index; i++) {
472+
if (!next()) {
473+
break;
474+
}
475+
}
476+
if (i == index) {
477+
return true;
478+
}
479+
assert(up());
480+
}
481+
return false;
482+
}
422483

423-
bool ParsedJson::iterator::prev() {
484+
bool ParsedJson::iterator::prev() {
424485
if(location - 1 < depthindex[depth].start_of_scope) {
425486
return false;
426487
}

jsonexamples/twitter.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15479,4 +15479,4 @@
1547915479
"since_id": 0,
1548015480
"since_id_str": "0"
1548115481
}
15482-
}
15482+
}

src/parsedjsoniterator.cpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,172 @@ bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
9393
}
9494
return true;
9595
}
96+
97+
bool ParsedJson::iterator::move_to(const char * pointer, uint32_t length) {
98+
char* new_pointer = nullptr;
99+
if (pointer[0] == '#') {
100+
// Converting fragment representation to string representation
101+
new_pointer = new char[length];
102+
uint32_t new_length = 0;
103+
for (uint32_t i = 1; i < length; i++) {
104+
if (pointer[i] == '%' && pointer[i+1] == 'x') {
105+
try {
106+
int fragment = std::stoi(std::string(&pointer[i+2], 2), nullptr, 16);
107+
if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
108+
// escaping the character
109+
new_pointer[new_length] = '\\';
110+
new_length++;
111+
}
112+
new_pointer[new_length] = fragment;
113+
i += 3;
114+
}
115+
catch(std::invalid_argument& e) {
116+
delete[] new_pointer;
117+
return false; // the fragment is invalid
118+
}
119+
}
120+
else {
121+
new_pointer[new_length] = pointer[i];
122+
}
123+
new_length++;
124+
}
125+
length = new_length;
126+
pointer = new_pointer;
127+
}
128+
129+
// saving the current state
130+
size_t depth_s = depth;
131+
size_t location_s = location;
132+
uint8_t current_type_s = current_type;
133+
uint64_t current_val_s = current_val;
134+
scopeindex_t *depthindex_s = depthindex;
135+
136+
rewind(); // The json pointer is used from the root of the document.
137+
138+
bool found = relative_move_to(pointer, length);
139+
delete[] new_pointer;
140+
141+
if (!found) {
142+
// since the pointer has found nothing, we get back to the original position.
143+
depth = depth_s;
144+
location = location_s;
145+
current_type = current_type_s;
146+
current_val = current_val_s;
147+
depthindex = depthindex_s;
148+
}
149+
150+
return found;
151+
}
152+
153+
bool ParsedJson::iterator::relative_move_to(const char * pointer, uint32_t length) {
154+
if (length == 0) {
155+
// returns the whole document
156+
return true;
157+
}
158+
159+
if (pointer[0] != '/') {
160+
// '/' must be the first character
161+
return false;
162+
}
163+
164+
// finding the key in an object or the index in an array
165+
std::string key_or_index;
166+
uint32_t offset = 1;
167+
168+
// checking for the "-" case
169+
if (is_array() && pointer[1] == '-') {
170+
if (length != 2) {
171+
// the pointer must be exactly "/-"
172+
// there can't be anything more after '-' as an index
173+
return false;
174+
}
175+
key_or_index = '-';
176+
offset = length; // will skip the loop coming right after
177+
}
178+
179+
// We either transform the first reference token to a valid json key
180+
// or we make sure it is a valid index in an array.
181+
for (; offset < length ; offset++) {
182+
if (pointer[offset] == '/') {
183+
// beginning of the next key or index
184+
break;
185+
}
186+
if (is_array() && (pointer[offset] < '0' || pointer[offset] > '9')) {
187+
// the index of an array must be an integer
188+
// we also make sure std::stoi won't discard whitespaces later
189+
return false;
190+
}
191+
if (pointer[offset] == '~') {
192+
// "~1" represents "/"
193+
if (pointer[offset+1] == '1') {
194+
key_or_index += '/';
195+
offset++;
196+
continue;
197+
}
198+
// "~0" represents "~"
199+
if (pointer[offset+1] == '0') {
200+
key_or_index += '~';
201+
offset++;
202+
continue;
203+
}
204+
}
205+
if (pointer[offset] == '\\') {
206+
if (pointer[offset+1] == '\\' || pointer[offset+1] == '"' || (pointer[offset+1] <= 0x1F)) {
207+
key_or_index += pointer[offset+1];
208+
offset++;
209+
continue;
210+
}
211+
return false; // invalid escaped character
212+
}
213+
if (pointer[offset] == '\"') {
214+
// unescaped quote character. this is an invalid case.
215+
// lets do nothing and assume most pointers will be valid.
216+
// it won't find any corresponding json key anyway.
217+
// return false;
218+
}
219+
key_or_index += pointer[offset];
220+
}
221+
222+
bool found = false;
223+
if (is_object()) {
224+
if (move_to_key(key_or_index.c_str(), key_or_index.length())) {
225+
found = relative_move_to(pointer+offset, length-offset);
226+
}
227+
}
228+
else if(is_array()) {
229+
if (key_or_index == "-") { // handling "-" case first
230+
if (down()) {
231+
while(next()); // moving to the end of the array
232+
// moving to the nonexistent value right after...
233+
size_t npos;
234+
if ((current_type == '[') || (current_type == '{')) {
235+
// we need to jump
236+
npos = ( current_val & JSONVALUEMASK);
237+
} else {
238+
npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
239+
}
240+
location = npos;
241+
current_val = pj.tape[npos];
242+
current_type = (current_val >> 56);
243+
return true; // how could it fail ?
244+
}
245+
} else { // regular numeric index
246+
// The index can't have a leading '0'
247+
if (key_or_index[0] == '0' && key_or_index.length() > 1) {
248+
return false;
249+
}
250+
// it cannot be empty
251+
if (key_or_index.length() == 0) {
252+
return false;
253+
}
254+
// we already checked the index contains only valid digits
255+
uint32_t index = std::stoi(key_or_index);
256+
if (move_to_index(index)) {
257+
found = relative_move_to(pointer+offset, length-offset);
258+
}
259+
}
260+
}
261+
262+
return found;
263+
}
96264
}

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ endif()
66

77
add_cpp_test(basictests)
88
add_cpp_test(jsoncheck)
9+
add_cpp_test(pointercheck)
910

1011
## This causes problems
1112
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)

0 commit comments

Comments
 (0)