Fix fallback implementation

jkeiser · jkeiser · commit 0dbda65e4427 · 2020-06-08T14:52:23.000-07:00
diff --git a/src/arm64/dom_parser_implementation.cpp b/src/arm64/dom_parser_implementation.cpp
@@ -82,6 +82,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
+#include "generic/stage1/find_next_document_index.h"
 #include "generic/stage1/utf8_lookup2_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
diff --git a/src/fallback/dom_parser_implementation.cpp b/src/fallback/dom_parser_implementation.cpp
@@ -9,15 +9,17 @@ namespace simdjson {
 namespace fallback {
 namespace stage1 {
 
+#include "generic/stage1/find_next_document_index.h"
+
 class structural_scanner {
 public:
 
-really_inline structural_scanner(dom_parser_implementation &_parser, bool _streaming)
+really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
   : buf{_parser.buf},
     next_structural_index{_parser.structural_indexes.get()},
     parser{_parser},
     len{static_cast<uint32_t>(_parser.len)},
-    streaming{_streaming} {
+    partial{_partial} {
 }
 
 really_inline void add_structural() {
@@ -41,7 +43,12 @@ really_inline void validate_utf8_character() {
   // 2-byte
   if ((buf[idx] & 0b00100000) == 0) {
     // missing continuation
-    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
+    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
+      if (idx+1 > len && partial) { idx = len; return; }
+      error = UTF8_ERROR;
+      idx++;
+      return;
+    }
     // overlong: 1100000_ 10______
     if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
     idx += 2;
@@ -51,7 +58,12 @@ really_inline void validate_utf8_character() {
   // 3-byte
   if ((buf[idx] & 0b00010000) == 0) {
     // missing continuation
-    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
+    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
+      if (idx+2 > len && partial) { idx = len; return; }
+      error = UTF8_ERROR;
+      idx++;
+      return;
+    }
     // overlong: 11100000 100_____ ________
     if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
     // surrogates: U+D800-U+DFFF 11101101 101_____
@@ -62,7 +74,12 @@ really_inline void validate_utf8_character() {
 
   // 4-byte
   // missing continuation
-  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
+  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
+    if (idx+2 > len && partial) { idx = len; return; }
+    error = UTF8_ERROR;
+    idx++;
+    return;
+  }
   // overlong: 11110000 1000____ ________ ________
   if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
   // too large: > U+10FFFF:
@@ -87,7 +104,7 @@ really_inline void validate_string() {
       idx++;
     }
   }
-  if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
+  if (idx >= len && !partial) { error = UNCLOSED_STRING; }
 }
 
 really_inline bool is_whitespace_or_operator(uint8_t c) {
@@ -128,16 +145,26 @@ really_inline error_code scan() {
         break;
     }
   }
-  if (unlikely(next_structural_index == parser.structural_indexes.get())) {
-    return EMPTY;
-  }
   *next_structural_index = len;
-  next_structural_index++;
   // We pad beyond.
   // https://github.com/simdjson/simdjson/issues/906
-  next_structural_index[0] = len;
-  next_structural_index[1] = 0;
+  next_structural_index[1] = len;
+  next_structural_index[2] = 0;
   parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
+  parser.next_structural_index = 0;
+
+  if (unlikely(parser.n_structural_indexes == 0)) {
+    return EMPTY;
+  }
+
+  if (partial) {
+    auto new_structural_indexes = find_next_document_index(parser);
+    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
+      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
+    }
+    parser.n_structural_indexes = new_structural_indexes;
+  }
+
   return error;
 }
 
@@ -148,16 +175,16 @@ really_inline error_code scan() {
   uint32_t len;
   uint32_t idx{0};
   error_code error{SUCCESS};
-  bool streaming;
+  bool partial;
 }; // structural_scanner
 
 } // namespace stage1
 
 
-WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
+WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
   this->buf = _buf;
   this->len = _len;
-  stage1::structural_scanner scanner(*this, streaming);
+  stage1::structural_scanner scanner(*this, partial);
   return scanner.scan();
 }
 
diff --git a/src/generic/stage1/find_next_document_index.h b/src/generic/stage1/find_next_document_index.h
@@ -0,0 +1,86 @@
+/**
+  * This algorithm is used to quickly identify the last structural position that
+  * makes up a complete document.
+  *
+  * It does this by going backwards and finding the last *document boundary* (a
+  * place where one value follows another without a comma between them). If the
+  * last document (the characters after the boundary) has an equal number of
+  * start and end brackets, it is considered complete.
+  *
+  * Simply put, we iterate over the structural characters, starting from
+  * the end. We consider that we found the end of a JSON document when the
+  * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+  * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+  *
+  * This simple comparison works most of the time, but it does not cover cases
+  * where the batch's structural indexes end exactly at the end of a document.
+  * In such a case, we do not have access to the structural index which follows
+  * the last document, therefore, we do not have access to the second element in
+  * the pair, and that means we cannot identify the last document. To fix this
+  * issue, we keep a count of the open and closed curly/square braces we found
+  * while searching for the pair. When we find a pair AND the count of open and
+  * closed curly/square braces is the same, we know that we just passed a
+  * complete document, therefore the last json buffer location is the end of the
+  * batch.
+  */
+really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
+  // TODO don't count separately, just figure out depth
+  auto arr_cnt = 0; // '[' seen minus ']' seen during the backwards scan
+  auto obj_cnt = 0; // '{' seen minus '}' seen during the backwards scan
+  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { // backwards over pairs (i - 1, i); NOTE(review): relies on n_structural_indexes > 0
+    auto idxb = parser.structural_indexes[i]; // buffer offset of the second element of the pair
+    switch (parser.buf[idxb]) {
+    case ':':
+    case ',':
+      continue;
+    case '}':
+      obj_cnt--;
+      continue;
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break;
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    auto idxa = parser.structural_indexes[i - 1]; // buffer offset of the first element of the pair
+    switch (parser.buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue;
+    }
+    // Boundary found. Brackets balance, so the last document is complete: keep every structural index.
+    if (!arr_cnt && !obj_cnt) {
+      return parser.n_structural_indexes;
+    }
+    // Last document is incomplete; it begins at structural index i, so return i (the number of indexes before it)
+    return i;
+  }
+  return 0; // the loop ended without finding a document boundary
+}
+
+// Returns len shortened so that a trailing, incomplete multi-byte UTF-8 character is skipped (len itself if none is detected)
+really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
+  if (unlikely(len < 3)) { // short buffer: only inspect the bytes that exist; every case below returns
+    switch (len) {
+      case 2:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+        return len;
+      case 1:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        return len;
+      case 0:
+        return len;
+    }
+  }
+  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
+  return len;
+}
diff --git a/src/generic/stage1/json_structural_indexer.h b/src/generic/stage1/json_structural_indexer.h
@@ -73,8 +73,6 @@ class json_structural_indexer {
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
   really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
-  static really_inline uint32_t find_next_document_index(dom_parser_implementation &parser);
-  static really_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -98,7 +96,7 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu
 // 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
 //    to finish: utf-8 checks and generating the output from the last iteration.
 //
-// The reason we run 2 inputs at a time, is steps 2 and 3 are//still* not enough to soak up all
+// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
 // workout.
 //
@@ -162,13 +160,6 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
   }
 
   parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  // a valid JSON file cannot have zero structural indexes - we should have found something
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
   /***
    * This is related to https://github.com/simdjson/simdjson/issues/906
    * Basically, we want to make sure that if the parsing continues beyond the last (valid)
@@ -186,102 +177,22 @@ really_inline error_code json_structural_indexer::finish(dom_parser_implementati
   parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
   parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
   parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
+  parser.next_structural_index = 0;
+  // a valid JSON file cannot have zero structural indexes - we should have found something
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
   if (partial) {
     auto new_structural_indexes = find_next_document_index(parser);
     if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
       return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
     }
     parser.n_structural_indexes = new_structural_indexes;
   }
-  parser.next_structural_index = 0;
   return checker.errors();
 }
 
-/**
-  * This algorithm is used to quickly identify the last structural position that
-  * makes up a complete document.
-  *
-  * It does this by going backwards and finding the last *document boundary* (a
-  * place where one value follows another without a comma between them). If the
-  * last document (the characters after the boundary) has an equal number of
-  * start and end brackets, it is considered complete.
-  *
-  * Simply put, we iterate over the structural characters, starting from
-  * the end. We consider that we found the end of a JSON document when the
-  * first element of the pair is NOT one of these characters: '{' '[' ';' ','
-  * and when the second element is NOT one of these characters: '}' '}' ';' ','.
-  *
-  * This simple comparison works most of the time, but it does not cover cases
-  * where the batch's structural indexes contain a perfect amount of documents.
-  * In such a case, we do not have access to the structural index which follows
-  * the last document, therefore, we do not have access to the second element in
-  * the pair, and means that we cannot identify the last document. To fix this
-  * issue, we keep a count of the open and closed curly/square braces we found
-  * while searching for the pair. When we find a pair AND the count of open and
-  * closed curly/square braces is the same, we know that we just passed a
-  * complete
-  * document, therefore the last json buffer location is the end of the batch
-  */
-really_inline uint32_t json_structural_indexer::find_next_document_index(dom_parser_implementation &parser) {
-  // TODO don't count separately, just figure out depth
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (parser.buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (parser.buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    // Last document is complete, so the next document will appear after!
-    if (!arr_cnt && !obj_cnt) {
-      return parser.n_structural_indexes;
-    }
-    // Last document is incomplete; mark the document at i + 1 as the next one
-    return i;
-  }
-  return 0;
-}
-
-// Skip the last character if it is partial
-really_inline size_t json_structural_indexer::trim_partial_utf8(const uint8_t *buf, size_t len) {
-  if (unlikely(len < 3)) {
-    switch (len) {
-      case 2:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
-        return len;
-      case 1:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        return len;
-      case 0:
-        return len;
-    }
-  }
-  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
-  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
-  return len;
-}
-
 } // namespace stage1
diff --git a/src/haswell/dom_parser_implementation.cpp b/src/haswell/dom_parser_implementation.cpp
@@ -70,6 +70,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
 }
 
+#include "generic/stage1/find_next_document_index.h"
 #include "generic/stage1/utf8_lookup2_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
diff --git a/src/westmere/dom_parser_implementation.cpp b/src/westmere/dom_parser_implementation.cpp
@@ -71,6 +71,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
+#include "generic/stage1/find_next_document_index.h"
 #include "generic/stage1/utf8_lookup2_algorithm.h"
 #include "generic/stage1/json_structural_indexer.h"
 WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {

Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui`
`82`	`82`	`return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);`
`83`	`83`	`}`
`84`	`84`
	`85`	`+#include "generic/stage1/find_next_document_index.h"`
`85`	`86`	`#include "generic/stage1/utf8_lookup2_algorithm.h"`
`86`	`87`	`#include "generic/stage1/json_structural_indexer.h"`
`87`	`88`	`WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {`
Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui`
`70`	`70`	`return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);`
`71`	`71`	`}`
`72`	`72`
	`73`	`+#include "generic/stage1/find_next_document_index.h"`
`73`	`74`	`#include "generic/stage1/utf8_lookup2_algorithm.h"`
`74`	`75`	`#include "generic/stage1/json_structural_indexer.h"`
`75`	`76`	`WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {`
Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui`
`71`	`71`	`return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);`
`72`	`72`	`}`
`73`	`73`
	`74`	`+#include "generic/stage1/find_next_document_index.h"`
`74`	`75`	`#include "generic/stage1/utf8_lookup2_algorithm.h"`
`75`	`76`	`#include "generic/stage1/json_structural_indexer.h"`
`76`	`77`	`WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {`