Skip to content

Commit f03a6ab

Browse files
committed
Tweaking.
1 parent 5dc07ed commit f03a6ab

File tree

7 files changed

+97
-7
lines changed

7 files changed

+97
-7
lines changed

include/simdjson/implementation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ class implementation {
9494
* @param len the length of the string in bytes.
9595
* @return true if and only if the string is valid UTF-8.
9696
*/
97-
WARN_UNUSED virtual bool utf8_validate(const char *buf, size_t len) const noexcept = 0;
97+
WARN_UNUSED virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
9898

9999
protected:
100100
/** @private Construct an implementation with the given name and description. For subclasses. */

src/fallback/dom_parser_implementation.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,93 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
243243
// for fear of aliasing
244244
return SUCCESS;
245245
}
246+
/**
 * Scalar (fallback) UTF-8 validator.
 *
 * Walks the buffer one code point at a time. For each lead byte it checks
 * that the expected number of continuation bytes (10xxxxxx) is present and
 * that the decoded code point is not an overlong encoding, not a UTF-16
 * surrogate (U+D800..U+DFFF) and not above U+10FFFF. Runs of pure ASCII are
 * skipped 16 bytes at a time.
 *
 * @param buf the bytes to validate.
 * @param len the length of buf in bytes.
 * @return true if and only if the bytes form valid UTF-8 (an empty buffer is valid).
 */
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  size_t pos = 0;
  while (pos < len) {
    // Comparing against the remaining length (instead of computing pos + k)
    // keeps every bounds check free of integer wrap-around.
    const size_t remaining = len - pos;

    // Fast path: when at least 16 bytes remain, load them as two 64-bit words
    // (memcpy avoids alignment/aliasing issues) and test all 16 high bits at once.
    if (remaining >= 16) {
      uint64_t v1;
      memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      const uint64_t v{v1 | v2};
      if ((v & 0x8080808080808080) == 0) {
        pos += 16; // all sixteen bytes are ASCII
        continue;
      }
    }

    const uint8_t byte = data[pos];
    if (byte < 0b10000000) {
      // Single-byte (ASCII) code point.
      pos++;
      continue;
    }

    uint32_t code_point;
    if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx
      if (remaining < 2) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      // Reject overlong encodings. No upper-bound test is needed: eleven
      // payload bits cannot exceed 0x7ff.
      if (code_point < 0x80) {
        return false;
      }
      pos += 2;
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      if (remaining < 3) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      // Reject overlong encodings and UTF-16 surrogates. Sixteen payload bits
      // cannot exceed 0xffff, so no upper-bound test is needed.
      if (code_point < 0x800 ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
      pos += 3;
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (remaining < 4) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      // A four-byte sequence must encode U+10000..U+10FFFF. The lower bound is
      // inclusive of 0xffff: F0 8F BF BF decodes to U+FFFF, which is an
      // overlong encoding and therefore invalid.
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
      pos += 4;
    } else {
      // Stray continuation byte (10xxxxxx) or an invalid lead byte (11111xxx).
      return false;
    }
  }
  return true;
}
246333

247334
} // namespace fallback
248335
} // namespace simdjson

src/fallback/implementation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class implementation final : public simdjson::implementation {
2222
std::unique_ptr<internal::dom_parser_implementation>& dst
2323
) const noexcept final;
2424
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
25-
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
25+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
2626
};
2727

2828
} // namespace fallback

src/haswell/dom_parser_implementation.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, si
9494
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
9595
}
9696
#include "generic/stage1/utf8_validator.h"
97+
// Haswell entry point for UTF-8 validation: forwards buf and len unchanged to
// simdjson::haswell::stage1::utf8_validate and returns its result.
// NOTE(review): utf8_validate is presumably provided by the
// generic/stage1/utf8_validator.h include just above - confirm.
WARN_UNUSED bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
98+
return simdjson::haswell::stage1::utf8_validate(buf,len);
99+
}
97100
} // namespace haswell
98101
} // namespace simdjson
99102
UNTARGET_REGION

src/haswell/implementation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class implementation final : public simdjson::implementation {
2020
std::unique_ptr<internal::dom_parser_implementation>& dst
2121
) const noexcept final;
2222
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
23-
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
23+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
2424
};
2525

2626
} // namespace haswell

src/implementation.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ class detect_best_supported_implementation_on_first_use final : public implement
4848
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
4949
return set_best()->minify(buf, len, dst, dst_len);
5050
}
51-
WARN_UNUSED bool utf8_validate(const char * buf, size_t len) const noexcept final override {
52-
return set_best()->utf8_validate(buf, len);
51+
// Forwards the request to the implementation returned by set_best(), passing
// buf and len through unchanged and returning that implementation's verdict.
WARN_UNUSED bool validate_utf8(const char * buf, size_t len) const noexcept final override {
52+
return set_best()->validate_utf8(buf, len);
5353
}
5454
really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
5555
private:
@@ -88,7 +88,7 @@ class unsupported_implementation final : public implementation {
8888
WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
8989
return UNSUPPORTED_ARCHITECTURE;
9090
}
91-
WARN_UNUSED bool utf8_validate(const char *, size_t) const noexcept final override {
91+
// Placeholder for the unsupported-architecture implementation: both arguments
// are ignored and every input is reported as not valid.
WARN_UNUSED bool validate_utf8(const char *, size_t) const noexcept final override {
9292
return false; // no validation is performed here; always refuse
9393
}
9494
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}

src/westmere/implementation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class implementation final : public simdjson::implementation {
1919
std::unique_ptr<internal::dom_parser_implementation>& dst
2020
) const noexcept final;
2121
WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
22-
WARN_UNUSED bool utf8_validate(const char *buf, size_t len) const noexcept final;
22+
WARN_UNUSED bool validate_utf8(const char *buf, size_t len) const noexcept final;
2323
};
2424

2525
} // namespace westmere

0 commit comments

Comments
 (0)