@@ -243,6 +243,93 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
243243 // for fear of aliasing
244244 return SUCCESS;
245245}
246+ WARN_UNUSED bool implementation::validate_utf8 (const char *buf, size_t len) const noexcept {
247+ const uint8_t *data = (const uint8_t *)buf;
248+ uint64_t pos = 0 ;
249+ uint64_t next_pos = 0 ;
250+ uint32_t code_point = 0 ;
251+ while (pos < len) {
252+
253+ // check of the next 8 bytes are ascii.
254+ next_pos = pos + 16 ;
255+ if (next_pos <=
256+ len) { // if it is safe to read 8 more bytes, check that they are ascii
257+ uint64_t v1;
258+ memcpy (&v1, data + pos, sizeof (uint64_t ));
259+ uint64_t v2;
260+ memcpy (&v2, data + pos + sizeof (uint64_t ), sizeof (uint64_t ));
261+ uint64_t v{v1 | v2};
262+ if ((v & 0x8080808080808080 ) == 0 ) {
263+ pos = next_pos;
264+ continue ;
265+ }
266+ }
267+ unsigned char byte = data[pos];
268+
269+ if (byte < 0b10000000 ) {
270+ pos++;
271+ continue ;
272+ } else if ((byte & 0b11100000 ) == 0b11000000 ) {
273+ next_pos = pos + 2 ;
274+ if (next_pos > len) {
275+ return false ;
276+ }
277+ if ((data[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
278+ return false ;
279+ }
280+ // range check
281+ code_point = (byte & 0b00011111 ) << 6 | (data[pos + 1 ] & 0b00111111 );
282+ if (code_point < 0x80 || 0x7ff < code_point) {
283+ return false ;
284+ }
285+ } else if ((byte & 0b11110000 ) == 0b11100000 ) {
286+ next_pos = pos + 3 ;
287+ if (next_pos > len) {
288+ return false ;
289+ }
290+ if ((data[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
291+ return false ;
292+ }
293+ if ((data[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
294+ return false ;
295+ }
296+ // range check
297+ code_point = (byte & 0b00001111 ) << 12 |
298+ (data[pos + 1 ] & 0b00111111 ) << 6 |
299+ (data[pos + 2 ] & 0b00111111 );
300+ if (code_point < 0x800 || 0xffff < code_point ||
301+ (0xd7ff < code_point && code_point < 0xe000 )) {
302+ return false ;
303+ }
304+ } else if ((byte & 0b11111000 ) == 0b11110000 ) { // 0b11110000
305+ next_pos = pos + 4 ;
306+ if (next_pos > len) {
307+ return false ;
308+ }
309+ if ((data[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
310+ return false ;
311+ }
312+ if ((data[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
313+ return false ;
314+ }
315+ if ((data[pos + 3 ] & 0b11000000 ) != 0b10000000 ) {
316+ return false ;
317+ }
318+ // range check
319+ code_point =
320+ (byte & 0b00000111 ) << 18 | (data[pos + 1 ] & 0b00111111 ) << 12 |
321+ (data[pos + 2 ] & 0b00111111 ) << 6 | (data[pos + 3 ] & 0b00111111 );
322+ if (code_point < 0xffff || 0x10ffff < code_point) {
323+ return false ;
324+ }
325+ } else {
326+ // we may have a continuation
327+ return false ;
328+ }
329+ pos = next_pos;
330+ }
331+ return true ;
332+ }
246333
247334} // namespace fallback
248335} // namespace simdjson
0 commit comments