[unicode] Add tests for UTF-8 decoders + minor cleanups.

marjakh · Commit Bot · commit fcb89f551563 · 2017-09-29T13:18:52.000Z
Verify that both UTF-8 decoders (the incremental and the non-incremental one) match the expectations. Also clean up / harden the UTF-8 handling code, as suggested in https://chromium-review.googlesource.com/c/v8/v8/+/671020/ . BUG=chromium:765608 Change-Id: I6344d62ca15b75ac8e333421c94c4aa35ab8190d Reviewed-on: https://chromium-review.googlesource.com/681217 Commit-Queue: Marja Hölttä <marja@chromium.org> Reviewed-by: Camillo Bruni <cbruni@chromium.org> Cr-Commit-Position: refs/heads/master@{#48229}
diff --git a/src/unicode.cc b/src/unicode.cc
@@ -197,27 +197,27 @@ static inline uint8_t NonASCIISequenceLength(byte first) {
   // clang-format off
   static const uint8_t lengths[256] = {
       // The first 128 entries correspond to ASCII characters.
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 00 - 0f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 10 - 1f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 20 - 2f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 30 - 3f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 40 - 4f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 50 - 5f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 60 - 6f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 70 - 7f */
       // The following 64 entries correspond to continuation bytes.
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80 - 8f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90 - 9f */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* a0 - af */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* b0 - bf */
       // The next are two invalid overlong encodings and 30 two-byte sequences.
-      0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* c0-c1 + c2-cf */
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* d0-df */
       // 16 three-byte sequences.
-      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* e0-ef */
       // 5 four-byte sequences, followed by sequences that could only encode
       // code points outside of the unicode range.
-      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */
   // clang-format on
   return lengths[first];
 }
@@ -322,7 +322,8 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
   if (*buffer == 0) {
     // We're at the start of a new character.
     uint32_t kind = NonASCIISequenceLength(next);
-    if (kind >= 2 && kind <= 4) {
+    CHECK_LE(kind, 4);
+    if (kind >= 2) {
       // Start of 2..4 byte character, and no buffer.
 
       // The mask for the lower bits depends on the kind, and is
@@ -333,7 +334,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
       // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
      // in 2nd nibble, and the value in the bottom three bytes. The 2nd nibble is
       // intended as a counter about how many bytes are still needed.
-      *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
+      uint32_t character_info = kind << 28 | (kind - 1) << 24;
+      DCHECK_EQ(character_info & mask, 0);
+      *buffer = character_info | (next & mask);
       return kIncomplete;
     } else {
       // No buffer, and not the start of a 1-byte char (handled at the
diff --git a/src/unicode.h b/src/unicode.h
@@ -127,8 +127,7 @@ class Utf16 {
   }
 };
 
-
-class Utf8 {
+class V8_EXPORT_PRIVATE Utf8 {
  public:
   static inline uchar Length(uchar chr, int previous);
   static inline unsigned EncodeOneByte(char* out, uint8_t c);
diff --git a/test/unittests/unicode-unittest.cc b/test/unittests/unicode-unittest.cc

Original file line number	Diff line number	Diff line change
`@@ -127,8 +127,7 @@ class Utf16 {`
`127`	`127`	`}`
`128`	`128`	`};`
`129`	`129`
`130`		`-`
`131`		`-class Utf8 {`
	`130`	`+class V8_EXPORT_PRIVATE Utf8 {`
`132`	`131`	`public:`
`133`	`132`	`static inline uchar Length(uchar chr, int previous);`
`134`	`133`	`static inline unsigned EncodeOneByte(char* out, uint8_t c);`