Skip to content

Commit fcb89f5

Browse files
marjakhCommit Bot
authored andcommitted
[unicode] Add tests for UTF-8 decoders + minor cleanups.
Verify that both UTF-8 decoders (incremental and non-incremental one) match the expectations. Also cleanup / harden the UTF-8 handling code, as suggested in https://chromium-review.googlesource.com/c/v8/v8/+/671020/ . BUG=chromium:765608 Change-Id: I6344d62ca15b75ac8e333421c94c4aa35ab8190d Reviewed-on: https://chromium-review.googlesource.com/681217 Commit-Queue: Marja Hölttä <marja@chromium.org> Reviewed-by: Camillo Bruni <cbruni@chromium.org> Cr-Commit-Position: refs/heads/master@{#48229}
1 parent 8b749bf commit fcb89f5

3 files changed

Lines changed: 425 additions & 20 deletions

File tree

src/unicode.cc

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -197,27 +197,27 @@ static inline uint8_t NonASCIISequenceLength(byte first) {
197197
// clang-format off
198198
static const uint8_t lengths[256] = {
199199
// The first 128 entries correspond to ASCII characters.
200-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* OO - Of */
201+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10 - 1f */
202+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2f */
203+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30 - 3f */
204+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4f */
205+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50 - 5f */
206+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6f */
207+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70 - 7f */
208208
// The following 64 entries correspond to continuation bytes.
209-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
211-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 - 8f */
210+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 - 9f */
211+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 - af */
212+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 - bf */
213213
// The next are two invalid overlong encodings and 30 two-byte sequences.
214-
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
215-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
214+
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0-c1 + c2-cf */
215+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0-df */
216216
// 16 three-byte sequences.
217-
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* e0-ef */
218218
// 5 four-byte sequences, followed by sequences that could only encode
219219
// code points outside of the unicode range.
220-
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
220+
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */
221221
// clang-format on
222222
return lengths[first];
223223
}
@@ -322,7 +322,8 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
322322
if (*buffer == 0) {
323323
// We're at the start of a new character.
324324
uint32_t kind = NonASCIISequenceLength(next);
325-
if (kind >= 2 && kind <= 4) {
325+
CHECK_LE(kind, 4);
326+
if (kind >= 2) {
326327
// Start of 2..4 byte character, and no buffer.
327328

328329
// The mask for the lower bits depends on the kind, and is
@@ -333,7 +334,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
333334
// Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
334335
// in 2nd nibble, and the value in the bottom three. The 2nd nibble is
335336
// intended as a counter about how many bytes are still needed.
336-
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
337+
uint32_t character_info = kind << 28 | (kind - 1) << 24;
338+
DCHECK_EQ(character_info & mask, 0);
339+
*buffer = character_info | (next & mask);
337340
return kIncomplete;
338341
} else {
339342
// No buffer, and not the start of a 1-byte char (handled at the

src/unicode.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,7 @@ class Utf16 {
127127
}
128128
};
129129

130-
131-
class Utf8 {
130+
class V8_EXPORT_PRIVATE Utf8 {
132131
public:
133132
static inline uchar Length(uchar chr, int previous);
134133
static inline unsigned EncodeOneByte(char* out, uint8_t c);

0 commit comments

Comments
 (0)