@@ -228,52 +228,80 @@ static inline bool IsContinuationCharacter(byte chr) {
228228// This method decodes an UTF-8 value according to RFC 3629.
229229uchar Utf8::CalculateValue (const byte* str, size_t max_length, size_t * cursor) {
230230 size_t length = NonASCIISequenceLength (str[0 ]);
231-
232- // Check continuation characters.
233- size_t max_count = std::min (length, max_length);
234- size_t count = 1 ;
235- while (count < max_count && IsContinuationCharacter (str[count])) {
236- count++;
231+ if (length == 0 || max_length < length) {
232+ *cursor += 1 ;
233+ return kBadChar ;
237234 }
238- *cursor += count;
239-
240- // There must be enough continuation characters.
241- if (count != length) return kBadChar ;
242-
243- // Check overly long sequences & other conditions.
244- if (length == 3 ) {
245- if (str[0 ] == 0xE0 && (str[1 ] < 0xA0 || str[1 ] > 0xBF )) {
246- // Overlong three-byte sequence?
235+ if (length == 2 ) {
236+ if (!IsContinuationCharacter (str[1 ])) {
237+ *cursor += 1 ;
247238 return kBadChar ;
248- } else if (str[0 ] == 0xED && (str[1 ] < 0x80 || str[1 ] > 0x9F )) {
249- // High and low surrogate halves?
239+ }
240+ *cursor += 2 ;
241+ return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
242+ }
243+ if (length == 3 ) {
244+ switch (str[0 ]) {
245+ case 0xE0 :
246+ // Overlong three-byte sequence.
247+ if (str[1 ] < 0xA0 || str[1 ] > 0xBF ) {
248+ *cursor += 1 ;
249+ return kBadChar ;
250+ }
251+ break ;
252+ case 0xED :
253+ // High and low surrogate halves.
254+ if (str[1 ] < 0x80 || str[1 ] > 0x9F ) {
255+ *cursor += 1 ;
256+ return kBadChar ;
257+ }
258+ break ;
259+ default :
260+ if (!IsContinuationCharacter (str[1 ])) {
261+ *cursor += 1 ;
262+ return kBadChar ;
263+ }
264+ }
265+ if (!IsContinuationCharacter (str[2 ])) {
266+ *cursor += 1 ;
250267 return kBadChar ;
251268 }
252- } else if (length == 4 ) {
253- if (str[0 ] == 0xF0 && (str[1 ] < 0x90 || str[1 ] > 0xBF )) {
269+ *cursor += 3 ;
270+ return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
271+ }
272+ DCHECK (length == 4 );
273+ switch (str[0 ]) {
274+ case 0xF0 :
254275 // Overlong four-byte sequence.
255- return kBadChar ;
256- } else if (str[0 ] == 0xF4 && (str[1 ] < 0x80 || str[1 ] > 0x8F )) {
276+ if (str[1 ] < 0x90 || str[1 ] > 0xBF ) {
277+ *cursor += 1 ;
278+ return kBadChar ;
279+ }
280+ break ;
281+ case 0xF4 :
257282 // Code points outside of the unicode range.
258- return kBadChar ;
259- }
283+ if (str[1 ] < 0x80 || str[1 ] > 0x8F ) {
284+ *cursor += 1 ;
285+ return kBadChar ;
286+ }
287+ break ;
288+ default :
289+ if (!IsContinuationCharacter (str[1 ])) {
290+ *cursor += 1 ;
291+ return kBadChar ;
292+ }
260293 }
261-
262- // All errors have been handled, so we only have to assemble the result.
263- switch (length) {
264- case 1 :
265- return str[0 ];
266- case 2 :
267- return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
268- case 3 :
269- return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
270- case 4 :
271- return ((str[0 ] << 18 ) + (str[1 ] << 12 ) + (str[2 ] << 6 ) + str[3 ]) -
272- 0x03C82080 ;
294+ if (!IsContinuationCharacter (str[2 ])) {
295+ *cursor += 1 ;
296+ return kBadChar ;
273297 }
274-
275- UNREACHABLE ();
276- return kBadChar ;
298+ if (!IsContinuationCharacter (str[3 ])) {
299+ *cursor += 1 ;
300+ return kBadChar ;
301+ }
302+ *cursor += 4 ;
303+ return ((str[0 ] << 18 ) + (str[1 ] << 12 ) + (str[2 ] << 6 ) + str[3 ]) -
304+ 0x03C82080 ;
277305}
278306
279307uchar Utf8::ValueOfIncremental (byte next, Utf8IncrementalBuffer* buffer) {
@@ -295,10 +323,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
295323 // with one shift.
296324 uint8_t mask = 0x7f >> kind;
297325
298- // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
299- // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
300- // intended as a counter about how many bytes are still needed.
301- *buffer = kind << 28 | (kind - 1 ) << 24 | (next & mask);
326+ // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327+ // in the bottom three.
328+ *buffer = (kind - 1 ) << 24 | (next & mask);
302329 return kIncomplete ;
303330 } else {
304331 // No buffer, and not the start of a 1-byte char (handled at the
@@ -327,19 +354,15 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
327354 // We're inside of a character, as described by buffer.
328355
329356 // How many bytes (excluding this one) do we still expect?
330- uint8_t bytes_expected = *buffer >> 28 ;
331- uint8_t bytes_left = (*buffer >> 24 ) & 0x0f ;
332- bytes_left--;
357+ uint8_t count = (*buffer >> 24 ) - 1 ;
333358 // Update the value.
334359 uint32_t value = ((*buffer & 0xffffff ) << 6 ) | (next & 0x3F );
335- if (bytes_left ) {
336- *buffer = (bytes_expected << 28 | bytes_left << 24 | value) ;
360+ if (count ) {
361+ *buffer = count << 24 | value;
337362 return kIncomplete ;
338363 } else {
339364 *buffer = 0 ;
340- bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80 ) ||
341- (bytes_expected == 3 && value < 0x800 );
342- return sequence_was_too_long ? kBadChar : value;
365+ return value;
343366 }
344367 } else {
345368 // Within a character, but not a continuation character? Then the
0 commit comments