Skip to content

Commit 4aeb94a

Browse files
jungshik authored and Commit Bot committed
Use ICU for ID_START, ID_CONTINUE and WhiteSpace check
Use ICU to check ID_Start, ID_Continue and WhiteSpace even for BMP when V8_INTL_SUPPORT is on (which is default). Change LineTerminator::Is() to check 4 code points from ES#sec-line-terminators instead of using tables and Lookup function. Remove Lowercase::Is(). It's not used anywhere. Update webkit/{ToNumber,parseFloat}.js to have the correct expectation for U+180E and the corresponding expected files. This is a follow-up to an earlier change ( https://codereview.chromium.org/2720953003 ). CQ_INCLUDE_TRYBOTS=master.tryserver.v8:v8_win_dbg,v8_mac_dbg;master.tryserver.chromium.android:android_arm64_dbg_recipe CQ_INCLUDE_TRYBOTS=master.tryserver.v8:v8_linux_noi18n_rel_ng BUG=v8:5370,v8:5155 TEST=unittests --gtest_filter=CharP* TEST=webkit: ToNumber, parseFloat TEST=test262: built-ins/Number/S9.3*, built-ins/parse{Int,Float}/S15* TEST=test262: language/white-space/mong* TEST=test262: built-ins/String/prototype/trim/u180e TEST=mjsunit: whitespaces Review-Url: https://codereview.chromium.org/2331303002 Cr-Commit-Position: refs/heads/master@{#45957}
1 parent 8e646bd commit 4aeb94a

11 files changed

Lines changed: 171 additions & 280 deletions

BUILD.gn

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2469,6 +2469,7 @@ v8_source_set("v8_base") {
24692469
} else {
24702470
sources -= [
24712471
"src/builtins/builtins-intl.cc",
2472+
"src/char-predicates.cc",
24722473
"src/intl.cc",
24732474
"src/intl.h",
24742475
"src/objects/intl-objects.cc",

src/char-predicates.cc

Lines changed: 26 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -2,41 +2,43 @@
22
// Use of this source code is governed by a BSD-style license that can be
33
// found in the LICENSE file.
44

5+
#ifndef V8_INTL_SUPPORT
6+
#error Internationalization is expected to be enabled.
7+
#endif // V8_INTL_SUPPORT
8+
59
#include "src/char-predicates.h"
610

7-
#ifdef V8_INTL_SUPPORT
811
#include "unicode/uchar.h"
912
#include "unicode/urename.h"
10-
#endif // V8_INTL_SUPPORT
1113

1214
namespace v8 {
1315
namespace internal {
1416

15-
bool SupplementaryPlanes::IsIDStart(uc32 c) {
16-
DCHECK(c > 0xFFFF);
17-
#ifdef V8_INTL_SUPPORT
18-
// This only works for code points in the SMPs, since ICU does not exclude
19-
// code points with properties 'Pattern_Syntax' or 'Pattern_White_Space'.
20-
// Code points in the SMP do not have those properties.
21-
return u_isIDStart(c);
22-
#else
23-
// This is incorrect, but if we don't have ICU, use this as fallback.
24-
return false;
25-
#endif // V8_INTL_SUPPORT
17+
// ES#sec-names-and-keywords Names and Keywords
18+
// UnicodeIDStart, '$', '_' and '\'
19+
bool IdentifierStart::Is(uc32 c) {
20+
// Cannot use u_isIDStart because it does not work for
21+
// Other_ID_Start characters.
22+
return u_hasBinaryProperty(c, UCHAR_ID_START) ||
23+
(c < 0x60 && (c == '$' || c == '\\' || c == '_'));
2624
}
2725

26+
// ES#sec-names-and-keywords Names and Keywords
27+
// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
28+
bool IdentifierPart::Is(uc32 c) {
29+
// Can't use u_isIDPart because it does not work for
30+
// Other_ID_Continue characters.
31+
return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) ||
32+
(c < 0x60 && (c == '$' || c == '\\' || c == '_')) || c == 0x200C ||
33+
c == 0x200D;
34+
}
2835

29-
bool SupplementaryPlanes::IsIDPart(uc32 c) {
30-
DCHECK(c > 0xFFFF);
31-
#ifdef V8_INTL_SUPPORT
32-
// This only works for code points in the SMPs, since ICU does not exclude
33-
// code points with properties 'Pattern_Syntax' or 'Pattern_White_Space'.
34-
// Code points in the SMP do not have those properties.
35-
return u_isIDPart(c);
36-
#else
37-
// This is incorrect, but if we don't have ICU, use this as fallback.
38-
return false;
39-
#endif // V8_INTL_SUPPORT
36+
// ES#sec-white-space White Space
37+
// gC=Zs, U+0009, U+000B, U+000C, U+FEFF
38+
bool WhiteSpace::Is(uc32 c) {
39+
return (u_charType(c) == U_SPACE_SEPARATOR) ||
40+
(c < 0x0D && (c == 0x09 || c == 0x0B || c == 0x0C)) || c == 0xFEFF;
4041
}
42+
4143
} // namespace internal
4244
} // namespace v8

src/char-predicates.h

Lines changed: 27 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -26,53 +26,58 @@ inline bool IsBinaryDigit(uc32 c);
2626
inline bool IsRegExpWord(uc32 c);
2727
inline bool IsRegExpNewline(uc32 c);
2828

29-
struct V8_EXPORT_PRIVATE SupplementaryPlanes {
30-
static bool IsIDStart(uc32 c);
31-
static bool IsIDPart(uc32 c);
32-
};
33-
34-
35-
// ES6 draft section 11.6
29+
// ES#sec-names-and-keywords
3630
// This includes '_', '$' and '\', and ID_Start according to
3731
// http://www.unicode.org/reports/tr31/, which consists of categories
3832
// 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
3933
// 'Pattern_Syntax' or 'Pattern_White_Space'.
40-
// For code points in the SMPs, we can resort to ICU (if available).
34+
#ifdef V8_INTL_SUPPORT
35+
struct V8_EXPORT_PRIVATE IdentifierStart {
36+
static bool Is(uc32 c);
37+
#else
4138
struct IdentifierStart {
39+
// Non-BMP characters are not supported without I18N.
4240
static inline bool Is(uc32 c) {
43-
if (c > 0xFFFF) return SupplementaryPlanes::IsIDStart(c);
44-
return unibrow::ID_Start::Is(c);
41+
return (c <= 0xFFFF) ? unibrow::ID_Start::Is(c) : false;
4542
}
43+
#endif
4644
};
4745

48-
49-
// ES6 draft section 11.6
46+
// ES#sec-names-and-keywords
5047
// This includes \u200c and \u200d, and ID_Continue according to
5148
// http://www.unicode.org/reports/tr31/, which consists of ID_Start,
5249
// the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
5350
// 'Pattern_Syntax' or 'Pattern_White_Space'.
54-
// For code points in the SMPs, we can resort to ICU (if available).
51+
#ifdef V8_INTL_SUPPORT
52+
struct V8_EXPORT_PRIVATE IdentifierPart {
53+
static bool Is(uc32 c);
54+
#else
5555
struct IdentifierPart {
5656
static inline bool Is(uc32 c) {
57-
if (c > 0xFFFF) return SupplementaryPlanes::IsIDPart(c);
58-
return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
57+
// Non-BMP characters are not supported without I18N.
58+
if (c <= 0xFFFF) {
59+
return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
60+
}
61+
return false;
5962
}
63+
#endif
6064
};
6165

62-
6366
// ES6 draft section 11.2
6467
// This includes all code points of Unicode category 'Zs'.
65-
// \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
66-
// so it is also included.
67-
// Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
68-
// There are no category 'Zs' code points in the SMPs.
68+
// Further included are \u0009, \u000b, \u000c, and \ufeff.
69+
#ifdef V8_INTL_SUPPORT
70+
struct V8_EXPORT_PRIVATE WhiteSpace {
71+
static bool Is(uc32 c);
72+
#else
6973
struct WhiteSpace {
7074
static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
75+
#endif
7176
};
7277

73-
7478
// WhiteSpace and LineTerminator according to ES6 draft section 11.2 and 11.3
75-
// This consists of \000a, \000d, \u2028, and \u2029.
79+
// This includes all the characters with Unicode category 'Z' (= Zs+Zl+Zp)
80+
// as well as \u0009 - \u000d and \ufeff.
7681
struct WhiteSpaceOrLineTerminator {
7782
static inline bool Is(uc32 c) {
7883
return WhiteSpace::Is(c) || unibrow::LineTerminator::Is(c);

0 commit comments

Comments
 (0)