Skip to content

Commit 25573cd

Browse files
committed
regex: Support alternate \u{...} syntax
The first variation of Unicode expansion added support for \uHHHH style expansion. We've since had requests to also support \u{...} expansion, which serves the same purpose but has different semantics. At a relatively abstract level, this new syntax takes Unicode code points as input, while the initial syntax is closer to UTF-16 (each \uHHHH is a single UTF-16 code unit, so characters outside the BMP need a surrogate pair). Bug: T403212 Change-Id: I37eeacd69dc7f86826ae45e48ce792bb32ef835f
1 parent afcef31 commit 25573cd

File tree

3 files changed

+219
-53
lines changed

3 files changed

+219
-53
lines changed

lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ static CharSequence replaceAnchors(CharSequence input) {
8282
}
8383

8484
private static boolean isValidHexSubSequence(CharSequence input, int i, int len) {
85-
if (i + len > input.length()) {
85+
if (len < 1 || i + len > input.length()) {
8686
return false;
8787
}
8888
CharSequence maybeHex = input.subSequence(i, i + len);
@@ -100,37 +100,96 @@ static CharSequence expandEscapeCodes(CharSequence input) {
100100

101101
if (parse.inLiteral) {
102102
result.append(c);
103-
} else if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
103+
continue;
104+
}
105+
106+
if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
104107
result.setLength(result.length() - 1);
105108
result.append(ESCAPE_CODES.get(c));
106-
} else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
109+
continue;
110+
}
111+
112+
if (parse.escaped && c == 'u' && i + 1 < input.length() && input.charAt(i + 1) == '{') {
113+
// Find closing brace
114+
int braceEnd = -1;
115+
for (int j = i + 2; j < input.length() && j < i + 10; j++) { // limit search
116+
if (input.charAt(j) == '}') {
117+
braceEnd = j;
118+
break;
119+
}
120+
}
121+
if (braceEnd < i + 2) {
122+
throw new IllegalArgumentException("Missing closing brace of \\u{...} escape sequence");
123+
}
124+
if (!isValidHexSubSequence(input, i + 2, braceEnd - i - 2)) {
125+
throw new IllegalArgumentException("Invalid hex content in \\u{...} escape sequence");
126+
}
127+
128+
String hex = input.subSequence(i + 2, braceEnd).toString();
129+
int codePoint = Integer.parseInt(hex, 16);
130+
// Java is very flexible, so while > 0x10FFFF will throw, these reserved points do not
131+
// even though they are not valid codepoints.
132+
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
133+
throw new IllegalArgumentException("Not a valid Unicode code point in \\u{...} escape sequence");
134+
}
135+
// modern java would fail this at Character.toChars, but CI still includes java 8 which does
136+
// not. To get consistency for test cases, check explicitly.
137+
if (codePoint > 0x10FFFF) {
138+
throw new IllegalArgumentException("Not a valid Unicode code point in \\u{...} escape sequence");
139+
}
140+
result.setLength(result.length() - 1); // remove the \
141+
result.append('\\');
142+
result.append(Character.toChars(codePoint));
143+
i = braceEnd; // skip to closing brace
144+
continue;
145+
}
146+
147+
if (parse.escaped && c == 'u') {
148+
if (!isValidHexSubSequence(input, i + 1, 4)) {
149+
throw new IllegalArgumentException("Invalid hex content in \\uHHHH escape sequence");
150+
}
107151
String hex = input.subSequence(i + 1, i + 5).toString();
108152
char firstChar = (char) Integer.parseInt(hex, 16);
153+
if (Character.isLowSurrogate(firstChar)) {
154+
// low surrogate can only follow high surrogate
155+
throw new IllegalArgumentException("Invalid low surrogate in \\uHHHH escape sequence");
156+
}
109157
result.setLength(result.length() - 1);
110158
// prepending \ treats it as a literal value. Yes this is probably the same char that
111159
// was removed with setLength(n-1), but explicit seems better than implicit.
112160
result.append('\\');
113161
result.append(firstChar);
114162
i += 4;
115163

164+
if (!Character.isHighSurrogate(firstChar)) {
165+
continue;
166+
}
116167
// directly handle paired surrogate, otherwise the above would
117168
// inject a \ inside the pair.
118-
if (Character.isHighSurrogate(firstChar) &&
119-
i + 2 < input.length() &&
120-
input.charAt(i + 1) == '\\' &&
121-
input.charAt(i + 2) == 'u' &&
122-
isValidHexSubSequence(input, i + 3, 4)
169+
if (
170+
i + 2 >= input.length() ||
171+
input.charAt(i + 1) != '\\' ||
172+
input.charAt(i + 2) != 'u'
123173
) {
124-
String lowHex = input.subSequence(i + 3, i + 7).toString();
125-
char secondChar = (char) Integer.parseInt(lowHex, 16);
126-
if (Character.isLowSurrogate(secondChar)) {
127-
result.append(secondChar);
128-
i += 6; // Skip the \\uHHHH for the low surrogate
129-
}
174+
throw new IllegalArgumentException(
175+
"High surrogate must be followed with low surrogate in \\uHHHH escape sequence");
130176
}
131-
} else {
132-
result.append(c);
177+
if (!isValidHexSubSequence(input, i + 3, 4)) {
178+
throw new IllegalArgumentException("Invalid hex content in \\uHHHH escape sequence");
179+
}
180+
String lowHex = input.subSequence(i + 3, i + 7).toString();
181+
char secondChar = (char) Integer.parseInt(lowHex, 16);
182+
if (!Character.isLowSurrogate(secondChar)) {
183+
throw new IllegalArgumentException(
184+
"High surrogate must be followed with low surrogate in \\uHHHH escape sequence");
185+
}
186+
result.append(secondChar);
187+
i += 6; // Skip the \\uHHHH for the low surrogate
188+
continue;
133189
}
190+
191+
// Default action
192+
result.append(c);
134193
}
135194
return result.toString();
136195
}

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,25 @@ private void assertNoPatternMatch(Map<String, String> sources, String regex) {
5050
assertPatternMatch(sources, regex);
5151
}
5252

53+
private void assertLuceneMatch(Map<String, String> sources, String regex, String... expected) {
54+
// Some things are not supported by java Pattern, or depend on JVM version, but we still
55+
// want to assert that the resulting rewritten pattern matches what we expect it to.
56+
boolean replaceAnchors = true;
57+
CharacterRunAutomaton charRun = buildLuceneRegex(regex, replaceAnchors);
58+
UnaryOperator<String> valueTransform = replaceAnchors ? RegexRewriter::anchorTransformation : s -> s;
59+
for (Map.Entry<String, String> entry : sources.entrySet()) {
60+
boolean expectMatch = Arrays.stream(expected).anyMatch(docid -> docid.equals(entry.getKey()));
61+
boolean luceneMatch = charRun.run(valueTransform.apply(entry.getValue()));
62+
assertThat(luceneMatch)
63+
.describedAs("lucene regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
64+
.isEqualTo(expectMatch);
65+
}
66+
}
67+
68+
private void assertNoLuceneMatch(Map<String, String> sources, String regex) {
69+
assertLuceneMatch(sources, regex);
70+
}
71+
5372
@Test
5473
void testPatternEquivalence() {
5574
Map<String, String> sources = new HashMap<>();
@@ -109,7 +128,7 @@ void testPatternEquivalence() {
109128
assertPatternMatch(sources, "\\u000a", "newline", "multiline");
110129
// multiline match
111130
assertPatternMatch(sources, "qwe[\\r\\n]+rty", "multiline");
112-
// expansion of regex syntax, \u002e is '.' but must not be treated as the any-match
131+
// expansion of regex syntax, \\u002e is '.' but must not be treated as the any-match
113132
assertNoPatternMatch(sources, "abcde\\u002e");
114133
// expansion of the expected char should match
115134
assertPatternMatch(sources, "abcde\\u0066", "findme");
@@ -130,13 +149,15 @@ void testUnicode() {
130149
assertPatternMatch(sources, "[😀]", "emoji");
131150
// Can find pairs inside [] if provided
132151
assertPatternMatch(sources, "[\\uD83D\\uDE00]", "emoji");
152+
assertLuceneMatch(sources, "[\\u{1F600}]", "emoji");
133153
// Can find emoji as direct string
134154
assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
155+
assertLuceneMatch(sources, "\\u{1f600}", "emoji");
135156
// Can find 3-byte character
136157
assertPatternMatch(sources, "\\u6c34", "water");
137-
// Can not match on partial surrogate pairs (only equivalent on java 15+, prior to that
138-
// the java Pattern class could match half pairs.)
139-
// assertNoPatternMatch(sources, "\\uD83D");
140-
// assertNoPatternMatch(sources, "\\uDE00");
158+
assertLuceneMatch(sources, "\\u{6c34}", "water");
159+
// Can use expanded chars in a range
160+
assertPatternMatch(sources, "[\\u6c33-\\u6c35]", "water");
161+
assertLuceneMatch(sources, "[\\u{6c33}-\\u{6c35}]", "water");
141162
}
142163
}

0 commit comments

Comments
 (0)