Skip to content

Commit afcef31

Browse files
committed
regex: Fix expansion of multibyte characters
The expansion routine was mistakenly forcing all \u expansion to be preceeded by a \, to tell the regex engine to treat the resulting character as a literal. When it comes to multi-byte characters this was injecting a \ between the two characters. Instead check if we have a high surrogate, and if so check for a following low surrogate and expand them as a pair. Bug: T403212 Change-Id: I612c24a4b1c7035341e318110c1442c5ea154a45
1 parent 9600080 commit afcef31

File tree

3 files changed

+31
-10
lines changed

3 files changed

+31
-10
lines changed

lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ private static boolean isValidHexSubSequence(CharSequence input, int i, int len)
8989
return maybeHex.chars().allMatch((hexChar) -> Character.digit(hexChar, 16) >= 0);
9090
}
9191

92-
@SuppressWarnings({"ModifiedControlVariable"})
9392
@SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
93+
@SuppressWarnings({"ModifiedControlVariable", "CyclomaticComplexity", "NPathComplexity"})
9494
static CharSequence expandEscapeCodes(CharSequence input) {
9595
StringBuilder result = new StringBuilder();
9696
RegexParseState parse = new RegexParseState(input);
@@ -105,12 +105,29 @@ static CharSequence expandEscapeCodes(CharSequence input) {
105105
result.append(ESCAPE_CODES.get(c));
106106
} else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
107107
String hex = input.subSequence(i + 1, i + 5).toString();
108-
int cp = Integer.parseInt(hex, 16);
108+
char firstChar = (char) Integer.parseInt(hex, 16);
109109
result.setLength(result.length() - 1);
110-
// prepending \ treats it as a literal value.
110+
// prepending \ treats it as a literal value. Yes this is probably the same char that
111+
// was removed with setLength(n-1), but explicit seems better than implicit.
111112
result.append('\\');
112-
result.append((char) cp);
113+
result.append(firstChar);
113114
i += 4;
115+
116+
// directly handle paired surrogate, otherwise the above would
117+
// inject a \ inside the pair.
118+
if (Character.isHighSurrogate(firstChar) &&
119+
i + 2 < input.length() &&
120+
input.charAt(i + 1) == '\\' &&
121+
input.charAt(i + 2) == 'u' &&
122+
isValidHexSubSequence(input, i + 3, 4)
123+
) {
124+
String lowHex = input.subSequence(i + 3, i + 7).toString();
125+
char secondChar = (char) Integer.parseInt(lowHex, 16);
126+
if (Character.isLowSurrogate(secondChar)) {
127+
result.append(secondChar);
128+
i += 6; // Skip the \\uHHHH for the low surrogate
129+
}
130+
}
114131
} else {
115132
result.append(c);
116133
}

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,11 @@ void testUnicode() {
126126
sources.put("emoji", "😀");
127127
sources.put("water", "水");
128128

129-
// Can find emoji
129+
// Can find pairs inside [] if provided natively (validate lucene behaviour)
130+
assertPatternMatch(sources, "[😀]", "emoji");
131+
// Can find pairs inside [] if provided
132+
assertPatternMatch(sources, "[\\uD83D\\uDE00]", "emoji");
133+
// Can find emoji as direct string
130134
assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
131135
// Can find 3-byte character
132136
assertPatternMatch(sources, "\\u6c34", "water");

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -345,19 +345,19 @@ void testEdgeCases() {
345345

346346
@Test
347347
void testUnicodeSurrogatePairs() {
348+
348349
// unicode escapes for characters beyond BMP
349-
// (joined with + because otherwise the compiler complains of illegal escape character)
350-
assertEscapeExpansion("\\\uD835" + "\\\uDC00", "\\uD835\\uDC00"); // Mathematical bold A
351-
assertEscapeExpansion("\\\uD83D" + "\\\uDE00", "\\uD83D\\uDE00"); // Grinning face emoji
352-
assertEscapeExpansion("\\\uD835" + "\\\uDFCF" + "\\\uD835" + "\\\uDFD0", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
350+
assertEscapeExpansion("\\𝐀", "\\uD835\\uDC00"); // Mathematical bold A
351+
assertEscapeExpansion("\\😀", "\\uD83D\\uDE00"); // Grinning face emoji
352+
assertEscapeExpansion("\\𝟏\\𝟐", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
353353
// incomplete surrogate pairs (they expand! maybe not ideal). In testing the
354354
// downstream regex engine will not match half a surrogate pair.
355355
assertEscapeExpansion("\\\uD83D", "\\uD83D"); // High surrogate without low
356356
assertEscapeExpansion("\\\uDE00", "\\uDE00"); // Low surrogate without high
357357
// invalid surrogate sequences (also expands! also won't match anything).
358358
assertEscapeExpansion("\\\uuD83D\n", "\\uD83D\\n"); // High surrogate + regular escape
359359
// surrogate pairs in character classes (see also RegexEquivalenceTest.testUnicode)
360-
assertEscapeExpansion("[\\\uD83D" + "\\\uDE00]", "[\\uD83D\\uDE00]");
360+
assertEscapeExpansion("[\\😀]", "[\\uD83D\\uDE00]");
361361
// surrogate pairs in quoted literals (should not expand)
362362
assertNoEscapeExpansion("\"\\uD83D\\uDE00\"");
363363
}

0 commit comments

Comments
 (0)