regex: Fix expansion of multibyte characters

ebernhardson · ebernhardson · commit afcef31d8f92 · 2025-11-17T09:44:12.000-08:00
The expansion routine was mistakenly forcing every \u expansion to
be preceded by a \, to tell the regex engine to treat the
resulting character as a literal. For multi-byte characters written
as a surrogate pair this injected a \ between the two halves.

Instead check if we have a high surrogate, and if so check for a
following low surrogate and expand them as a pair.

Bug: T403212
Change-Id: I612c24a4b1c7035341e318110c1442c5ea154a45
diff --git a/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java b/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java
@@ -89,8 +89,8 @@ private static boolean isValidHexSubSequence(CharSequence input, int i, int len)
         return  maybeHex.chars().allMatch((hexChar) -> Character.digit(hexChar, 16) >= 0);
     }
 
-    @SuppressWarnings({"ModifiedControlVariable"})
     @SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
+    @SuppressWarnings({"ModifiedControlVariable", "CyclomaticComplexity", "NPathComplexity"})
     static CharSequence expandEscapeCodes(CharSequence input) {
         StringBuilder result = new StringBuilder();
         RegexParseState parse = new RegexParseState(input);
@@ -105,12 +105,29 @@ static CharSequence expandEscapeCodes(CharSequence input) {
                 result.append(ESCAPE_CODES.get(c));
             } else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
                 String hex = input.subSequence(i + 1, i + 5).toString();
-                int cp = Integer.parseInt(hex, 16);
+                char firstChar = (char) Integer.parseInt(hex, 16);
                 result.setLength(result.length() - 1);
-                // prepending \ treats it as a literal value.
+                // prepending \ treats it as a literal value. Yes this is probably the same char that
+                // was removed with setLength(n-1), but explicit seems better than implicit.
                 result.append('\\');
-                result.append((char) cp);
+                result.append(firstChar);
                 i += 4;
+
+                // directly handle paired surrogate, otherwise the above would
+                // inject a \ inside the pair.
+                if (Character.isHighSurrogate(firstChar) &&
+                    i + 2 < input.length()  &&
+                    input.charAt(i + 1) == '\\' &&
+                    input.charAt(i + 2) == 'u' &&
+                    isValidHexSubSequence(input, i + 3, 4)
+                ) {
+                    String lowHex = input.subSequence(i + 3, i + 7).toString();
+                    char secondChar = (char) Integer.parseInt(lowHex, 16);
+                    if (Character.isLowSurrogate(secondChar)) {
+                        result.append(secondChar);
+                        i += 6; // Skip the \\uHHHH for the low surrogate
+                    }
+                }
             } else {
                 result.append(c);
             }
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java
@@ -126,7 +126,11 @@ void testUnicode() {
         sources.put("emoji", "😀");
         sources.put("water", "水");
 
-        // Can find emoji
+        // Can find pairs inside [] if provided natively (validate lucene behaviour)
+        assertPatternMatch(sources, "[😀]", "emoji");
+        // Can find pairs inside [] if provided as \\u escapes
+        assertPatternMatch(sources, "[\\uD83D\\uDE00]", "emoji");
+        // Can find emoji as direct string
         assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
         // Can find 3-byte character
         assertPatternMatch(sources, "\\u6c34", "water");
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java
@@ -345,19 +345,19 @@ void testEdgeCases() {
 
     @Test
     void testUnicodeSurrogatePairs() {
+
         // unicode escapes for characters beyond BMP
-        // (joined with + because otherwise the compiler complains of illegal escape character)
-        assertEscapeExpansion("\\\uD835" + "\\\uDC00", "\\uD835\\uDC00"); // Mathematical bold A
-        assertEscapeExpansion("\\\uD83D" + "\\\uDE00", "\\uD83D\\uDE00"); // Grinning face emoji
-        assertEscapeExpansion("\\\uD835" + "\\\uDFCF" + "\\\uD835" + "\\\uDFD0", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
+        assertEscapeExpansion("\\𝐀", "\\uD835\\uDC00"); // Mathematical bold A
+        assertEscapeExpansion("\\😀", "\\uD83D\\uDE00"); // Grinning face emoji
+        assertEscapeExpansion("\\𝟏\\𝟐", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
         // incomplete surrogate pairs (they expand! maybe not ideal). In testing the
         // downstream regex engine will not match half a surrogate pair.
         assertEscapeExpansion("\\\uD83D", "\\uD83D"); // High surrogate without low
         assertEscapeExpansion("\\\uDE00", "\\uDE00"); // Low surrogate without high
         // invalid surrogate sequences (also expands! also won't match anything).
         assertEscapeExpansion("\\\uuD83D\n", "\\uD83D\\n"); // High surrogate + regular escape
         // surrogate pairs in character classes (see also RegexEquivalenceTest.testUnicode)
-        assertEscapeExpansion("[\\\uD83D" + "\\\uDE00]", "[\\uD83D\\uDE00]");
+        assertEscapeExpansion("[\\😀]", "[\\uD83D\\uDE00]");
         // surrogate pairs in quoted literals (should not expand)
         assertNoEscapeExpansion("\"\\uD83D\\uDE00\"");
     }