regex: Support alternate \u{...} syntax

ebernhardson · ebernhardson · commit 25573cddba32 · 2025-11-18T13:15:15.000-08:00
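The first variation of Unicode expansion added support for \uHHHH
style expansion. We've since had requests to also support \u{...}
expansion, which serves the same purpose but has different
semantics. At a relatively abstract level, the new syntax takes
code points as input, while the initial syntax is closer to UTF-16:
each \uHHHH names one 16-bit code unit, so a character outside the
BMP needs a surrogate pair (\uD83D\uDE00) where the new syntax needs
a single escape (\u{1F600}).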

Bug: T403212
Change-Id: I37eeacd69dc7f86826ae45e48ce792bb32ef835f
diff --git a/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java b/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java
@@ -82,7 +82,7 @@ static CharSequence replaceAnchors(CharSequence input) {
     }
 
     private static boolean isValidHexSubSequence(CharSequence input, int i, int len) {
-        if (i + len > input.length()) {
+        if (len < 1 || i + len > input.length()) {
             return false;
         }
         CharSequence maybeHex = input.subSequence(i, i + len);
@@ -100,37 +100,96 @@ static CharSequence expandEscapeCodes(CharSequence input) {
 
             if (parse.inLiteral) {
                 result.append(c);
-            } else if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
+                continue;
+            }
+
+            if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
                 result.setLength(result.length() - 1);
                 result.append(ESCAPE_CODES.get(c));
-            } else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
+                continue;
+            }
+
+            if (parse.escaped && c == 'u' && i + 1 < input.length() && input.charAt(i + 1) == '{') {
+                // Find closing brace
+                int braceEnd = -1;
+                for (int j = i + 2; j < input.length() && j < i + 10; j++) { // limit search
+                    if (input.charAt(j) == '}') {
+                        braceEnd = j;
+                        break;
+                    }
+                }
+                if (braceEnd < i + 2) {
+                    throw new IllegalArgumentException("Missing closing brace of \\u{...} escape sequence");
+                }
+                if (!isValidHexSubSequence(input, i + 2, braceEnd - i - 2)) {
+                    throw new IllegalArgumentException("Invalid hex content in \\u{...} escape sequence");
+                }
+
+                String hex = input.subSequence(i + 2, braceEnd).toString();
+                int codePoint = Integer.parseInt(hex, 16);
+                // Java is very flexible, so while > 0x10FFFF will throw, these reserved points do not
+                // even though they are not valid codepoints.
+                if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
+                    throw new IllegalArgumentException("Not a valid Unicode code point in \\u{...} escape sequence");
+                }
+                // modern java would fail this at Character.toChars, but CI still includes java 8 which does
+                // not. To get consistency for test cases, check explicitly.
+                if (codePoint > 0x10FFFF) {
+                    throw new IllegalArgumentException("Not a valid Unicode code point in \\u{...} escape sequence");
+                }
+                result.setLength(result.length() - 1); // remove the \
+                result.append('\\');
+                result.append(Character.toChars(codePoint));
+                i = braceEnd; // skip to closing brace
+                continue;
+            }
+
+            if (parse.escaped && c == 'u') {
+                if (!isValidHexSubSequence(input, i + 1, 4)) {
+                    throw new IllegalArgumentException("Invalid hex content in \\uHHHH escape sequence");
+                }
                 String hex = input.subSequence(i + 1, i + 5).toString();
                 char firstChar = (char) Integer.parseInt(hex, 16);
+                if (Character.isLowSurrogate(firstChar)) {
+                    // low surrogate can only follow high surrogate
+                    throw new IllegalArgumentException("Invalid low surrogate in \\uHHHH escape sequence");
+                }
                 result.setLength(result.length() - 1);
                 // prepending \ treats it as a literal value. Yes this is probably the same char that
                 // was removed with setLength(n-1), but explicit seems better than implicit.
                 result.append('\\');
                 result.append(firstChar);
                 i += 4;
 
+                if (!Character.isHighSurrogate(firstChar)) {
+                    continue;
+                }
                 // directly handle paired surrogate, otherwise the above would
                 // inject a \ inside the pair.
-                if (Character.isHighSurrogate(firstChar) &&
-                    i + 2 < input.length()  &&
-                    input.charAt(i + 1) == '\\' &&
-                    input.charAt(i + 2) == 'u' &&
-                    isValidHexSubSequence(input, i + 3, 4)
+                if (
+                    i + 2 >= input.length()  ||
+                    input.charAt(i + 1) != '\\' ||
+                    input.charAt(i + 2) != 'u'
                 ) {
-                    String lowHex = input.subSequence(i + 3, i + 7).toString();
-                    char secondChar = (char) Integer.parseInt(lowHex, 16);
-                    if (Character.isLowSurrogate(secondChar)) {
-                        result.append(secondChar);
-                        i += 6; // Skip the \\uHHHH for the low surrogate
-                    }
+                    throw new IllegalArgumentException(
+                        "High surrogate must be followed with low surrogate in \\uHHHH escape sequence");
                 }
-            } else {
-                result.append(c);
+                if (!isValidHexSubSequence(input, i + 3, 4)) {
+                    throw new IllegalArgumentException("Invalid hex content in \\uHHHH escape sequence");
+                }
+                String lowHex = input.subSequence(i + 3, i + 7).toString();
+                char secondChar = (char) Integer.parseInt(lowHex, 16);
+                if (!Character.isLowSurrogate(secondChar)) {
+                    throw new IllegalArgumentException(
+                        "High surrogate must be followed with low surrogate in \\uHHHH escape sequence");
+                }
+                result.append(secondChar);
+                i += 6; // Skip the \\uHHHH for the low surrogate
+                continue;
             }
+
+            // Default action
+            result.append(c);
         }
         return result.toString();
     }
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java
@@ -50,6 +50,25 @@ private void assertNoPatternMatch(Map<String, String> sources, String regex) {
         assertPatternMatch(sources, regex);
     }
 
+    private void assertLuceneMatch(Map<String, String> sources, String regex, String... expected) {
+        // Some things are not supported by java Pattern, or depend on JVM version, but we still
+        // want to assert that the resulting rewritten pattern matches what we expect it to.
+        boolean replaceAnchors = true;
+        CharacterRunAutomaton charRun = buildLuceneRegex(regex, replaceAnchors);
+        UnaryOperator<String> valueTransform = replaceAnchors ? RegexRewriter::anchorTransformation : s -> s;
+        for (Map.Entry<String, String> entry : sources.entrySet()) {
+            boolean expectMatch = Arrays.stream(expected).anyMatch(docid -> docid.equals(entry.getKey()));
+            boolean luceneMatch = charRun.run(valueTransform.apply(entry.getValue()));
+            assertThat(luceneMatch)
+                .describedAs("lucene regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
+                .isEqualTo(expectMatch);
+        }
+    }
+
+    private void assertNoLuceneMatch(Map<String, String> sources, String regex) {
+        assertLuceneMatch(sources, regex);
+    }
+
     @Test
     void testPatternEquivalence() {
         Map<String, String> sources = new HashMap<>();
@@ -109,7 +128,7 @@ void testPatternEquivalence() {
         assertPatternMatch(sources, "\\u000a", "newline", "multiline");
         // multiline match
         assertPatternMatch(sources, "qwe[\\r\\n]+rty", "multiline");
-        // expansion of regex syntax, \u002e is '.' but must not be treated as the any-match
+        // expansion of regex syntax, \\u002e is '.' but must not be treated as the any-match
         assertNoPatternMatch(sources, "abcde\\u002e");
         // expansion of the expected char should match
         assertPatternMatch(sources, "abcde\\u0066", "findme");
@@ -130,13 +149,15 @@ void testUnicode() {
         assertPatternMatch(sources, "[😀]", "emoji");
         // Can find pairs inside [] if provided
         assertPatternMatch(sources, "[\\uD83D\\uDE00]", "emoji");
+        assertLuceneMatch(sources, "[\\u{1F600}]", "emoji");
         // Can find emoji as direct string
         assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
+        assertLuceneMatch(sources, "\\u{1f600}", "emoji");
         // Can find 3-byte character
         assertPatternMatch(sources, "\\u6c34", "water");
-        // Can not match on partial surrogate pairs (only equivalent on java 15+, prior to that
-        // the java Pattern class could match half pairs.)
-        // assertNoPatternMatch(sources, "\\uD83D");
-        // assertNoPatternMatch(sources, "\\uDE00");
+        assertLuceneMatch(sources, "\\u{6c34}", "water");
+        // Can use expanded chars in a range
+        assertPatternMatch(sources, "[\\u6c33-\\u6c35]", "water");
+        assertLuceneMatch(sources, "[\\u{6c33}-\\u{6c35}]", "water");
     }
 }
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java