regex: Support escape code expansion

ebernhardson · ebernhardson · commit f244ef65a2fa · 2025-09-02T08:13:11.000-07:00
Adds a step to the regex rewriting that supports \r, \n, \t and
\uNNNN escape sequences. These are commonly available in other
regex engines and it will be convenient for users to have direct
access to them rather than current workarounds that involve
constructing character classes from nearby printable characters.

Bug: T403212
Change-Id: Idbef20454b6d02130bf5f596dffffdeb1e0f7382
diff --git a/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java b/lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java
@@ -11,13 +11,20 @@ public final class RegexRewriter {
     public static final char END_ANCHOR_MARKER = '\uFDD1';
 
     private static final Map<Character, String> CHAR_CLASSES;
+    private static final Map<Character, Character> ESCAPE_CODES;
 
     static {
         Map<Character, String> charClasses = new HashMap<>();
         charClasses.put('d', "0-9");
         charClasses.put('w', "A-Za-z0-9_");
         charClasses.put('s', "\f\n\r\t\u0011\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff");
         CHAR_CLASSES = Collections.unmodifiableMap(charClasses);
+
+        Map<Character, Character> escapeCodes = new HashMap<>(); // escape letter -> the control char it denotes
+        escapeCodes.put('r', '\r');
+        escapeCodes.put('n', '\n');
+        escapeCodes.put('t', '\t');
+        ESCAPE_CODES = Collections.unmodifiableMap(escapeCodes); // read-only; looked up by expandEscapeCodes
     }
 
     private RegexRewriter() {
@@ -34,13 +41,17 @@ public static String anchorTransformation(String input) {
      * Rewrites the provided regex to support character classes and optionally anchors.
      * If anchor support is enabled then RegexRewriter.anchorTransformation must be applied
      * to strings to be checked.
+     * It's perhaps inefficient that this rewrites the regex multiple times, but the
+     * implementation is easier to reason about as separate transformations.
      */
     public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
         CharSequence result = replaceCharClasses(regex);
         if (replaceAnchors) {
             result = replaceAnchors(result);
         }
-        return result;
+        // expandEscapeCodes must transform last so unicode expansions stay
+        // literals and can't be interpreted as char classes.
+        return expandEscapeCodes(result);
     }
 
     /**
@@ -70,6 +81,43 @@ static CharSequence replaceAnchors(CharSequence input) {
         return result.toString();
     }
 
+    private static boolean isValidHexSubSequence(CharSequence input, int i, int len) { // is input[i, i+len) all hex digits?
+        if (i + len > input.length()) { // not enough characters left for a full escape
+            return false;
+        } // NOTE(review): Character.digit below also accepts non-ASCII digits (e.g. fullwidth forms)
+        CharSequence maybeHex = input.subSequence(i, i + len);
+        return  maybeHex.chars().allMatch((hexChar) -> Character.digit(hexChar, 16) >= 0);
+    }
+
+    @SuppressWarnings({"ModifiedControlVariable"})
+    @SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
+    static CharSequence expandEscapeCodes(CharSequence input) { // expands \r, \n, \t and unicode escapes outside quoted literals
+        StringBuilder result = new StringBuilder();
+        RegexParseState parse = new RegexParseState(input);
+
+        for (int i = 0; i < input.length(); i++) {
+            char c = parse.next(i); // presumably also refreshes parse.inLiteral / parse.escaped for position i
+
+            if (parse.inLiteral) { // inside a quoted literal: copy through untouched
+                result.append(c);
+            } else if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
+                result.setLength(result.length() - 1); // drop the backslash appended on the previous iteration
+                result.append(ESCAPE_CODES.get(c));
+            } else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
+                String hex = input.subSequence(i + 1, i + 5).toString();
+                int cp = Integer.parseInt(hex, 16);
+                result.setLength(result.length() - 1);
+                // Re-emit the backslash so the expanded char stays an escaped literal (U+002E must not become any-char).
+                result.append('\\');
+                result.append((char) cp);
+                i += 4; // skip the four hex digits just consumed
+            } else {
+                result.append(c);
+            }
+        }
+        return result.toString();
+    }
+
     private static String expandCharClass(CharSequence charClass) {
         if (charClass.length() == 0) {
             return "[]";
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java
@@ -57,6 +57,8 @@ void testPatternEquivalence() {
         sources.put("numbers", "12345");
         sources.put("edgecase1", "Start^Middle$End");
         sources.put("edgecase2", "^foo bar$");
+        sources.put("newline", "\n");
+        sources.put("multiline", "qwe\n\nrty");
 
         // Basic start anchor
         assertPatternMatch(sources, "^abc", "findme");
@@ -94,13 +96,43 @@ void testPatternEquivalence() {
         // \d matches numbers
         assertPatternMatch(sources, "\\d", "numbers");
         // [^\d] matches not-numbers
-        assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2");
+        assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2", "newline", "multiline");
         // \s matches spaces
-        assertPatternMatch(sources, "\\s", "edgecase2");
+        assertPatternMatch(sources, "\\s", "edgecase2", "newline", "multiline");
         // [^\s] matches not-spaces
         assertPatternMatch(sources, "^[^\\s]+$", "findme", "numbers", "edgecase1");
         // \w matches word-like things, it does not match spaces or special chars
         assertPatternMatch(sources, "^\\w+$", "findme", "numbers");
-        assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2");
+        assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2", "newline", "multiline");
+        // escape code expansion
+        assertPatternMatch(sources, "\\n", "newline", "multiline");
+        assertPatternMatch(sources, "\\u000a", "newline", "multiline");
+        // multiline match
+        assertPatternMatch(sources, "qwe[\\r\\n]+rty", "multiline");
+        // expansion of regex syntax, \u002e is '.' but must not be treated as the any-match
+        assertNoPatternMatch(sources, "abcde\\u002e");
+        // expansion of the expected char should match
+        assertPatternMatch(sources, "abcde\\u0066", "findme");
+        // same but inside a character class
+        assertNoPatternMatch(sources, "abcde[\\u002e]");
+        assertPatternMatch(sources, "abcde[\\u0066]", "findme");
+        // Escaped backslash followed by "u0064" is literal text and must not be expanded (to 'd' or a \\d class)
+        assertNoPatternMatch(sources, "\\\\u0064");
+    }
+
+    @Test
+    void testUnicode() {
+        Map<String, String> sources = new HashMap<>();
+        sources.put("emoji", "😀");
+        sources.put("water", "水");
+
+        // Can find emoji when written as its two UTF-16 surrogate escapes
+        assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
+        // Can find a BMP character that takes 3 bytes in UTF-8
+        assertPatternMatch(sources, "\\u6c34", "water");
+        // Can not match on partial surrogate pairs (only equivalent on java 15+, prior to that
+        // the java Pattern class could match half pairs.)
+        // assertNoPatternMatch(sources, "\\uD83D");
+        // assertNoPatternMatch(sources, "\\uDE00");
+    }
 }
diff --git a/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java b/lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java
@@ -210,6 +210,126 @@ void testBackslashEscapeCounts() {
         assertNoAnchorReplacement("foo\\\\\\^bar");
     }
 
+    private void assertEscapeExpansion(String expected, String regex) { // expandEscapeCodes(regex) must equal expected
+        assertThat(RegexRewriter.expandEscapeCodes(regex)).isEqualTo(expected);
+    }
+
+    private void assertNoEscapeExpansion(String regex) { // expandEscapeCodes must hand regex back unchanged
+        assertThat(RegexRewriter.expandEscapeCodes(regex)).isEqualTo(regex);
+    }
+
+    @Test
+    void testEscapeExpansionLiteralString() { // escapes inside double-quoted literals must be left alone
+        // empty literal passthru
+        assertNoEscapeExpansion("\"\"");
+        // escape in a quoted literal should not expand
+        assertNoEscapeExpansion("\"\\t\"");
+        // multiple quoted literals should not expand
+        assertNoEscapeExpansion("\"\\t\" \"\\t\"");
+        // unclosed literal should not expand
+        assertNoEscapeExpansion("\"\\t");
+        // should expand before a literal
+        assertEscapeExpansion("\n\"\\n\"", "\\n\"\\n\"");
+        // should expand after a literal
+        assertEscapeExpansion("\"\\n\"\n", "\"\\n\"\\n");
+        // should expand between quoted literals
+        assertEscapeExpansion("\"\\t\"\n\"\\t\"", "\"\\t\"\\n\"\\t\"");
+    }
+
+    @Test
+    void testEscapeExpansionCharClasses() { // escapes inside [...] expand just like outside
+        // unclosed character class
+        assertEscapeExpansion("[\t", "[\\t");
+        // empty character class
+        assertNoEscapeExpansion("[]");
+        // escape in a character class should expand
+        assertEscapeExpansion("[\r\n]", "[\\r\\n]");
+        // nested char class (invalid but should handle)
+        assertEscapeExpansion("[\t[\n]]", "[\\t[\\n]]");
+    }
+
+    @Test
+    void testEscapeExpansionUnicodeEdgeCases() { // boundary cases for four-hex-digit unicode escapes
+        // Fewer than four hex digits before end of input: left unexpanded
+        assertNoEscapeExpansion("\\u123");
+        assertNoEscapeExpansion("\\u12");
+        assertNoEscapeExpansion("\\u1");
+        assertNoEscapeExpansion("\\u");
+
+        // Unicode with uppercase hex
+        assertEscapeExpansion("\\\u000C", "\\u000C");
+        assertEscapeExpansion("\\\u00FF", "\\u00FF");
+
+        // Unicode with mixed case
+        assertEscapeExpansion("\\\u00Af", "\\u00Af");
+
+        // Multiple unicode escapes
+        assertEscapeExpansion("\\\u0009\\\n", "\\u0009\\u000A");
+
+        // Unicode zero
+        assertEscapeExpansion("\\\u0000", "\\u0000");
+
+        // Unicode with non-hex after valid 4 chars
+        assertEscapeExpansion("\\\u0009g", "\\u0009g");
+
+        // Invalid hex characters
+        assertNoEscapeExpansion("\\u000g");
+        assertNoEscapeExpansion("\\u00g0");
+        assertNoEscapeExpansion("\\ug000");
+        assertNoEscapeExpansion("\\u000G"); // uppercase G invalid
+
+        // invalid followed by valid
+        assertEscapeExpansion("\\u00\\\u000e", "\\u00\\u000e");
+    }
+
+    @Test
+    void testEscapeExpansionBackslashCount() { // only an odd run of backslashes leaves the escape active
+        // only backslashes
+        assertNoEscapeExpansion("\\");
+        assertNoEscapeExpansion("\\\\");
+        // single backslash
+        assertEscapeExpansion("\n", "\\n");
+        // double backslash: an escaped backslash followed by a plain 'n'
+        assertNoEscapeExpansion("\\\\n");
+        // triple backslash: an escaped backslash followed by an active escape
+        assertEscapeExpansion("\\\\\n", "\\\\\\n");
+    }
+
+    @Test
+    void testEscapeExpansionStateTransitions() { // expansion must switch on/off correctly across construct boundaries
+        // Transition from literal to character class
+        assertEscapeExpansion("\"abc\"[\t]", "\"abc\"[\\t]");
+        // Transition from character class to literal
+        assertEscapeExpansion("[\t]\"\\n\"", "[\\t]\"\\n\"");
+        // Multiple state changes in one regex
+        assertEscapeExpansion("\\\\t\"\\n\"[\n]\\\\t", "\\\\t\"\\n\"[\\n]\\\\t");
+    }
+
+    @Test
+    void testEscapeExpansionErrorRecovery() { // a malformed escape must not stop later escapes from expanding
+        // Malformed but should continue processing
+        assertEscapeExpansion("\\uabcq\t", "\\uabcq\\t");
+        // Mixed valid and invalid
+        assertEscapeExpansion("\\\u0009\\uabcq\n", "\\u0009\\uabcq\\n");
+    }
+
+    @Test
+    void testBasicEscapeSequenceExpansion() {
+        // escape with nothing after should passthru
+        assertNoEscapeExpansion("\\");
+        // no defined expansion
+        assertNoEscapeExpansion("\\q\\.");
+        // simple expansion (the backslash is dropped)
+        assertEscapeExpansion("\t\r\n", "\\t\\r\\n");
+        // unicode escapes should expand (the backslash is kept in front of the expanded char)
+        assertEscapeExpansion("\\\u000c", "\\u000c");
+        // unicode escapes with non-hex value should passthru
+        assertNoEscapeExpansion("\\uabcq");
+        // short unicode escape should passthru
+        assertNoEscapeExpansion("\\u00");
+    }
+
+
     @Test
     void testMultipleAnchors() {
         assertAnchorReplacement("\uFDD0abc\uFDD1|\uFDD0def\uFDD1", "^abc$|^def$");
@@ -223,6 +343,48 @@ void testEdgeCases() {
         assertAnchorReplacement("\uFDD0\uFDD1", "^$");
     }
 
+    @Test
+    void testUnicodeSurrogatePairs() { // each half of a surrogate pair is expanded independently
+        // unicode escapes for characters beyond BMP
+        // (joined with + because otherwise the compiler complains of illegal escape character)
+        assertEscapeExpansion("\\\uD835" + "\\\uDC00", "\\uD835\\uDC00"); // Mathematical bold A
+        assertEscapeExpansion("\\\uD83D" + "\\\uDE00", "\\uD83D\\uDE00"); // Grinning face emoji
+        assertEscapeExpansion("\\\uD835" + "\\\uDFCF" + "\\\uD835" + "\\\uDFD0", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
+        // incomplete surrogate pairs (they expand! maybe not ideal). In testing the
+        // downstream regex engine will not match half a surrogate pair.
+        assertEscapeExpansion("\\\uD83D", "\\uD83D"); // High surrogate without low
+        assertEscapeExpansion("\\\uDE00", "\\uDE00"); // Low surrogate without high
+        // invalid surrogate sequences (also expands! also won't match anything).
+        assertEscapeExpansion("\\\uD83D\n", "\\uD83D\\n"); // High surrogate + regular escape
+        // surrogate pairs in character classes (see also RegexEquivalenceTest.testUnicode)
+        assertEscapeExpansion("[\\\uD83D" + "\\\uDE00]", "[\\uD83D\\uDE00]");
+        // surrogate pairs in quoted literals (should not expand)
+        assertNoEscapeExpansion("\"\\uD83D\\uDE00\"");
+    }
+
+    @Test
+    void testMixedEscapeTypes() {
+        // Same character via both forms: the unicode form keeps its backslash, the letter form does not
+        assertEscapeExpansion("\\\n\n", "\\u000A\\n");
+        assertEscapeExpansion("\\\t\t", "\\u0009\\t");
+        assertEscapeExpansion("\\\r\r", "\\u000D\\r");
+
+        // Mix valid and invalid unicode escapes
+        assertEscapeExpansion("\\u00\n", "\\u00\\n");
+        assertEscapeExpansion("\n\\uabcg", "\\n\\uabcg");
+
+        // Escaped backslash before unicode
+        assertNoEscapeExpansion("\\\\u000A");
+        assertEscapeExpansion("\\\\\n", "\\\\\\n");
+
+        // Multiple mixed escapes in sequence
+        assertEscapeExpansion("\t\\u00g0\r\n", "\\t\\u00g0\\r\\n");
+
+        // Mixed escapes in character classes
+        assertEscapeExpansion("[\\\t\n]", "[\\u0009\\n]");
+        assertEscapeExpansion("[\t\\u00g0]", "[\\t\\u00g0]");
+    }
+
     @Test
     void testComplexCharacterClassRanges() {
         // Invalid ranges with character class shortcuts
@@ -269,6 +431,11 @@ void testPathologicalBackslashes() {
         assertAnchorReplacement("\\\\\uFDD0", "\\\\^");  // Two backslashes + anchor
         assertNoAnchorReplacement("\\\\\\^");  // Three backslashes + escaped anchor
 
+        // Pathological backslashes with unicode
+        assertNoEscapeExpansion("\\\\u000A");      // 2 backslashes + unicode (no expansion)
+        assertEscapeExpansion("\\\\\\\n", "\\\\\\u000A");  // 3 backslashes + unicode (expands)
+        assertNoEscapeExpansion("\\\\\\\\u000A");  // 4 backslashes + unicode (no expansion)
+
         // Long sequences
         String manyBackslashes = "\\\\\\\\\\\\\\\\\\\\"; // 10 backslashes
         assertNoCharClassReplacement(manyBackslashes + "d");
@@ -277,5 +444,6 @@ void testPathologicalBackslashes() {
         // Backslashes at end of constructs
         assertNoCharClassReplacement("[abc\\\\]");
         assertNoAnchorReplacement("test\\\\");
+        assertNoEscapeExpansion("pattern\\\\");
     }
 }