Skip to content

Commit f244ef6

Browse files
committed
regex: Support escape code expansion
Adds a step to the regex rewriting that supports \r, \n, \t and \uNNNN escape sequences. These are commonly available in other regex engines and it will be convenient for users to have direct access to them rather than current workarounds that involve constructing character classes from nearby printable characters. Bug: T403212 Change-Id: Idbef20454b6d02130bf5f596dffffdeb1e0f7382
1 parent c9d1834 commit f244ef6

File tree

3 files changed

+252
-4
lines changed

3 files changed

+252
-4
lines changed

lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,20 @@ public final class RegexRewriter {
1111
public static final char END_ANCHOR_MARKER = '\uFDD1';
1212

1313
private static final Map<Character, String> CHAR_CLASSES;
14+
private static final Map<Character, Character> ESCAPE_CODES;
1415

1516
static {
1617
Map<Character, String> charClasses = new HashMap<>();
1718
charClasses.put('d', "0-9");
1819
charClasses.put('w', "A-Za-z0-9_");
1920
charClasses.put('s', "\f\n\r\t\u0011\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff");
2021
CHAR_CLASSES = Collections.unmodifiableMap(charClasses);
22+
23+
Map<Character, Character> escapeCodes = new HashMap<>();
24+
escapeCodes.put('r', '\r');
25+
escapeCodes.put('n', '\n');
26+
escapeCodes.put('t', '\t');
27+
ESCAPE_CODES = Collections.unmodifiableMap(escapeCodes);
2128
}
2229

2330
private RegexRewriter() {
@@ -34,13 +41,17 @@ public static String anchorTransformation(String input) {
3441
* Rewrites the provided regex to support character classes and optionally anchors.
3542
* If anchor support is enabled then RegexRewriter.anchorTransformation must be applied
3643
* to strings to be checked.
44+
* It's perhaps inefficient that this rewrites the regex multiple times, but the
45+
* implementation is easier to reason about as separate transformations.
3746
*/
3847
public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
3948
CharSequence result = replaceCharClasses(regex);
4049
if (replaceAnchors) {
4150
result = replaceAnchors(result);
4251
}
43-
return result;
52+
// expandEscapeCodes must transform last so unicode expansions stay
53+
// literals and can't be interpreted as char classes.
54+
return expandEscapeCodes(result);
4455
}
4556

4657
/**
@@ -70,6 +81,43 @@ static CharSequence replaceAnchors(CharSequence input) {
7081
return result.toString();
7182
}
7283

84+
private static boolean isValidHexSubSequence(CharSequence input, int i, int len) {
85+
if (i + len > input.length()) {
86+
return false;
87+
}
88+
CharSequence maybeHex = input.subSequence(i, i + len);
89+
return maybeHex.chars().allMatch((hexChar) -> Character.digit(hexChar, 16) >= 0);
90+
}
91+
92+
@SuppressWarnings({"ModifiedControlVariable"})
93+
@SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
94+
static CharSequence expandEscapeCodes(CharSequence input) {
95+
StringBuilder result = new StringBuilder();
96+
RegexParseState parse = new RegexParseState(input);
97+
98+
for (int i = 0; i < input.length(); i++) {
99+
char c = parse.next(i);
100+
101+
if (parse.inLiteral) {
102+
result.append(c);
103+
} else if (parse.escaped && ESCAPE_CODES.containsKey(c)) {
104+
result.setLength(result.length() - 1);
105+
result.append(ESCAPE_CODES.get(c));
106+
} else if (parse.escaped && c == 'u' && isValidHexSubSequence(input, i + 1, 4)) {
107+
String hex = input.subSequence(i + 1, i + 5).toString();
108+
int cp = Integer.parseInt(hex, 16);
109+
result.setLength(result.length() - 1);
110+
// prepending \ treats it as a literal value.
111+
result.append('\\');
112+
result.append((char) cp);
113+
i += 4;
114+
} else {
115+
result.append(c);
116+
}
117+
}
118+
return result.toString();
119+
}
120+
73121
private static String expandCharClass(CharSequence charClass) {
74122
if (charClass.length() == 0) {
75123
return "[]";

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ void testPatternEquivalence() {
5757
sources.put("numbers", "12345");
5858
sources.put("edgecase1", "Start^Middle$End");
5959
sources.put("edgecase2", "^foo bar$");
60+
sources.put("newline", "\n");
61+
sources.put("multiline", "qwe\n\nrty");
6062

6163
// Basic start anchor
6264
assertPatternMatch(sources, "^abc", "findme");
@@ -94,13 +96,43 @@ void testPatternEquivalence() {
9496
// \d matches numbers
9597
assertPatternMatch(sources, "\\d", "numbers");
9698
// [^\d] matches not-numbers
97-
assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2");
99+
assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2", "newline", "multiline");
98100
// \s matches spaces
99-
assertPatternMatch(sources, "\\s", "edgecase2");
101+
assertPatternMatch(sources, "\\s", "edgecase2", "newline", "multiline");
100102
// [^\s] matches not-spaces
101103
assertPatternMatch(sources, "^[^\\s]+$", "findme", "numbers", "edgecase1");
102104
// \w matches word-like things, it does not match spaces or special chars
103105
assertPatternMatch(sources, "^\\w+$", "findme", "numbers");
104-
assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2");
106+
assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2", "newline", "multiline");
107+
// escape code expansion
108+
assertPatternMatch(sources, "\\n", "newline", "multiline");
109+
assertPatternMatch(sources, "\\u000a", "newline", "multiline");
110+
// multiline match
111+
assertPatternMatch(sources, "qwe[\\r\\n]+rty", "multiline");
112+
// expansion of regex syntax, \u002e is '.' but must not be treated as the any-match
113+
assertNoPatternMatch(sources, "abcde\\u002e");
114+
// expansion of the expected char should match
115+
assertPatternMatch(sources, "abcde\\u0066", "findme");
116+
// same but inside a character class
117+
assertNoPatternMatch(sources, "abcde[\\u002e]");
118+
assertPatternMatch(sources, "abcde[\\u0066]", "findme");
119+
// Expanded \\u can't be interpreted as a char class to expand (\\u0064 == 'd')
120+
assertNoPatternMatch(sources, "\\\\u0064");
121+
}
122+
123+
@Test
124+
void testUnicode() {
125+
Map<String, String> sources = new HashMap<>();
126+
sources.put("emoji", "😀");
127+
sources.put("water", "水");
128+
129+
// Can find emoji
130+
assertPatternMatch(sources, "\\uD83D\\uDE00", "emoji");
131+
// Can find 3-byte character
132+
assertPatternMatch(sources, "\\u6c34", "water");
133+
// Can not match on partial surrogate pairs (only equivalent on java 15+, prior to that
134+
// the java Pattern class could match half pairs.)
135+
// assertNoPatternMatch(sources, "\\uD83D");
136+
// assertNoPatternMatch(sources, "\\uDE00");
105137
}
106138
}

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,126 @@ void testBackslashEscapeCounts() {
210210
assertNoAnchorReplacement("foo\\\\\\^bar");
211211
}
212212

213+
private void assertEscapeExpansion(String expected, String regex) {
214+
assertThat(RegexRewriter.expandEscapeCodes(regex)).isEqualTo(expected);
215+
}
216+
217+
private void assertNoEscapeExpansion(String regex) {
218+
assertThat(RegexRewriter.expandEscapeCodes(regex)).isEqualTo(regex);
219+
}
220+
221+
@Test
222+
void testEscapeExpansionLiteralString() {
223+
// empty literal passthru
224+
assertNoEscapeExpansion("\"\"");
225+
// escape in a quoted literal should not expand
226+
assertNoEscapeExpansion("\"\\t\"");
227+
// multiple quoted literals should not expand
228+
assertNoEscapeExpansion("\"\\t\" \"\\t\"");
229+
// unclosed literal should not expand
230+
assertNoEscapeExpansion("\"\\t");
231+
// should expand before a literal
232+
assertEscapeExpansion("\n\"\\n\"", "\\n\"\\n\"");
233+
// should expand after a literal
234+
assertEscapeExpansion("\"\\n\"\n", "\"\\n\"\\n");
235+
// should expand between quoted literals
236+
assertEscapeExpansion("\"\\t\"\n\"\\t\"", "\"\\t\"\\n\"\\t\"");
237+
}
238+
239+
@Test
240+
void testEscapeExpansionCharClasses() {
241+
// unclosed character class
242+
assertEscapeExpansion("[\t", "[\\t");
243+
// empty character class
244+
assertNoEscapeExpansion("[]");
245+
// escape in a character class should expand
246+
assertEscapeExpansion("[\r\n]", "[\\r\\n]");
247+
// nested char class (invalid but should handle)
248+
assertEscapeExpansion("[\t[\n]]", "[\\t[\\n]]");
249+
}
250+
251+
@Test
252+
void testEscapeExpansionUnicodeEdgeCases() {
253+
// Unicode at end of string (incomplete)
254+
assertNoEscapeExpansion("\\u123");
255+
assertNoEscapeExpansion("\\u12");
256+
assertNoEscapeExpansion("\\u1");
257+
assertNoEscapeExpansion("\\u");
258+
259+
// Unicode with uppercase hex
260+
assertEscapeExpansion("\\\u000C", "\\u000C");
261+
assertEscapeExpansion("\\\u00FF", "\\u00FF");
262+
263+
// Unicode with mixed case
264+
assertEscapeExpansion("\\\u00Af", "\\u00Af");
265+
266+
// Multiple unicode escapes
267+
assertEscapeExpansion("\\\u0009\\\n", "\\u0009\\u000A");
268+
269+
// Unicode zero
270+
assertEscapeExpansion("\\\u0000", "\\u0000");
271+
272+
// Unicode with non-hex after valid 4 chars
273+
assertEscapeExpansion("\\\u0009g", "\\u0009g");
274+
275+
// Invalid hex characters
276+
assertNoEscapeExpansion("\\u000g");
277+
assertNoEscapeExpansion("\\u00g0");
278+
assertNoEscapeExpansion("\\ug000");
279+
assertNoEscapeExpansion("\\u000G"); // uppercase G invalid
280+
281+
// invalid followed by valid
282+
assertEscapeExpansion("\\u00\\\u000e", "\\u00\\u000e");
283+
}
284+
285+
@Test
286+
void testEscapeExpansionBackslashCount() {
287+
// only backslashes
288+
assertNoEscapeExpansion("\\");
289+
assertNoEscapeExpansion("\\\\");
290+
// single backslash
291+
assertEscapeExpansion("\n", "\\n");
292+
// double backslash
293+
assertNoEscapeExpansion("\\\\n");
294+
// triple backslash
295+
assertEscapeExpansion("\\\\\n", "\\\\\\n");
296+
}
297+
298+
@Test
299+
void testEscapeExpansionStateTransitions() {
300+
// Transition from literal to character class
301+
assertEscapeExpansion("\"abc\"[\t]", "\"abc\"[\\t]");
302+
// Transition from character class to literal
303+
assertEscapeExpansion("[\t]\"\\n\"", "[\\t]\"\\n\"");
304+
// Multiple state changes in one regex
305+
assertEscapeExpansion("\\\\t\"\\n\"[\n]\\\\t", "\\\\t\"\\n\"[\\n]\\\\t");
306+
}
307+
308+
@Test
309+
void testEscapeExpansionErrorRecovery() {
310+
// Malformed but should continue processing
311+
assertEscapeExpansion("\\uabcq\t", "\\uabcq\\t");
312+
// Mixed valid and invalid
313+
assertEscapeExpansion("\\\u0009\\uabcq\n", "\\u0009\\uabcq\\n");
314+
}
315+
316+
@Test
317+
void testBasicEscapeSequenceExpansion() {
318+
// escape with nothing after should passthru
319+
assertNoEscapeExpansion("\\");
320+
// no defined expansion
321+
assertNoEscapeExpansion("\\q\\.");
322+
// simple expansion
323+
assertEscapeExpansion("\t\r\n", "\\t\\r\\n");
324+
// unicode escapes should expand
325+
assertEscapeExpansion("\\\u000c", "\\u000c");
326+
// unicode escapes with non-hex value should passthru
327+
assertNoEscapeExpansion("\\uabcq");
328+
// short unicode escape should passthru
329+
assertNoEscapeExpansion("\\u00");
330+
}
331+
332+
213333
@Test
214334
void testMultipleAnchors() {
215335
assertAnchorReplacement("\uFDD0abc\uFDD1|\uFDD0def\uFDD1", "^abc$|^def$");
@@ -223,6 +343,48 @@ void testEdgeCases() {
223343
assertAnchorReplacement("\uFDD0\uFDD1", "^$");
224344
}
225345

346+
@Test
347+
void testUnicodeSurrogatePairs() {
348+
// unicode escapes for characters beyond BMP
349+
// (joined with + because otherwise the compiler complains of illegal escape character)
350+
assertEscapeExpansion("\\\uD835" + "\\\uDC00", "\\uD835\\uDC00"); // Mathematical bold A
351+
assertEscapeExpansion("\\\uD83D" + "\\\uDE00", "\\uD83D\\uDE00"); // Grinning face emoji
352+
assertEscapeExpansion("\\\uD835" + "\\\uDFCF" + "\\\uD835" + "\\\uDFD0", "\\uD835\\uDFCF\\uD835\\uDFD0"); // Mathematical bold digits
353+
// incomplete surrogate pairs (they expand! maybe not ideal). In testing the
354+
// downstream regex engine will not match half a surrogate pair.
355+
assertEscapeExpansion("\\\uD83D", "\\uD83D"); // High surrogate without low
356+
assertEscapeExpansion("\\\uDE00", "\\uDE00"); // Low surrogate without high
357+
// invalid surrogate sequences (also expands! also won't match anything).
358+
assertEscapeExpansion("\\\uuD83D\n", "\\uD83D\\n"); // High surrogate + regular escape
359+
// surrogate pairs in character classes (see also RegexEquivalenceTest.testUnicode)
360+
assertEscapeExpansion("[\\\uD83D" + "\\\uDE00]", "[\\uD83D\\uDE00]");
361+
// surrogate pairs in quoted literals (should not expand)
362+
assertNoEscapeExpansion("\"\\uD83D\\uDE00\"");
363+
}
364+
365+
@Test
366+
void testMixedEscapeTypes() {
367+
// Mix unicode and regular escapes for same character
368+
assertEscapeExpansion("\\\n\n", "\\u000A\\n");
369+
assertEscapeExpansion("\\\t\t", "\\u0009\\t");
370+
assertEscapeExpansion("\\\r\r", "\\u000D\\r");
371+
372+
// Mix valid and invalid unicode escapes
373+
assertEscapeExpansion("\\u00\n", "\\u00\\n");
374+
assertEscapeExpansion("\n\\uabcg", "\\n\\uabcg");
375+
376+
// Escaped backslash before unicode
377+
assertNoEscapeExpansion("\\\\u000A");
378+
assertEscapeExpansion("\\\\\n", "\\\\\\n");
379+
380+
// Multiple mixed escapes in sequence
381+
assertEscapeExpansion("\t\\u00g0\r\n", "\\t\\u00g0\\r\\n");
382+
383+
// Mixed escapes in character classes
384+
assertEscapeExpansion("[\\\t\n]", "[\\u0009\\n]");
385+
assertEscapeExpansion("[\t\\u00g0]", "[\\t\\u00g0]");
386+
}
387+
226388
@Test
227389
void testComplexCharacterClassRanges() {
228390
// Invalid ranges with character class shortcuts
@@ -269,6 +431,11 @@ void testPathologicalBackslashes() {
269431
assertAnchorReplacement("\\\\\uFDD0", "\\\\^"); // Two backslashes + anchor
270432
assertNoAnchorReplacement("\\\\\\^"); // Three backslashes + escaped anchor
271433

434+
// Pathological backslashes with unicode
435+
assertNoEscapeExpansion("\\\\u000A"); // 2 backslashes + unicode (no expansion)
436+
assertEscapeExpansion("\\\\\\\n", "\\\\\\u000A"); // 3 backslashes + unicode (expands)
437+
assertNoEscapeExpansion("\\\\\\\\u000A"); // 4 backslashes + unicode (no expansion)
438+
272439
// Long sequences
273440
String manyBackslashes = "\\\\\\\\\\\\\\\\\\\\"; // 10 backslashes
274441
assertNoCharClassReplacement(manyBackslashes + "d");
@@ -277,5 +444,6 @@ void testPathologicalBackslashes() {
277444
// Backslashes at end of constructs
278445
assertNoCharClassReplacement("[abc\\\\]");
279446
assertNoAnchorReplacement("test\\\\");
447+
assertNoEscapeExpansion("pattern\\\\");
280448
}
281449
}

0 commit comments

Comments
 (0)