@@ -210,6 +210,126 @@ void testBackslashEscapeCounts() {
210210 assertNoAnchorReplacement ("foo\\ \\ \\ ^bar" );
211211 }
212212
213+ private void assertEscapeExpansion (String expected , String regex ) {
214+ assertThat (RegexRewriter .expandEscapeCodes (regex )).isEqualTo (expected );
215+ }
216+
217+ private void assertNoEscapeExpansion (String regex ) {
218+ assertThat (RegexRewriter .expandEscapeCodes (regex )).isEqualTo (regex );
219+ }
220+
// Quoted-literal cases: escape codes between double quotes are asserted to come back
// unchanged, while escape codes outside the quotes are asserted to be expanded.
// NOTE(review): the spacing inside the string literals of this listing looks like a
// rendering artifact of the diff view - confirm against the checked-in file.
221+ @ Test
222+ void testEscapeExpansionLiteralString () {
223+ // an empty quoted literal passes through unchanged
224+ assertNoEscapeExpansion ("\" \" " );
225+ // an escape inside a quoted literal should not expand
226+ assertNoEscapeExpansion ("\" \\ t\" " );
227+ // escapes inside several consecutive quoted literals should not expand
228+ assertNoEscapeExpansion ("\" \\ t\" \" \\ t\" " );
229+ // an unclosed literal (opening quote only) should not expand either
230+ assertNoEscapeExpansion ("\" \\ t" );
231+ // an escape before a literal expands; the one inside the literal does not
232+ assertEscapeExpansion ("\n \" \\ n\" " , "\\ n\" \\ n\" " );
233+ // an escape after a literal expands; the one inside the literal does not
234+ assertEscapeExpansion ("\" \\ n\" \n " , "\" \\ n\" \\ n" );
235+ // an escape between two quoted literals expands; both literals stay as-is
236+ assertEscapeExpansion ("\" \\ t\" \n \" \\ t\" " , "\" \\ t\" \\ n\" \\ t\" " );
237+ }
238+
// Character-class cases: escapes appearing after '[' are asserted to be expanded,
// including when the class is unclosed or (invalidly) nested.
239+ @ Test
240+ void testEscapeExpansionCharClasses () {
241+ // unclosed character class: the escape after '[' is still expanded
242+ assertEscapeExpansion ("[\t " , "[\\ t" );
243+ // empty character class is returned unchanged
244+ assertNoEscapeExpansion ("[]" );
245+ // escapes in a character class should expand
246+ assertEscapeExpansion ("[\r \n ]" , "[\\ r\\ n]" );
247+ // nested char class (invalid but should handle): escapes at both levels expand
248+ assertEscapeExpansion ("[\t [\n ]]" , "[\\ t[\\ n]]" );
249+ }
250+
// Edge cases for backslash-u escapes. In the expanding cases the expected string, as
// written here, is an escaped backslash followed by the decoded character; inputs with
// fewer than four hex digits or with a non-hex digit are asserted to come back unchanged.
// NOTE(review): the spacing inside the string literals of this listing looks like a
// rendering artifact of the diff view - confirm against the checked-in file.
251+ @ Test
252+ void testEscapeExpansionUnicodeEdgeCases () {
253+ // Unicode at end of string (incomplete): 3, 2, 1 and 0 hex digits
254+ assertNoEscapeExpansion ("\\ u123" );
255+ assertNoEscapeExpansion ("\\ u12" );
256+ assertNoEscapeExpansion ("\\ u1" );
257+ assertNoEscapeExpansion ("\\ u" );
258+
259+ // Unicode with uppercase hex
260+ assertEscapeExpansion ("\\ \u000C " , "\\ u000C" );
261+ assertEscapeExpansion ("\\ \u00FF " , "\\ u00FF" );
262+
263+ // Unicode with mixed case
264+ assertEscapeExpansion ("\\ \u00Af " , "\\ u00Af" );
265+
266+ // Multiple unicode escapes (the expected value spells U+000A as a newline escape;
267+ // presumably because that code point cannot be written as a unicode escape inside
268+ // a Java string literal - TODO confirm)
267+ assertEscapeExpansion ("\\ \u0009 \\ \n " , "\\ u0009\\ u000A" );
268+
269+ // Unicode zero
270+ assertEscapeExpansion ("\\ \u0000 " , "\\ u0000" );
271+
272+ // Unicode with non-hex after valid 4 chars: the trailing 'g' is kept as-is
273+ assertEscapeExpansion ("\\ \u0009 g" , "\\ u0009g" );
274+
275+ // Invalid hex characters in the 4th, 3rd and 1st digit positions
276+ assertNoEscapeExpansion ("\\ u000g" );
277+ assertNoEscapeExpansion ("\\ u00g0" );
278+ assertNoEscapeExpansion ("\\ ug000" );
279+ assertNoEscapeExpansion ("\\ u000G" ); // uppercase G invalid
280+
281+ // invalid followed by valid: the short escape is kept, the complete one expands
282+ assertEscapeExpansion ("\\ u00\\ \u000e " , "\\ u00\\ u000e" );
283+ }
284+
// Backslash-count cases: whether the trailing "n" is expanded depends on how many
// backslashes precede it (one or three: expanded; two: left alone).
285+ @ Test
286+ void testEscapeExpansionBackslashCount () {
287+ // only backslashes, nothing following them: returned unchanged
288+ assertNoEscapeExpansion ("\\ " );
289+ assertNoEscapeExpansion ("\\ \\ " );
290+ // single backslash: the escape expands
291+ assertEscapeExpansion ("\n " , "\\ n" );
292+ // double backslash: returned unchanged
293+ assertNoEscapeExpansion ("\\ \\ n" );
294+ // triple backslash: the leading pair is kept and the remaining escape expands
295+ assertEscapeExpansion ("\\ \\ \n " , "\\ \\ \\ n" );
296+ }
297+
// Cases that combine quoted literals, character classes and escaped backslashes in a
// single input, to check the expected output when the constructs follow one another.
298+ @ Test
299+ void testEscapeExpansionStateTransitions () {
300+ // Transition from literal to character class: only the escape in the class expands
301+ assertEscapeExpansion ("\" abc\" [\t ]" , "\" abc\" [\\ t]" );
302+ // Transition from character class to literal: the escape in the literal is kept
303+ assertEscapeExpansion ("[\t ]\" \\ n\" " , "[\\ t]\" \\ n\" " );
304+ // Multiple state changes in one regex: only the escape inside the class expands
305+ assertEscapeExpansion ("\\ \\ t\" \\ n\" [\n ]\\ \\ t" , "\\ \\ t\" \\ n\" [\\ n]\\ \\ t" );
306+ }
307+
// Recovery cases: a unicode escape with a non-hex digit is expected to be left in
// place while the well-formed escapes around it are still expanded.
308+ @ Test
309+ void testEscapeExpansionErrorRecovery () {
310+ // Malformed but should continue processing: the escape after it still expands
311+ assertEscapeExpansion ("\\ uabcq\t " , "\\ uabcq\\ t" );
312+ // Mixed valid and invalid: first and last escapes expand, the middle one is kept
313+ assertEscapeExpansion ("\\ \u0009 \\ uabcq\n " , "\\ u0009\\ uabcq\\ n" );
314+ }
315+
// Baseline cases for escape expansion: simple single-letter escapes, a well-formed
// unicode escape, and inputs that are expected to be returned unchanged.
316+ @ Test
317+ void testBasicEscapeSequenceExpansion () {
318+ // escape with nothing after it should pass through unchanged
319+ assertNoEscapeExpansion ("\\ " );
320+ // escapes with no defined expansion are returned unchanged
321+ assertNoEscapeExpansion ("\\ q\\ ." );
322+ // simple expansion of the t, r and n escapes
323+ assertEscapeExpansion ("\t \r \n " , "\\ t\\ r\\ n" );
324+ // unicode escapes should expand
325+ assertEscapeExpansion ("\\ \u000c " , "\\ u000c" );
326+ // unicode escapes with a non-hex digit should pass through unchanged
327+ assertNoEscapeExpansion ("\\ uabcq" );
328+ // short unicode escape (fewer than four digits) should pass through unchanged
329+ assertNoEscapeExpansion ("\\ u00" );
330+ }
331+
332+
213333 @ Test
214334 void testMultipleAnchors () {
215335 assertAnchorReplacement ("\uFDD0 abc\uFDD1 |\uFDD0 def\uFDD1 " , "^abc$|^def$" );
@@ -223,6 +343,48 @@ void testEdgeCases() {
223343 assertAnchorReplacement ("\uFDD0 \uFDD1 " , "^$" );
224344 }
225345
// Surrogate-pair cases: each half of a pair is written as its own unicode escape in the
// input, and each is expected to expand independently - paired, unpaired, or inside a
// character class - but not inside a quoted literal.
346+ @ Test
347+ void testUnicodeSurrogatePairs () {
348+ // unicode escapes for characters beyond BMP
349+ // (joined with + because otherwise the compiler complains of illegal escape character)
350+ assertEscapeExpansion ("\\ \uD835 " + "\\ \uDC00 " , "\\ uD835\\ uDC00" ); // Mathematical bold A
351+ assertEscapeExpansion ("\\ \uD83D " + "\\ \uDE00 " , "\\ uD83D\\ uDE00" ); // Grinning face emoji
352+ assertEscapeExpansion ("\\ \uD835 " + "\\ \uDFCF " + "\\ \uD835 " + "\\ \uDFD0 " , "\\ uD835\\ uDFCF\\ uD835\\ uDFD0" ); // Mathematical bold digits
353+ // incomplete surrogate pairs (they expand! maybe not ideal). In testing the
354+ // downstream regex engine will not match half a surrogate pair.
355+ assertEscapeExpansion ("\\ \uD83D " , "\\ uD83D" ); // High surrogate without low
356+ assertEscapeExpansion ("\\ \uDE00 " , "\\ uDE00" ); // Low surrogate without high
357+ // invalid surrogate sequences (also expands! also won't match anything).
// NOTE(review): the expected literal on the next line spells its unicode escape with a
// doubled 'u'. The Java lexer accepts repeated 'u' markers, so the value should be the
// same code unit as the single-'u' spelling used elsewhere in this test; it looks like
// a typo - confirm against the checked-in file and consider normalizing.
358+ assertEscapeExpansion ("\\ \uuD83D \n " , "\\ uD83D\\ n" ); // High surrogate + regular escape
359+ // surrogate pairs in character classes (see also RegexEquivalenceTest.testUnicode)
360+ assertEscapeExpansion ("[\\ \uD83D " + "\\ \uDE00 ]" , "[\\ uD83D\\ uDE00]" );
361+ // surrogate pairs in quoted literals (should not expand)
362+ assertNoEscapeExpansion ("\" \\ uD83D\\ uDE00\" " );
363+ }
364+
// Cases mixing unicode escapes with single-letter escapes in one input. As written, the
// expected strings show a unicode escape expanding to an escaped backslash plus the
// character, and a single-letter escape expanding to the bare character.
365+ @ Test
366+ void testMixedEscapeTypes () {
367+ // Mix unicode and regular escapes for same character (newline, tab, carriage return)
368+ assertEscapeExpansion ("\\ \n \n " , "\\ u000A\\ n" );
369+ assertEscapeExpansion ("\\ \t \t " , "\\ u0009\\ t" );
370+ assertEscapeExpansion ("\\ \r \r " , "\\ u000D\\ r" );
371+
372+ // Mix valid and invalid unicode escapes: the malformed unicode escape is kept,
373+ // the regular escape next to it still expands
373+ assertEscapeExpansion ("\\ u00\n " , "\\ u00\\ n" );
374+ assertEscapeExpansion ("\n \\ uabcg" , "\\ n\\ uabcg" );
375+
376+ // Escaped backslash before unicode: two backslashes keep the input unchanged,
377+ // three backslashes keep the leading pair and expand the remaining escape
377+ assertNoEscapeExpansion ("\\ \\ u000A" );
378+ assertEscapeExpansion ("\\ \\ \n " , "\\ \\ \\ n" );
379+
380+ // Multiple mixed escapes in sequence: only the malformed unicode escape is kept
381+ assertEscapeExpansion ("\t \\ u00g0\r \n " , "\\ t\\ u00g0\\ r\\ n" );
382+
383+ // Mixed escapes in character classes
384+ assertEscapeExpansion ("[\\ \t \n ]" , "[\\ u0009\\ n]" );
385+ assertEscapeExpansion ("[\t \\ u00g0]" , "[\\ t\\ u00g0]" );
386+ }
387+
226388 @ Test
227389 void testComplexCharacterClassRanges () {
228390 // Invalid ranges with character class shortcuts
@@ -269,6 +431,11 @@ void testPathologicalBackslashes() {
269431 assertAnchorReplacement ("\\ \\ \uFDD0 " , "\\ \\ ^" ); // Two backslashes + anchor
270432 assertNoAnchorReplacement ("\\ \\ \\ ^" ); // Three backslashes + escaped anchor
271433
434+ // Pathological backslashes with unicode
435+ assertNoEscapeExpansion ("\\ \\ u000A" ); // 2 backslashes + unicode (no expansion)
436+ assertEscapeExpansion ("\\ \\ \\ \n " , "\\ \\ \\ u000A" ); // 3 backslashes + unicode (expands)
437+ assertNoEscapeExpansion ("\\ \\ \\ \\ u000A" ); // 4 backslashes + unicode (no expansion)
438+
272439 // Long sequences
273440 String manyBackslashes = "\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ " ; // 10 backslashes
274441 assertNoCharClassReplacement (manyBackslashes + "d" );
@@ -277,5 +444,6 @@ void testPathologicalBackslashes() {
277444 // Backslashes at end of constructs
278445 assertNoCharClassReplacement ("[abc\\ \\ ]" );
279446 assertNoAnchorReplacement ("test\\ \\ " );
447+ assertNoEscapeExpansion ("pattern\\ \\ " );
280448 }
281449}
0 commit comments