Skip to content

Commit c9d1834

Browse files
committed
regex: Separate out regex iteration
We already do this same essential algorithm twice, and a followup patch will add a third occurance. Pull the iteration out into a bit of shared implementation. While here add a couple more test cases around bits that the state tracks. Change-Id: I91490958f2e26062605afc6fdee055e1674a1ab8
1 parent 479aacc commit c9d1834

File tree

2 files changed

+111
-56
lines changed

2 files changed

+111
-56
lines changed

lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java

Lines changed: 55 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -51,40 +51,16 @@ public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
5151
@SuppressWarnings({"CyclomaticComplexity", "NPathComplexity"})
5252
static CharSequence replaceAnchors(CharSequence input) {
5353
StringBuilder result = new StringBuilder();
54-
boolean inCharClass = false;
55-
boolean inLiteral = false;
56-
int backslashCount = 0;
54+
RegexParseState parse = new RegexParseState(input);
5755

5856
for (int i = 0; i < input.length(); i++) {
59-
char c = input.charAt(i);
60-
61-
// Count the number of backslashes preceding this character
62-
boolean escaped = (backslashCount % 2) != 0;
63-
if (c == '\\') {
64-
backslashCount += 1;
65-
} else {
66-
backslashCount = 0;
67-
}
57+
char c = parse.next(i);
6858

69-
if (!inLiteral && !inCharClass && !escaped && c == '"') {
70-
inLiteral = true;
71-
} else if (inLiteral && c == '"') {
72-
inLiteral = false;
73-
}
74-
75-
if (!inLiteral && !escaped) {
76-
if (c == '[') {
77-
inCharClass = true;
78-
} else if (c == ']' && inCharClass) {
79-
inCharClass = false;
80-
}
81-
}
82-
83-
if (inLiteral) {
59+
if (parse.inLiteral) {
8460
result.append(c);
85-
} else if (!inCharClass && !escaped && c == '^') {
61+
} else if (!parse.inCharClass && !parse.escaped && c == '^') {
8662
result.append(START_ANCHOR_MARKER);
87-
} else if (!inCharClass && !escaped && c == '$') {
63+
} else if (!parse.inCharClass && !parse.escaped && c == '$') {
8864
result.append(END_ANCHOR_MARKER);
8965
} else {
9066
result.append(c);
@@ -134,54 +110,77 @@ private static String expandCharClass(CharSequence charClass) {
134110
@SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
135111
static CharSequence replaceCharClasses(CharSequence input) {
136112
StringBuilder result = new StringBuilder();
113+
RegexParseState parse = new RegexParseState(input);
137114
int charClassStart = -1;
138-
boolean inCharClass = false;
139-
boolean inLiteral = false;
140-
int backslashCount = 0;
141115

142116
for (int i = 0; i < input.length(); i++) {
143-
char c = input.charAt(i);
117+
char c = parse.next(i);
144118

145-
// Count the number of backslashes preceding this character
146-
boolean escaped = (backslashCount % 2) != 0;
147-
if (c == '\\') {
148-
backslashCount += 1;
149-
} else {
150-
backslashCount = 0;
151-
}
152-
153-
if (!inLiteral && !inCharClass && !escaped && c == '"') {
154-
inLiteral = true;
155-
} else if (inLiteral && c == '"') {
156-
inLiteral = false;
157-
}
158-
159-
if (inLiteral) {
119+
if (parse.inLiteral) {
160120
result.append(c);
161-
} else if (!inCharClass && !escaped && c == '[') {
162-
inCharClass = true;
121+
} else if (parse.inCharClass && charClassStart == -1) {
163122
charClassStart = i + 1;
164-
} else if (inCharClass && !escaped && c == ']') {
123+
} else if (!parse.inCharClass && charClassStart != -1) {
165124
// While expansion could be mixed into this function, it does a similar walk, it
166125
// seemed less confusing to separate into a dedicated routine.
167126
result.append(expandCharClass(input.subSequence(charClassStart, i)));
168-
inCharClass = false;
169127
charClassStart = -1;
170-
} else if (!inCharClass && !escaped && c == '.') {
128+
} else if (!parse.inCharClass && !parse.escaped && c == '.') {
171129
// . must not match the anchors
172130
result.append("[^").append(START_ANCHOR_MARKER).append(END_ANCHOR_MARKER).append(']');
173-
} else if (!inCharClass && escaped && CHAR_CLASSES.containsKey(c)) {
131+
} else if (!parse.inCharClass && parse.escaped && CHAR_CLASSES.containsKey(c)) {
174132
result.setLength(result.length() - 1);
175133
result.append('[').append(CHAR_CLASSES.get(c)).append(']');
176-
} else if (!inCharClass) {
134+
} else if (!parse.inCharClass) {
177135
result.append(c);
178136
}
179137
}
180-
if (inCharClass) {
138+
if (parse.inCharClass) {
181139
// unclosed char class
182140
result.append('[').append(input.subSequence(charClassStart, input.length()));
183141
}
184142

185143
return result.toString();
186144
}
145+
146+
private static class RegexParseState {
147+
final CharSequence input;
148+
boolean escaped;
149+
boolean inCharClass;
150+
boolean inLiteral;
151+
int backslashCount;
152+
153+
RegexParseState(CharSequence input) {
154+
this.input = input;
155+
}
156+
157+
@SuppressWarnings({"CyclomaticComplexity"})
158+
char next(int i) {
159+
char c = input.charAt(i);
160+
161+
// Count the number of backslashes preceding this character
162+
escaped = (backslashCount % 2) != 0;
163+
if (c == '\\') {
164+
backslashCount += 1;
165+
} else {
166+
backslashCount = 0;
167+
}
168+
169+
if (!inLiteral && !inCharClass && !escaped && c == '"') {
170+
inLiteral = true;
171+
} else if (inLiteral && c == '"') {
172+
inLiteral = false;
173+
}
174+
175+
if (!inLiteral && !escaped) {
176+
if (c == '[') {
177+
inCharClass = true;
178+
} else if (c == ']' && inCharClass) {
179+
inCharClass = false;
180+
}
181+
}
182+
183+
return c;
184+
}
185+
}
187186
}

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,4 +222,60 @@ void testEdgeCases() {
222222
assertNoAnchorReplacement("\\");
223223
assertAnchorReplacement("\uFDD0\uFDD1", "^$");
224224
}
225+
226+
@Test
227+
void testComplexCharacterClassRanges() {
228+
// Invalid ranges with character class shortcuts
229+
assertCharClassReplacement("[A-0-9]", "[A-\\d]"); // Already covered but worth grouping
230+
assertCharClassReplacement("[0-9-z]", "[\\d-z]");
231+
assertCharClassReplacement("[a-0-9-Z]", "[a-\\d-Z]");
232+
233+
// Shorthand at range boundaries (invalid but should handle gracefully)
234+
assertCharClassReplacement("[a-0-9z]", "[a-\\dz]");
235+
assertCharClassReplacement("[0-9-z]", "[\\d-z]");
236+
237+
// Multiple ranges with shortcuts
238+
assertCharClassReplacement("[a-z0-9A-Z]", "[a-z\\dA-Z]");
239+
assertCharClassReplacement("[0-9a-zA-ZA-Za-z0-9_]", "[\\da-zA-Z\\w]");
240+
241+
// Negated complex ranges
242+
assertCharClassReplacement("[^\uFDD0\uFDD1a-z0-9]", "[^a-z\\d]");
243+
assertCharClassReplacement("[^\uFDD0\uFDD1A-Za-z0-9_a-z]", "[^\\wa-z]");
244+
245+
// Edge case: shorthand immediately after range dash
246+
assertCharClassReplacement("[a-0-9]", "[a-\\d]");
247+
assertCharClassReplacement("[z-A-Za-z0-9_]", "[z-\\w]");
248+
249+
// Multiple shortcuts in ranges
250+
assertCharClassReplacement("[0-9A-Za-z0-9_\f\n\r\t\u0011\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]", "[\\d\\w\\s]");
251+
}
252+
253+
@Test
254+
void testPathologicalBackslashes() {
255+
// Many consecutive backslashes with shortcuts
256+
String twoBackslashes = "\\\\d";
257+
String fourBackslashes = "\\\\\\\\d";
258+
String sixBackslashes = "\\\\\\\\\\\\d";
259+
260+
// two backslashes
261+
assertNoCharClassReplacement("\\\\d");
262+
// four backslashes
263+
assertNoCharClassReplacement("\\\\\\\\d"); // \\\\d -> two literal backslashes + \d expansion
264+
// six backslashes
265+
assertNoCharClassReplacement("\\\\\\\\\\\\d"); // \\\\\\d -> three literal backslashes + d
266+
267+
// Pathological backslashes with anchors
268+
assertNoAnchorReplacement("\\\\\\^"); // Escaped backslash + escaped anchor
269+
assertAnchorReplacement("\\\\\uFDD0", "\\\\^"); // Two backslashes + anchor
270+
assertNoAnchorReplacement("\\\\\\^"); // Three backslashes + escaped anchor
271+
272+
// Long sequences
273+
String manyBackslashes = "\\\\\\\\\\\\\\\\\\\\"; // 10 backslashes
274+
assertNoCharClassReplacement(manyBackslashes + "d");
275+
assertCharClassReplacement(manyBackslashes + "[0-9]", manyBackslashes + "\\d");
276+
277+
// Backslashes at end of constructs
278+
assertNoCharClassReplacement("[abc\\\\]");
279+
assertNoAnchorReplacement("test\\\\");
280+
}
225281
}

0 commit comments

Comments
 (0)