Skip to content

Commit b82feca

Browse files
committed
regex: Support quoted literals
Lucene regexps include a quoted literal syntax, but the regex rewriter was not taking these into account. Add test cases asserting appropriate expansions, and implement the necessary changes to pass them. This only tests the expansions directly, and not the regex equivalence, because the Java Pattern we use to assert equivalence does not support the same quoted literals. Bug: T399162 Change-Id: Ibce4461cfa489b1718bf9e7914f925ee29b204e1
1 parent e03e3fb commit b82feca

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,11 @@ public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
4848
* By replacing the anchors in the regex and adding the anchor markers both in the rechecker and at
4949
* index time via a pattern_replace char_filter, we can offer full support for start and end anchors.
5050
*/
51-
@SuppressWarnings("CyclomaticComplexity")
51+
@SuppressWarnings({"CyclomaticComplexity", "NPathComplexity"})
5252
static CharSequence replaceAnchors(CharSequence input) {
5353
StringBuilder result = new StringBuilder();
5454
boolean inCharClass = false;
55+
boolean inLiteral = false;
5556
int backslashCount = 0;
5657

5758
for (int i = 0; i < input.length(); i++) {
@@ -65,15 +66,23 @@ static CharSequence replaceAnchors(CharSequence input) {
6566
backslashCount = 0;
6667
}
6768

68-
if (!escaped) {
69+
if (!inLiteral && !inCharClass && !escaped && c == '"') {
70+
inLiteral = true;
71+
} else if (inLiteral && c == '"') {
72+
inLiteral = false;
73+
}
74+
75+
if (!inLiteral && !escaped) {
6976
if (c == '[') {
7077
inCharClass = true;
7178
} else if (c == ']' && inCharClass) {
7279
inCharClass = false;
7380
}
7481
}
7582

76-
if (!inCharClass && !escaped && c == '^') {
83+
if (inLiteral) {
84+
result.append(c);
85+
} else if (!inCharClass && !escaped && c == '^') {
7786
result.append(START_ANCHOR_MARKER);
7887
} else if (!inCharClass && !escaped && c == '$') {
7988
result.append(END_ANCHOR_MARKER);
@@ -121,12 +130,13 @@ private static String expandCharClass(CharSequence charClass) {
121130
return result.append(']').toString();
122131
}
123132

124-
@SuppressWarnings("CyclomaticComplexity")
133+
@SuppressWarnings({"CyclomaticComplexity", "NPathComplexity"})
125134
@SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
126135
static CharSequence replaceCharClasses(CharSequence input) {
127136
StringBuilder result = new StringBuilder();
128137
int charClassStart = -1;
129138
boolean inCharClass = false;
139+
boolean inLiteral = false;
130140
int backslashCount = 0;
131141

132142
for (int i = 0; i < input.length(); i++) {
@@ -140,7 +150,15 @@ static CharSequence replaceCharClasses(CharSequence input) {
140150
backslashCount = 0;
141151
}
142152

143-
if (!inCharClass && !escaped && c == '[') {
153+
if (!inLiteral && !inCharClass && !escaped && c == '"') {
154+
inLiteral = true;
155+
} else if (inLiteral && c == '"') {
156+
inLiteral = false;
157+
}
158+
159+
if (inLiteral) {
160+
result.append(c);
161+
} else if (!inCharClass && !escaped && c == '[') {
144162
inCharClass = true;
145163
charClassStart = i + 1;
146164
} else if (inCharClass && !escaped && c == ']') {

lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexRewriteTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,55 @@ void testPassthruUppercase() {
9999
assertCharClassReplacement("[^\uFDD0\uFDD1\\W]", "[^\\W]");
100100
}
101101

102+
@Test
103+
void testQuotedLiterals() {
104+
// empty quoted literal passes through
105+
assertNoCharClassReplacement("\"\"");
106+
// Quotes define literals that should not be expanded
107+
assertNoCharClassReplacement("\".\"");
108+
assertCharClassReplacement("[^\uFDD0\uFDD1]\".\"[^\uFDD0\uFDD1]", ".\".\".");
109+
// source query is invalid with unpaired quotes. We passthru to let next stage fail
110+
assertNoCharClassReplacement("\"unclosed");
111+
// Quotes inside a char class do not start a literal
112+
assertCharClassReplacement("[^\uFDD0\uFDD1\"]", "[^\"]");
113+
// escaped quotes do not start a literal
114+
assertCharClassReplacement("\\\"[^\uFDD0\uFDD1]", "\\\".");
115+
// expands shorthands on each edge of the literal
116+
assertCharClassReplacement("[0-9]\"abc\"", "\\d\"abc\"");
117+
assertCharClassReplacement("\"abc\"[0-9]", "\"abc\"\\d");
118+
// no shorthand expansion inside the literal
119+
assertNoCharClassReplacement("\"\\s\\d\\w\"");
120+
// unquoted anchors are replaced when quotes are present
121+
assertAnchorReplacement("\uFDD0\"^$\"\uFDD1", "^\"^$\"$");
122+
// quoted anchors are literals, not to be expanded
123+
assertNoAnchorReplacement("\"^$\"");
124+
// anchors also passthru unclosed quotes to next layer
125+
assertAnchorReplacement("\uFDD0\"^$", "^\"^$");
126+
// Escaped backslash before quote (should start literal)
127+
assertNoCharClassReplacement("\\\\\"literal\"");
128+
// Double-escaped quote (should expand the dot)
129+
assertCharClassReplacement("\\\\\\\"[^\uFDD0\uFDD1]", "\\\\\\\".");
130+
// Quote escaped with multiple backslashes
131+
assertCharClassReplacement("\\\\\\\\\\\"[^\uFDD0\uFDD1]", "\\\\\\\\\\\".");
132+
// Quote at different positions in char class
133+
assertNoCharClassReplacement("[\".]");
134+
assertCharClassReplacement("[^\uFDD0\uFDD1.\"]", "[^.\"]");
135+
assertNoCharClassReplacement("[a\".z]");
136+
// Multiple quotes in char class
137+
assertNoCharClassReplacement("[\"\".]");
138+
// Quote at end of string
139+
assertNoCharClassReplacement("pattern\"");
140+
assertNoAnchorReplacement("pattern\"");
141+
// Only quote character
142+
assertNoCharClassReplacement("\"");
143+
assertNoAnchorReplacement("\"");
144+
// Quote after escape at end
145+
assertNoCharClassReplacement("pattern\\\"");
146+
// quote inside the literal can't be escaped, the escape is literal
147+
assertCharClassReplacement("\"abc\\\"[0-9]", "\"abc\\\"\\d");
148+
assertAnchorReplacement("\"abc\\\"\uFDD1", "\"abc\\\"$");
149+
}
150+
102151
@Test
103152
void testUnknownEscapeSequences() {
104153
assertCharClassReplacement("[0-9]\\q", "\\d\\q");

0 commit comments

Comments
 (0)