Skip to content

Commit 74dca85

Browse files
committed
Add lucene regex rewriting
The Lucene regex syntax lacks some basic features that users expect from regex engines. Some of these features can be added by rewriting the regex, along with transforming the input value.

Supported features:
* Minimal set of shorthand character classes: \d, \s, and \w
* Start/end anchors: ^ and $

The transformation code is being put in the utils module so it can be reused in both the opensearch-extra and the opensearch-cirrus-highlighter plugins.

Bug: T317599
Change-Id: Ie5e3ad991235fdd0b6efffc9c747b406e52e8e00
1 parent 68589a9 commit 74dca85

File tree

5 files changed

+497
-0
lines changed

5 files changed

+497
-0
lines changed

lucene-regex-rewriter/pom.xml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
<parent>
5+
<groupId>org.wikimedia.utils</groupId>
6+
<artifactId>wmf-jvm-utils-parent</artifactId>
7+
<version>1.0.1-SNAPSHOT</version>
8+
</parent>
9+
10+
<artifactId>lucene-regex-rewriter</artifactId>
11+
<packaging>jar</packaging>
12+
13+
<name>Transformations over lucene regular expressions to support additional syntax</name>
14+
15+
<dependencies>
16+
<dependency>
17+
<groupId>com.github.spotbugs</groupId>
18+
<artifactId>spotbugs-annotations</artifactId>
19+
<optional>true</optional>
20+
</dependency>
21+
<!-- Test deps -->
22+
<dependency>
23+
<groupId>org.apache.lucene</groupId>
24+
<artifactId>lucene-misc</artifactId>
25+
<scope>test</scope>
26+
</dependency>
27+
<dependency>
28+
<groupId>org.assertj</groupId>
29+
<artifactId>assertj-core</artifactId>
30+
<scope>test</scope>
31+
</dependency>
32+
<dependency>
33+
<groupId>org.junit.jupiter</groupId>
34+
<artifactId>junit-jupiter</artifactId>
35+
<scope>test</scope>
36+
</dependency>
37+
</dependencies>
38+
</project>
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package org.wikimedia.utils.regex;
2+
3+
import java.util.Collections;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
7+
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
8+
9+
/**
 * Rewrites regular expressions so that syntax the Lucene regex engine does not understand
 * (shorthand character classes and start/end anchors) is expressed with constructs it does.
 *
 * <p>Anchors are emulated with two marker characters: the regex has its anchors replaced by
 * the markers, and every string to be matched must be wrapped in the same markers using
 * {@link #anchorTransformation(String)}.
 */
public final class RegexRewriter {
    /** Marker standing in for the start anchor; the code point U+FDD0. */
    public static final char START_ANCHOR_MARKER = '\uFDD0';
    /** Marker standing in for the end anchor; the code point U+FDD1. */
    public static final char END_ANCHOR_MARKER = '\uFDD1';

    /**
     * Maps the letter of a shorthand class (the character following the backslash) to the
     * body of the equivalent bracket expression, without the surrounding brackets.
     */
    private static final Map<Character, String> CHAR_CLASSES;

    static {
        Map<Character, String> charClasses = new HashMap<>();
        charClasses.put('d', "0-9");
        charClasses.put('w', "A-Za-z0-9_");
        // Whitespace: form feed, line feed, carriage return, tab, vertical tab (U+000B),
        // space, no-break space and the remaining Unicode space separators.
        charClasses.put('s', "\f\n\r\t\u000b\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff");
        CHAR_CLASSES = Collections.unmodifiableMap(charClasses);
    }

    private RegexRewriter() {
    }

    /**
     * Applies the necessary transformation to string inputs when using replaceAnchors=true.
     *
     * @param input the string that will be matched against a rewritten regex
     * @return the input wrapped between the start and end anchor markers
     */
    public static String anchorTransformation(String input) {
        return START_ANCHOR_MARKER + input + END_ANCHOR_MARKER;
    }

    /**
     * Rewrites the provided regex to support character classes and optionally anchors.
     * If anchor support is enabled then RegexRewriter.anchorTransformation must be applied
     * to strings to be checked.
     *
     * <p>Character classes are expanded first. The negated classes this produces contain a
     * literal caret inside brackets, which the anchor pass leaves alone.
     *
     * @param regex the regex as written by the user
     * @param replaceAnchors whether unescaped anchors should be replaced by the markers
     * @return the rewritten regex
     */
    public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
        CharSequence result = replaceCharClasses(regex);
        if (replaceAnchors) {
            result = replaceAnchors(result);
        }
        return result;
    }

    /**
     * Replaces anchors, unsupported by lucene regex, with reserved Unicode code points.
     * By replacing the anchors in the regex and adding the anchor markers both in the rechecker and at
     * index time via a pattern_replace char_filter, we can offer full support for start and end anchors.
     *
     * <p>Only anchors that are neither backslash-escaped nor inside a bracket expression
     * are replaced; everything else is copied through unchanged.
     *
     * @param input the regex to process
     * @return the regex with unescaped, unbracketed anchors replaced by the markers
     */
    @SuppressWarnings("CyclomaticComplexity")
    static CharSequence replaceAnchors(CharSequence input) {
        StringBuilder result = new StringBuilder();
        boolean inCharClass = false;
        int backslashCount = 0;

        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);

            // An odd run of backslashes immediately before this character escapes it.
            boolean escaped = (backslashCount % 2) != 0;
            if (c == '\\') {
                backslashCount += 1;
            } else {
                backslashCount = 0;
            }

            // Track whether we are inside a bracket expression.
            if (!escaped) {
                if (c == '[') {
                    inCharClass = true;
                } else if (c == ']' && inCharClass) {
                    inCharClass = false;
                }
            }

            if (!inCharClass && !escaped && c == '^') {
                result.append(START_ANCHOR_MARKER);
            } else if (!inCharClass && !escaped && c == '$') {
                result.append(END_ANCHOR_MARKER);
            } else {
                result.append(c);
            }
        }

        return result.toString();
    }

    /**
     * Expands shorthand classes found inside a bracket expression and, for negated
     * expressions, excludes the anchor markers so that the expression never matches them.
     *
     * @param charClass the content between the brackets, without the brackets themselves
     * @return the complete bracket expression, including the surrounding brackets
     */
    private static String expandCharClass(CharSequence charClass) {
        if (charClass.length() == 0) {
            return "[]";
        }
        StringBuilder result = new StringBuilder("[");
        boolean negated = charClass.charAt(0) == '^';
        if (negated) {
            if (charClass.length() == 1) {
                return "[^]";
            }
            // negated must not match the anchors
            result.append('^').append(START_ANCHOR_MARKER).append(END_ANCHOR_MARKER);
        }

        int backslashCount = 0;
        for (int i = negated ? 1 : 0; i < charClass.length(); i++) {
            char c = charClass.charAt(i);

            boolean escaped = (backslashCount % 2) != 0;
            if (c == '\\') {
                backslashCount += 1;
            } else {
                backslashCount = 0;
            }

            String expandedCharClass = CHAR_CLASSES.get(c);
            if (escaped && expandedCharClass != null) {
                // Drop the backslash that was already copied, then inline the expansion.
                result.setLength(result.length() - 1);
                result.append(expandedCharClass);
            } else {
                result.append(c);
            }
        }
        return result.append(']').toString();
    }

    /**
     * Expands the shorthand classes, both standalone and inside bracket expressions, and
     * rewrites the unescaped any-character dot so that it cannot match the anchor markers.
     *
     * <p>An unterminated bracket expression is copied through unchanged.
     *
     * @param input the regex to process
     * @return the regex with shorthand classes expanded
     */
    @SuppressWarnings("CyclomaticComplexity")
    static CharSequence replaceCharClasses(CharSequence input) {
        StringBuilder result = new StringBuilder();
        int charClassStart = -1;
        boolean inCharClass = false;
        int backslashCount = 0;

        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);

            // An odd run of backslashes immediately before this character escapes it.
            boolean escaped = (backslashCount % 2) != 0;
            if (c == '\\') {
                backslashCount += 1;
            } else {
                backslashCount = 0;
            }

            // Only an escaped character can name a shorthand class.
            String shorthand = escaped ? CHAR_CLASSES.get(c) : null;

            if (!inCharClass && !escaped && c == '[') {
                inCharClass = true;
                charClassStart = i + 1;
            } else if (inCharClass && !escaped && c == ']') {
                // While expansion could be mixed into this function, it does a similar walk, it
                // seemed less confusing to separate into a dedicated routine.
                result.append(expandCharClass(input.subSequence(charClassStart, i)));
                inCharClass = false;
                charClassStart = -1;
            } else if (!inCharClass && !escaped && c == '.') {
                // . must not match the anchors
                result.append("[^").append(START_ANCHOR_MARKER).append(END_ANCHOR_MARKER).append(']');
            } else if (!inCharClass && shorthand != null) {
                // Drop the backslash that was already copied, then emit the bracket expression.
                result.setLength(result.length() - 1);
                result.append('[').append(shorthand).append(']');
            } else if (!inCharClass) {
                result.append(c);
            }
            // Characters inside a bracket expression are emitted when it is closed.
        }
        if (inCharClass) {
            // unclosed char class
            result.append('[').append(input.subSequence(charClassStart, input.length()));
        }

        return result.toString();
    }
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package org.wikimedia.utils.regex;
2+
3+
import static org.assertj.core.api.Assertions.assertThat;
4+
5+
import java.util.Arrays;
6+
import java.util.HashMap;
7+
import java.util.Map;
8+
import java.util.function.UnaryOperator;
9+
import java.util.regex.Matcher;
10+
import java.util.regex.Pattern;
11+
12+
import org.apache.lucene.util.automaton.Automaton;
13+
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
14+
import org.apache.lucene.util.automaton.RegExp;
15+
import org.junit.jupiter.api.Test;
16+
17+
class RegexEquivalenceTest {
18+
19+
private CharacterRunAutomaton buildLuceneRegex(String input, boolean replaceAnchors) {
20+
CharSequence rewritten = RegexRewriter.rewrite(input, replaceAnchors);
21+
RegExp regex = new RegExp(".*(" + rewritten + ").*");
22+
Automaton automaton = regex.toAutomaton();
23+
return new CharacterRunAutomaton(automaton);
24+
}
25+
26+
private void assertPatternMatch(Map<String, String> sources, String regex, String... expected) {
27+
// First verify the test case is correct by using java regex
28+
Pattern pattern = Pattern.compile(regex);
29+
// Then run our modified lucene regex to verify the same output
30+
boolean replaceAnchors = true;
31+
CharacterRunAutomaton charRun = buildLuceneRegex(regex, replaceAnchors);
32+
UnaryOperator<String> valueTransform = replaceAnchors ? RegexRewriter::anchorTransformation : s -> s;
33+
34+
for (Map.Entry<String, String> entry : sources.entrySet()) {
35+
boolean expectMatch = Arrays.stream(expected).anyMatch(docid -> docid.equals(entry.getKey()));
36+
37+
Matcher javaMatch = pattern.matcher(entry.getValue());
38+
assertThat(javaMatch.find())
39+
.describedAs("java regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
40+
.isEqualTo(expectMatch);
41+
42+
boolean luceneMatch = charRun.run(valueTransform.apply(entry.getValue()));
43+
assertThat(luceneMatch)
44+
.describedAs("lucene regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
45+
.isEqualTo(expectMatch);
46+
}
47+
}
48+
49+
private void assertNoPatternMatch(Map<String, String> sources, String regex) {
50+
assertPatternMatch(sources, regex);
51+
}
52+
53+
@Test
54+
void testPatternEquivalence() {
55+
Map<String, String> sources = new HashMap<>();
56+
sources.put("findme", "abcdef");
57+
sources.put("numbers", "12345");
58+
sources.put("edgecase1", "Start^Middle$End");
59+
sources.put("edgecase2", "^foo bar$");
60+
61+
// Basic start anchor
62+
assertPatternMatch(sources, "^abc", "findme");
63+
// No match if it's not the start of the string
64+
assertNoPatternMatch(sources, "^bc");
65+
// Basic end anchor
66+
assertPatternMatch(sources, "ef$", "findme");
67+
// No match if it's not the end of the string
68+
assertNoPatternMatch(sources, "de$");
69+
// We can match the plain ^ character with proper regex escaping
70+
assertPatternMatch(sources, "Start\\^", "edgecase1");
71+
assertPatternMatch(sources, "Start[\\^]", "edgecase1");
72+
// The unescaped ^ is an anchor and fails to match
73+
assertNoPatternMatch(sources, "Start^");
74+
// Same for plain $
75+
assertPatternMatch(sources, "Middle\\$", "edgecase1");
76+
// And similarly no match when not escaped
77+
assertNoPatternMatch(sources, "Middle$");
78+
// Can match a starting ^ if escaped
79+
assertNoPatternMatch(sources, "^foo");
80+
assertPatternMatch(sources, "\\^foo", "edgecase2");
81+
// or in a character class
82+
assertPatternMatch(sources, "[a^]foo", "edgecase2");
83+
// Similarly for $
84+
assertNoPatternMatch(sources, "bar$");
85+
assertPatternMatch(sources, "bar\\$", "edgecase2");
86+
assertPatternMatch(sources, "bar\\$$", "edgecase2");
87+
assertPatternMatch(sources, "bar[$]", "edgecase2");
88+
assertPatternMatch(sources, "bar[$]$", "edgecase2");
89+
// anchors can be used in parens
90+
assertPatternMatch(sources, "(^|qqq)abc", "findme");
91+
// any match (.) does not match anchors
92+
assertNoPatternMatch(sources, ".findme");
93+
assertNoPatternMatch(sources, "findme.");
94+
// \d matches numbers
95+
assertPatternMatch(sources, "\\d", "numbers");
96+
// [^\d] matches not-numbers
97+
assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2");
98+
// \s matches spaces
99+
assertPatternMatch(sources, "\\s", "edgecase2");
100+
// [^\s] matches not-spaces
101+
assertPatternMatch(sources, "^[^\\s]+$", "findme", "numbers", "edgecase1");
102+
// \w matches word-like things, it does not match spaces or special chars
103+
assertPatternMatch(sources, "^\\w+$", "findme", "numbers");
104+
assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2");
105+
}
106+
}

0 commit comments

Comments
 (0)