wikimedia
diff --git a/‎lucene-regex-rewriter/pom.xml‎
Lines changed: 38 additions & 0 deletions b/‎lucene-regex-rewriter/pom.xml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java‎
Lines changed: 169 additions & 0 deletions b/‎lucene-regex-rewriter/src/main/java/org/wikimedia/utils/regex/RegexRewriter.java‎
Lines changed: 169 additions & 0 deletions
diff --git a/‎lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java‎
Lines changed: 106 additions & 0 deletions b/‎lucene-regex-rewriter/src/test/java/org/wikimedia/utils/regex/RegexEquivalenceTest.java‎
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.wikimedia.utils</groupId>
+        <artifactId>wmf-jvm-utils-parent</artifactId>
+        <version>1.0.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>lucene-regex-rewriter</artifactId>
+    <packaging>jar</packaging>
+
+    <name>Transformations over lucene regular expressions to support additional syntax</name>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.github.spotbugs</groupId>
+            <artifactId>spotbugs-annotations</artifactId>
+            <optional>true</optional>
+        </dependency>
+        <!-- Test deps -->
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-misc</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.assertj</groupId>
+            <artifactId>assertj-core</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
@@ -0,0 +1,169 @@
+package org.wikimedia.utils.regex;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
+
+/**
+ * Rewrites regular expressions so that the {@code \d}, {@code \w} and {@code \s}
+ * shorthand character classes and, optionally, the {@code ^} and {@code $} anchors
+ * can be used with lucene regular expressions.
+ *
+ * <p>Anchors are emulated rather than evaluated: {@code ^} and {@code $} in the pattern
+ * are replaced by {@link #START_ANCHOR_MARKER} and {@link #END_ANCHOR_MARKER}, and the
+ * text being matched must be wrapped in the same markers, see
+ * {@link #anchorTransformation(String)}.
+ */
+public final class RegexRewriter {
+    /** Stand-in for the {@code ^} anchor: U+FDD0, a Unicode noncharacter. */
+    public static final char START_ANCHOR_MARKER = '\uFDD0';
+    /** Stand-in for the {@code $} anchor: U+FDD1, a Unicode noncharacter. */
+    public static final char END_ANCHOR_MARKER = '\uFDD1';
+
+    /**
+     * Maps the letter of a shorthand class (the character following the backslash) to the
+     * body of the equivalent bracket expression, without the surrounding brackets.
+     */
+    private static final Map<Character, String> CHAR_CLASSES;
+
+    static {
+        Map<Character, String> charClasses = new HashMap<>();
+        charClasses.put('d', "0-9");
+        charClasses.put('w', "A-Za-z0-9_");
+        // Whitespace: form feed, line feed, carriage return, tab, vertical tab (U+000B),
+        // space, no-break space and the remaining Unicode space/line separators.
+        charClasses.put('s', "\f\n\r\t\u000B\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff");
+        CHAR_CLASSES = Collections.unmodifiableMap(charClasses);
+    }
+
+    private RegexRewriter() {
+    }
+
+    /**
+     * Applies the necessary transformation to string inputs when using replaceAnchors=true.
+     *
+     * @param input the text that will be matched against a rewritten regex
+     * @return {@code input} with {@link #START_ANCHOR_MARKER} prepended and
+     *         {@link #END_ANCHOR_MARKER} appended
+     */
+    public static String anchorTransformation(String input) {
+        return START_ANCHOR_MARKER + input + END_ANCHOR_MARKER;
+    }
+
+    /**
+     * Rewrites the provided regex to support character classes and optionally anchors.
+     * If anchor support is enabled then RegexRewriter.anchorTransformation must be applied
+     * to strings to be checked.
+     *
+     * @param regex the regex as written by the user
+     * @param replaceAnchors whether unescaped {@code ^} and {@code $} found outside of
+     *        bracket expressions are replaced by the anchor markers
+     * @return the rewritten regex
+     */
+    public static CharSequence rewrite(CharSequence regex, boolean replaceAnchors) {
+        // Character classes are expanded first; the brackets this introduces are skipped
+        // by replaceAnchors like any other bracket expression.
+        CharSequence result = replaceCharClasses(regex);
+        if (replaceAnchors) {
+            result = replaceAnchors(result);
+        }
+        return result;
+    }
+
+    /**
+     * Replaces anchors, unsupported by lucene regex, with reserved Unicode noncharacters.
+     * By replacing the anchors in the regex and adding the anchor markers both in the rechecker and at
+     * index time via a pattern_replace char_filter, we can offer full support for start and end anchors.
+     *
+     * <p>Only unescaped {@code ^} and {@code $} found outside of a bracket expression are
+     * replaced; every other character is copied unchanged.
+     *
+     * @param input the regex to process
+     * @return the regex with its anchors replaced by the marker characters
+     */
+    @SuppressWarnings("CyclomaticComplexity")
+    static CharSequence replaceAnchors(CharSequence input) {
+        StringBuilder result = new StringBuilder();
+        boolean inCharClass = false;
+        int backslashCount = 0;
+
+        for (int i = 0; i < input.length(); i++) {
+            char c = input.charAt(i);
+
+            // Count the number of backslashes preceding this character: an odd count
+            // means the current character is escaped.
+            boolean escaped = (backslashCount % 2) != 0;
+            if (c == '\\') {
+                backslashCount += 1;
+            } else {
+                backslashCount = 0;
+            }
+
+            // Track whether we are inside a bracket expression, where ^ and $ are not anchors
+            if (!escaped) {
+                if (c == '[') {
+                    inCharClass = true;
+                } else if (c == ']' && inCharClass) {
+                    inCharClass = false;
+                }
+            }
+
+            if (!inCharClass && !escaped && c == '^') {
+                result.append(START_ANCHOR_MARKER);
+            } else if (!inCharClass && !escaped && c == '$') {
+                result.append(END_ANCHOR_MARKER);
+            } else {
+                result.append(c);
+            }
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Expands the shorthand classes found inside a bracket expression.
+     *
+     * @param charClass the content of the bracket expression, without the enclosing brackets
+     * @return the complete bracket expression, brackets included; when negated the anchor
+     *         markers are added to the excluded characters
+     */
+    private static String expandCharClass(CharSequence charClass) {
+        if (charClass.length() == 0) {
+            return "[]";
+        }
+        StringBuilder result = new StringBuilder("[");
+        boolean negated = charClass.charAt(0) == '^';
+        if (negated) {
+            if (charClass.length() == 1) {
+                return "[^]";
+            }
+            // negated must not match the anchors
+            result.append('^').append(START_ANCHOR_MARKER).append(END_ANCHOR_MARKER);
+        }
+
+        int backslashCount = 0;
+        for (int i = negated ? 1 : 0; i < charClass.length(); i++) {
+            char c = charClass.charAt(i);
+
+            boolean escaped = (backslashCount % 2) != 0;
+            if (c == '\\') {
+                backslashCount += 1;
+            } else {
+                backslashCount = 0;
+            }
+
+            String expandedCharClass = CHAR_CLASSES.get(c);
+            if (escaped && expandedCharClass != null) {
+                // Drop the backslash appended by the previous iteration
+                result.setLength(result.length() - 1);
+                result.append(expandedCharClass);
+            } else {
+                result.append(c);
+            }
+        }
+        return result.append(']').toString();
+    }
+
+    /**
+     * Expands the shorthand character classes, both standalone and inside bracket
+     * expressions, and replaces the unescaped any-character {@code .} found outside of
+     * bracket expressions with a negated class excluding the anchor markers.
+     *
+     * <p>An unclosed bracket expression is copied to the output without being expanded.
+     *
+     * @param input the regex to process
+     * @return the regex with its character classes expanded
+     */
+    @SuppressWarnings("CyclomaticComplexity")
+    @SuppressFBWarnings(value = "MUI_CONTAINSKEY_BEFORE_GET", justification = "More obviously correct this way")
+    static CharSequence replaceCharClasses(CharSequence input) {
+        StringBuilder result = new StringBuilder();
+        int charClassStart = -1;
+        boolean inCharClass = false;
+        int backslashCount = 0;
+
+        for (int i = 0; i < input.length(); i++) {
+            char c = input.charAt(i);
+
+            // Count the number of backslashes preceding this character
+            boolean escaped = (backslashCount % 2) != 0;
+            if (c == '\\') {
+                backslashCount += 1;
+            } else {
+                backslashCount = 0;
+            }
+
+            if (!inCharClass && !escaped && c == '[') {
+                // Nothing is emitted while inside the brackets, the whole expression is
+                // written at once when the closing bracket is found.
+                inCharClass = true;
+                charClassStart = i + 1;
+            } else if (inCharClass && !escaped && c == ']') {
+                // While expansion could be mixed into this function, it does a similar walk, it
+                // seemed less confusing to separate into a dedicated routine.
+                result.append(expandCharClass(input.subSequence(charClassStart, i)));
+                inCharClass = false;
+                charClassStart = -1;
+            } else if (!inCharClass && !escaped && c == '.') {
+                // . must not match the anchors
+                result.append("[^").append(START_ANCHOR_MARKER).append(END_ANCHOR_MARKER).append(']');
+            } else if (!inCharClass && escaped && CHAR_CLASSES.containsKey(c)) {
+                // Drop the backslash appended by the previous iteration
+                result.setLength(result.length() - 1);
+                result.append('[').append(CHAR_CLASSES.get(c)).append(']');
+            } else if (!inCharClass) {
+                result.append(c);
+            }
+        }
+        if (inCharClass) {
+            // unclosed char class
+            result.append('[').append(input.subSequence(charClassStart, input.length()));
+        }
+
+        return result.toString();
+    }
+}
@@ -0,0 +1,106 @@
+package org.wikimedia.utils.regex;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.UnaryOperator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Verifies that regexes rewritten by {@link RegexRewriter} and run through lucene give
+ * the same match/no-match answers as the unmodified regex run through java.util.regex.
+ */
+class RegexEquivalenceTest {
+
+    /**
+     * Builds a lucene automaton for the rewritten regex. The regex is wrapped in
+     * {@code .*( ... ).*} so that the automaton, which matches whole strings, behaves
+     * like a find over the input.
+     */
+    private CharacterRunAutomaton buildLuceneRegex(String input, boolean replaceAnchors) {
+        CharSequence rewritten = RegexRewriter.rewrite(input, replaceAnchors);
+        RegExp regex = new RegExp(".*(" + rewritten + ").*");
+        Automaton automaton = regex.toAutomaton();
+        return new CharacterRunAutomaton(automaton);
+    }
+
+    /**
+     * Asserts that {@code regex} matches exactly the sources whose key is listed in
+     * {@code expected}, both with java regex and with the rewritten lucene regex.
+     *
+     * @param sources map of document id to the text to match against
+     * @param regex the regex under test
+     * @param expected ids of the documents that must match, all others must not match
+     */
+    private void assertPatternMatch(Map<String, String> sources, String regex, String... expected) {
+        // First verify the test case is correct by using java regex
+        Pattern pattern = Pattern.compile(regex);
+        // Then run our modified lucene regex to verify the same output
+        boolean replaceAnchors = true;
+        CharacterRunAutomaton charRun = buildLuceneRegex(regex, replaceAnchors);
+        UnaryOperator<String> valueTransform = replaceAnchors ? RegexRewriter::anchorTransformation : s -> s;
+
+        for (Map.Entry<String, String> entry : sources.entrySet()) {
+            boolean expectMatch = Arrays.stream(expected).anyMatch(docid -> docid.equals(entry.getKey()));
+
+            Matcher javaMatch = pattern.matcher(entry.getValue());
+            assertThat(javaMatch.find())
+                .describedAs("java regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
+                .isEqualTo(expectMatch);
+
+            boolean luceneMatch = charRun.run(valueTransform.apply(entry.getValue()));
+            assertThat(luceneMatch)
+                .describedAs("lucene regex `%s` against `%s`:`%s`", regex, entry.getKey(), entry.getValue())
+                .isEqualTo(expectMatch);
+        }
+    }
+
+    /** Asserts that {@code regex} matches none of the sources. */
+    private void assertNoPatternMatch(Map<String, String> sources, String regex) {
+        assertPatternMatch(sources, regex);
+    }
+
+    @Test
+    void testPatternEquivalence() {
+        Map<String, String> sources = new HashMap<>();
+        sources.put("findme", "abcdef");
+        sources.put("numbers", "12345");
+        sources.put("edgecase1", "Start^Middle$End");
+        sources.put("edgecase2", "^foo bar$");
+
+        // Basic start anchor
+        assertPatternMatch(sources, "^abc", "findme");
+        // No match if it's not the start of the string
+        assertNoPatternMatch(sources, "^bc");
+        // Basic end anchor
+        assertPatternMatch(sources, "ef$", "findme");
+        // No match if it's not the end of the string
+        assertNoPatternMatch(sources, "de$");
+        // We can match the plain ^ character with proper regex escaping
+        assertPatternMatch(sources, "Start\\^", "edgecase1");
+        assertPatternMatch(sources, "Start[\\^]", "edgecase1");
+        // The unescaped ^ is an anchor and fails to match
+        assertNoPatternMatch(sources, "Start^");
+        // Same for plain $
+        assertPatternMatch(sources, "Middle\\$", "edgecase1");
+        // And similarly no match when not escaped
+        assertNoPatternMatch(sources, "Middle$");
+        // An unescaped ^ is an anchor, it does not match a literal ^ starting the text
+        assertNoPatternMatch(sources, "^foo");
+        // Can match a starting ^ if escaped
+        assertPatternMatch(sources, "\\^foo", "edgecase2");
+        // or in a character class
+        assertPatternMatch(sources, "[a^]foo", "edgecase2");
+        // Similarly for $: unescaped it is an anchor and does not match a literal ending $
+        assertNoPatternMatch(sources, "bar$");
+        assertPatternMatch(sources, "bar\\$", "edgecase2");
+        assertPatternMatch(sources, "bar\\$$", "edgecase2");
+        assertPatternMatch(sources, "bar[$]", "edgecase2");
+        assertPatternMatch(sources, "bar[$]$", "edgecase2");
+        // anchors can be used in parens
+        assertPatternMatch(sources, "(^|qqq)abc", "findme");
+        // any match (.) does not match anchors: the only characters around the full
+        // value are the anchor markers, so requiring one more character must fail
+        assertNoPatternMatch(sources, ".abcdef");
+        assertNoPatternMatch(sources, "abcdef.");
+        // \d matches numbers
+        assertPatternMatch(sources, "\\d", "numbers");
+        // [^\d] matches not-numbers
+        assertPatternMatch(sources, "[^\\d]", "findme", "edgecase1", "edgecase2");
+        // \s matches spaces
+        assertPatternMatch(sources, "\\s", "edgecase2");
+        // [^\s] matches not-spaces
+        assertPatternMatch(sources, "^[^\\s]+$", "findme", "numbers", "edgecase1");
+        // \w matches word-like things, it does not match spaces or special chars
+        assertPatternMatch(sources, "^\\w+$", "findme", "numbers");
+        assertPatternMatch(sources, "[^\\w]", "edgecase1", "edgecase2");
+    }
+}