Add support for ignore_keywords flag in word delimiter graph token filter

malpani · malpani · commit 751dd13d215b · 2020-07-14T12:13:41.000-07:00
Support ignore_keywords flag for word delimiter graph token filter. Lucene's WordDelimiterGraphFilter can skip processing of tokens tagged as keywords; however, the Elasticsearch word delimiter graph token filter does not support this yet. This change updates the Elasticsearch implementation to honor the ignore_keywords flag, enabling better customization of token filters. Fixes #59491.
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
@@ -38,6 +38,7 @@
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@@ -87,6 +88,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
+        // If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
+        flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
         // If not null is the set of tokens to protect from being delimited
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
@@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException {
             expectedIncr, expectedPosLen, null);
     }
 
+    public void testIgnoreKeywords() throws IOException {
+        // Baseline: ignore_keywords is left unset (false), so the keyword-marked token is still split.
+        Settings withoutFlag = Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.analyzer.my_analyzer.type", "custom")
+                .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
+                .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+                .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
+                .put("index.analysis.filter.my_word_delimiter.type", type)
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                .build();
+        final String text = "PowerShot PowerHungry";
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(withoutFlag, new CommonAnalysisPlugin());
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        String[] tokens = {"Power", "Shot", "Power", "Hungry"};
+        int[] startOffsets = {0, 5, 10, 15};
+        int[] endOffsets = {5, 9, 15, 21};
+        assertAnalyzesTo(analyzer, text, tokens, startOffsets, endOffsets);
+
+        // Same chain with ignore_keywords=true: "PowerHungry" must pass through as a single token.
+        Settings withFlag = Settings.builder().put(withoutFlag)
+                .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
+                .build();
+        analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(withFlag, new CommonAnalysisPlugin());
+        analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        tokens = new String[]{"Power", "Shot", "PowerHungry"};
+        startOffsets = new int[]{0, 5, 10};
+        endOffsets = new int[]{5, 9, 21};
+        assertAnalyzesTo(analyzer, text, tokens, startOffsets, endOffsets);
+    }
+
     public void testPreconfiguredFilter() throws IOException {
         // Before 7.3 we don't adjust offsets
         {