Commit 0555fef

malpani authored and romseygeek committed
Support ignore_keywords flag for word delimiter graph token filter (#59563)
This commit allows the word delimiter graph token filter to be configured to skip processing tokens tagged as keywords, through the `ignore_keywords` flag that Lucene's WordDelimiterGraphFilter already exposes. Fixes #59491
1 parent a0ad1a1 commit 0555fef
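
For background, here is a minimal standalone sketch (not part of the commit) of the Lucene behavior the new flag exposes, assuming a Lucene 8.x classpath; the class name and sample input are illustrative only. A SetKeywordMarkerFilter marks "PowerHungry" with the keyword attribute, and a WordDelimiterGraphFilter built with IGNORE_KEYWORDS splits "PowerShot" but leaves the marked token intact.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IgnoreKeywordsSketch {
    public static void main(String[] args) throws IOException {
        // Tokenize on whitespace, then mark "PowerHungry" with the keyword attribute.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot PowerHungry"));
        TokenStream stream = new SetKeywordMarkerFilter(tokenizer,
                new CharArraySet(Arrays.asList("PowerHungry"), false));

        // IGNORE_KEYWORDS makes the filter pass keyword-marked tokens through unchanged.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.IGNORE_KEYWORDS;
        stream = new WordDelimiterGraphFilter(stream, flags, null);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints: Power, Shot, PowerHungry
        }
        stream.end();
        stream.close();
    }
}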

3 files changed: 42 additions, 1 deletion

docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc

Lines changed: 7 additions & 1 deletion
@@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters
 in the output. If `false`, the filter excludes these tokens from the output.
 Defaults to `true`.

+`ignore_keywords`::
+(Optional, boolean)
+If `true`, the filter skips tokens with
+a `keyword` attribute of `true`.
+Defaults to `false`.
+
 [[word-delimiter-graph-tokenfilter-preserve-original]]
 `preserve_original`::
 +
@@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.

 image::images/analysis/token-graph-wd.svg[align="center"]

-====
+====
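
The flag only has an effect when an earlier filter in the analyzer chain, such as `keyword_marker`, has set the `keyword` attribute on a token. The following is an illustrative configuration sketch only (filter and analyzer names such as my_keyword and my_analyzer are made up); the test added at the bottom of this commit builds the same chain through the test helpers.

import org.elasticsearch.common.settings.Settings;

// keyword_marker runs first and tags "PowerHungry"; word_delimiter_graph with
// ignore_keywords=true then leaves that token unsplit.
Settings indexSettings = Settings.builder()
    .put("index.analysis.filter.my_keyword.type", "keyword_marker")
    .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
    .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
    .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
    .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
    .put("index.analysis.analyzer.my_analyzer.type", "custom")
    .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
    .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
    .build();

With this analyzer, "PowerShot PowerHungry" analyzes to Power, Shot, PowerHungry, matching the expectations in the test below.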

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,7 @@
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@@ -93,6 +94,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
+        flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
+        // If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
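
The factory collects each boolean setting into a single Lucene flag bitmask via `getFlag`. That helper is defined elsewhere in the module and is not shown in this diff; the following is only a sketch of the assumed pattern, not the actual Elasticsearch source.

import org.elasticsearch.common.settings.Settings;

// Assumed shape of the helper: return the Lucene flag bit when the setting is
// enabled (or defaults to true), 0 otherwise, so callers can OR results together.
static int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
    return settings.getAsBoolean(key, defaultValue) ? flag : 0;
}

With `ignore_keywords` defaulting to `false`, the OR contributes nothing and existing analyzers keep their behavior; setting it to `true` adds IGNORE_KEYWORDS to the flags handed to Lucene's WordDelimiterGraphFilter.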

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

Lines changed: 32 additions & 0 deletions
@@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException {
             expectedIncr, expectedPosLen, null);
     }

+    public void testIgnoreKeywords() throws IOException {
+        //test with keywords but ignore is false (default behavior)
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
+            .put("index.analysis.analyzer.my_analyzer.type", "custom")
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        String source = "PowerShot PowerHungry";
+        int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
+        int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
+        String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+
+        //test with keywords but ignore_keywords is set as true
+        settings = Settings.builder().put(settings)
+            .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
+            .build();
+        analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        expectedStartOffsets = new int[]{0, 5, 10};
+        expectedEndOffsets = new int[]{5, 9, 21};
+        expected = new String[]{"Power", "Shot", "PowerHungry"};
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+    }
+
     public void testPreconfiguredFilter() throws IOException {
         // Before 7.3 we don't adjust offsets
         {
