Commit 0555fef

malpani authored and romseygeek committed
Support ignore_keywords flag for word delimiter graph token filter (#59563)
This commit allows the word delimiter graph token filter to be configured to skip processing tokens tagged as keywords, through the `ignore_keywords` flag that Lucene's WordDelimiterGraphFilter already exposes. Fixes #59491
1 parent a0ad1a1 commit 0555fef
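
For background, here is a minimal standalone sketch (not part of the commit) of the Lucene behavior the new flag exposes, assuming a Lucene 8.x classpath; the class name and sample input are illustrative only. A SetKeywordMarkerFilter marks "PowerHungry" with the keyword attribute, and a WordDelimiterGraphFilter built with IGNORE_KEYWORDS splits "PowerShot" but leaves the marked token intact.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IgnoreKeywordsSketch {
    public static void main(String[] args) throws IOException {
        // Tokenize on whitespace, then mark "PowerHungry" with the keyword attribute.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot PowerHungry"));
        TokenStream stream = new SetKeywordMarkerFilter(tokenizer,
                new CharArraySet(Arrays.asList("PowerHungry"), false));

        // IGNORE_KEYWORDS makes the filter pass keyword-marked tokens through unchanged.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.IGNORE_KEYWORDS;
        stream = new WordDelimiterGraphFilter(stream, flags, null);

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints: Power, Shot, PowerHungry
        }
        stream.end();
        stream.close();
    }
}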

3 files changed: 42 additions, 1 deletion

docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc

Lines changed: 7 additions & 1 deletion
@@ -270,6 +270,12 @@ If `true`, the filter includes tokens consisting of only alphabetical characters
 in the output. If `false`, the filter excludes these tokens from the output.
 Defaults to `true`.

+`ignore_keywords`::
+(Optional, boolean)
+If `true`, the filter skips tokens with
+a `keyword` attribute of `true`.
+Defaults to `false`.
+
 [[word-delimiter-graph-tokenfilter-preserve-original]]
 `preserve_original`::
 +
@@ -496,4 +502,4 @@ spans one in the token graph, making it invalid.

 image::images/analysis/token-graph-wd.svg[align="center"]

-====
+====
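
The flag only has an effect when an earlier filter in the analyzer chain, such as `keyword_marker`, has set the `keyword` attribute on a token. The following is an illustrative configuration sketch only (filter and analyzer names such as my_keyword and my_analyzer are made up); the test added at the bottom of this commit builds the same chain through the test helpers.

import org.elasticsearch.common.settings.Settings;

// keyword_marker runs first and tags "PowerHungry"; word_delimiter_graph with
// ignore_keywords=true then leaves that token unsplit.
Settings indexSettings = Settings.builder()
    .put("index.analysis.filter.my_keyword.type", "keyword_marker")
    .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
    .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph")
    .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
    .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
    .put("index.analysis.analyzer.my_analyzer.type", "custom")
    .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
    .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
    .build();

With this analyzer, "PowerShot PowerHungry" analyzes to Power, Shot, PowerHungry, matching the expectations in the test below.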

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,7 @@
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@@ -93,6 +94,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
+        flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
+        // If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
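
The factory collects each boolean setting into a single Lucene flag bitmask via `getFlag`. That helper is defined elsewhere in the module and is not shown in this diff; the following is only a sketch of the assumed pattern, not the actual Elasticsearch source.

import org.elasticsearch.common.settings.Settings;

// Assumed shape of the helper: return the Lucene flag bit when the setting is
// enabled (or defaults to true), 0 otherwise, so callers can OR results together.
static int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
    return settings.getAsBoolean(key, defaultValue) ? flag : 0;
}

With `ignore_keywords` defaulting to `false`, the OR contributes nothing and existing analyzers keep their behavior; setting it to `true` adds IGNORE_KEYWORDS to the flags handed to Lucene's WordDelimiterGraphFilter.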

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

Lines changed: 32 additions & 0 deletions
@@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException {
             expectedIncr, expectedPosLen, null);
     }

+    public void testIgnoreKeywords() throws IOException {
+        //test with keywords but ignore is false (default behavior)
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_word_delimiter.type", type)
+            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
+            .put("index.analysis.analyzer.my_analyzer.type", "custom")
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        String source = "PowerShot PowerHungry";
+        int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
+        int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
+        String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+
+        //test with keywords but ignore_keywords is set as true
+        settings = Settings.builder().put(settings)
+            .put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
+            .build();
+        analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        expectedStartOffsets = new int[]{0, 5, 10};
+        expectedEndOffsets = new int[]{5, 9, 21};
+        expected = new String[]{"Power", "Shot", "PowerHungry"};
+        assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
+    }
+
     public void testPreconfiguredFilter() throws IOException {
         // Before 7.3 we don't adjust offsets
         {
