Skip to content

Commit 751dd13

Browse files
committed
Add support for ignore_keywords flag in word delimiter graph token filter
Support ignore_keywords flag for word delimiter graph token filter. Lucene's WordDelimiterGraphFilter allows skipping the processing of tokens tagged as keywords. However, the Elasticsearch word delimiter graph token filter does not support this yet. I would like to update the Elasticsearch implementation to incorporate the ignore_keywords flag to enable better customization of token filters. Fix for #59491
1 parent a51dda8 commit 751dd13

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
3939
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
4040
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
41+
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.IGNORE_KEYWORDS;
4142
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
4243
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
4344
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
@@ -87,6 +88,8 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
8788
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
8889
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
8990
// If not null is the set of tokens to protect from being delimited
91+
flags |= getFlag(IGNORE_KEYWORDS, settings, "ignore_keywords", false);
92+
// If set, suppresses processing terms with KeywordAttribute#isKeyword()=true.
9093
Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
9194
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
9295
this.flags = flags;

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,38 @@ public void testAdjustingOffsets() throws IOException {
118118
expectedIncr, expectedPosLen, null);
119119
}
120120

121+
/**
 * Checks the {@code ignore_keywords} setting of the word delimiter graph filter. The analyzer under test uses the
 * whitespace tokenizer followed by a {@code keyword_marker} filter (keyword "PowerHungry") and then the word
 * delimiter filter, and is run over "PowerShot PowerHungry" with and without {@code ignore_keywords}.
 */
public void testIgnoreKeywords() throws IOException {
122+
// Case 1: ignore_keywords is not set (default false), so the keyword-marked "PowerHungry" is expected to be split just like "PowerShot"
123+
Settings settings = Settings.builder()
124+
.put("index.analysis.filter.my_word_delimiter.type", type)
125+
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
126+
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
127+
.put("index.analysis.filter.my_keyword.keywords", "PowerHungry")
128+
.put("index.analysis.analyzer.my_analyzer.type", "custom")
129+
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
130+
.put("index.analysis.analyzer.my_analyzer.filter", "my_keyword, my_word_delimiter")
131+
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
132+
.build();
133+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
134+
String source = "PowerShot PowerHungry";
135+
int[] expectedStartOffsets = new int[]{0, 5, 10, 15};
136+
int[] expectedEndOffsets = new int[]{5, 9, 15, 21};
137+
String[] expected = new String[]{"Power", "Shot", "Power", "Hungry"};
138+
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
139+
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
140+
141+
// Case 2: same settings plus ignore_keywords=true; "PowerHungry" is expected to stay one token (offsets 10-21) while "PowerShot" is still split
142+
settings = Settings.builder().put(settings)
143+
.put("index.analysis.filter.my_word_delimiter.ignore_keywords", "true")
144+
.build();
145+
analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
146+
analyzer = analysis.indexAnalyzers.get("my_analyzer");
147+
expectedStartOffsets = new int[]{0, 5, 10};
148+
expectedEndOffsets = new int[]{5, 9, 21};
149+
expected = new String[]{"Power", "Shot", "PowerHungry"};
150+
assertAnalyzesTo(analyzer, source, expected, expectedStartOffsets, expectedEndOffsets);
151+
}
152+
121153
public void testPreconfiguredFilter() throws IOException {
122154
// Before 7.3 we don't adjust offsets
123155
{

0 commit comments

Comments
 (0)