Mercurial > p > roundup > code
changeset 6353:9d209d2b34ae
Add indexer_language to change stemmer for xapian FTS indexer
Nagy Gabor asked how to enable the Hungarian stemmer in Roundup. Previously
this required editing indexer_xapian.py to replace the hardcoded "english"
term. This value is now exposed in the config file under [main]
indexer_language.
Currently this setting only affects the Xapian indexer.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 28 Mar 2021 23:34:43 -0400 |
| parents | d3a5d0d95869 |
| children | b61de764c8cc |
| files | CHANGES.txt roundup/backends/indexer_common.py roundup/backends/indexer_xapian.py roundup/configuration.py test/test_indexer.py |
| diffstat | 5 files changed, 45 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGES.txt Sat Mar 27 13:05:50 2021 -0400 +++ b/CHANGES.txt Sun Mar 28 23:34:43 2021 -0400 @@ -99,6 +99,9 @@ customizing.txt. Result of mailing list question. (John Rouillard) - issue2551109 - Improve keyword editing in jinja2 template. (Cedric Krier) - issue2551117 - Add example systemd config +- Allow admin to configure language used for stemming in xapian + indexer. (John Rouillard request by Nagy Gabor) + 2020-07-13 2.0.0
--- a/roundup/backends/indexer_common.py Sat Mar 27 13:05:50 2021 -0400 +++ b/roundup/backends/indexer_common.py Sun Mar 28 23:34:43 2021 -0400 @@ -23,6 +23,7 @@ # gibberish (encoded text or somesuch) or shorter than 2 characters self.minlength = 2 self.maxlength = 25 + self.language = db.config[('main','indexer_language')] def is_stopword(self, word): return word in self.stopwords
--- a/roundup/backends/indexer_xapian.py Sat Mar 27 13:05:50 2021 -0400 +++ b/roundup/backends/indexer_xapian.py Sun Mar 28 23:34:43 2021 -0400 @@ -6,6 +6,7 @@ from roundup.backends.indexer_common import Indexer as IndexerBase from roundup.anypy.strings import b2s, s2b +from roundup.i18n import _ # TODO: we need to delete documents when a property is *reindexed* @@ -21,6 +22,18 @@ self.reindex = 0 self.transaction_active = False + # self.language defined in IndexerBase.__init__ + # validate it here + try: + xapian.Stem(self.language) + except xapian.InvalidArgumentError: + raise ValueError( + _("Invalid indexer_language %(lang)s for xapian indexer\n" + "Valid languages: %(valid)s") % { + "lang": self.language, + "valid": b2s(xapian.Stem.get_available_languages()) } + ) + def _get_database(self): index = os.path.join(self.db_path, 'text-index') for n in range(10): @@ -80,8 +93,7 @@ #database.begin_transaction() #self.transaction_active = True - # TODO: allow configuration of other languages - stemmer = xapian.Stem("english") + stemmer = xapian.Stem(self.language) # We use the identifier twice: once in the actual "text" being # indexed so we can search on it, and again as the "data" being @@ -115,7 +127,7 @@ database = self._get_database() enquire = xapian.Enquire(database) - stemmer = xapian.Stem("english") + stemmer = xapian.Stem(self.language) terms = [] for term in [word.upper() for word in wordlist if self.minlength <= len(word) <= self.maxlength]:
--- a/roundup/configuration.py Sat Mar 27 13:05:50 2021 -0400 +++ b/roundup/configuration.py Sun Mar 28 23:34:43 2021 -0400 @@ -746,6 +746,12 @@ "If no indexer is supplied, the first available indexer\n" "will be used in the following order:\n" "Possible values: xapian, whoosh, native (internal)."), + (Option, "indexer_language", "english", + "Used to determine what language should be used by the\n" + "indexer above. Currently only affects Xapian indexer. It\n" + "sets the language for the stemmer.\n" + "Possible values: must be a valid language for the indexer,\n" + "see indexer documentation for details."), (WordListOption, "indexer_stopwords", "", "Additional stop-words for the full-text indexer specific to\n" "your tracker. See the indexer source for the default list of\n"
--- a/test/test_indexer.py Sat Mar 27 13:05:50 2021 -0400 +++ b/test/test_indexer.py Sun Mar 28 23:34:43 2021 -0400 @@ -55,6 +55,7 @@ DATABASE = 'test-index' config = config() config[('main', 'indexer_stopwords')] = [] + config[('main', 'indexer_language')] = "english" class IndexerTest(unittest.TestCase): def setUp(self): @@ -194,7 +195,26 @@ self.dex = Indexer(db) def tearDown(self): shutil.rmtree('test-index') + def test_invalid_language(self): + """ make sure we have a reasonable error message if + invalid language is specified """ + l = db.config[('main', 'indexer_language')] + db.config[('main', 'indexer_language')] = "NO_LANG" + from roundup.backends.indexer_xapian import Indexer + with self.assertRaises(ValueError) as cm: + Indexer(db) + # note if Indexer(db) doesn't return ValueError + # all Xapian tests after this point will fail. + # because a valid langage will not be set. + # reset the valid language. + db.config[('main', 'indexer_language')] = l + print(cm) + self.assertIn("ValueError", repr(cm.exception)) + # look for failing language + self.assertIn("NO_LANG", cm.exception.args[0]) + # look for supported language + self.assertIn("english", cm.exception.args[0]) class RDBMSIndexerTest(object): def setUp(self):
