changeset 6353:9d209d2b34ae

Add indexer_language to change stemmer for xapian FTS indexer. Nagy Gabor asked how to enable the Hungarian stemmer in roundup. This required editing indexer_xapian.py and replacing the hardcoded "english" term. This value is now exposed in the config file under [main] indexer_language. This only works for xapian currently.
author John Rouillard <rouilj@ieee.org>
date Sun, 28 Mar 2021 23:34:43 -0400
parents d3a5d0d95869
children b61de764c8cc
files CHANGES.txt roundup/backends/indexer_common.py roundup/backends/indexer_xapian.py roundup/configuration.py test/test_indexer.py
diffstat 5 files changed, 45 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/CHANGES.txt	Sat Mar 27 13:05:50 2021 -0400
+++ b/CHANGES.txt	Sun Mar 28 23:34:43 2021 -0400
@@ -99,6 +99,9 @@
   customizing.txt. Result of mailing list question. (John Rouillard)
 - issue2551109 - Improve keyword editing in jinja2 template. (Cedric Krier)
 - issue2551117 - Add example systemd config
+- Allow admin to configure language used for stemming in xapian
+  indexer. (John Rouillard request by Nagy Gabor)
+
 
 2020-07-13 2.0.0
 
--- a/roundup/backends/indexer_common.py	Sat Mar 27 13:05:50 2021 -0400
+++ b/roundup/backends/indexer_common.py	Sun Mar 28 23:34:43 2021 -0400
@@ -23,6 +23,7 @@
         # gibberish (encoded text or somesuch) or shorter than 2 characters
         self.minlength = 2
         self.maxlength = 25
+        self.language = db.config[('main','indexer_language')]
 
     def is_stopword(self, word):
         return word in self.stopwords
--- a/roundup/backends/indexer_xapian.py	Sat Mar 27 13:05:50 2021 -0400
+++ b/roundup/backends/indexer_xapian.py	Sun Mar 28 23:34:43 2021 -0400
@@ -6,6 +6,7 @@
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
 from roundup.anypy.strings import b2s, s2b
+from roundup.i18n import _
 
 # TODO: we need to delete documents when a property is *reindexed*
 
@@ -21,6 +22,18 @@
         self.reindex = 0
         self.transaction_active = False
 
+        # self.language defined in IndexerBase.__init__
+        # validate it here
+        try:
+            xapian.Stem(self.language)
+        except xapian.InvalidArgumentError:
+            raise ValueError(
+                _("Invalid indexer_language %(lang)s for xapian indexer\n"
+                  "Valid languages: %(valid)s") % {
+                      "lang": self.language,
+                      "valid": b2s(xapian.Stem.get_available_languages()) }
+            )
+
     def _get_database(self):
         index = os.path.join(self.db_path, 'text-index')
         for n in range(10):
@@ -80,8 +93,7 @@
             #database.begin_transaction()
             #self.transaction_active = True
 
-        # TODO: allow configuration of other languages
-        stemmer = xapian.Stem("english")
+        stemmer = xapian.Stem(self.language)
 
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
@@ -115,7 +127,7 @@
         database = self._get_database()
 
         enquire = xapian.Enquire(database)
-        stemmer = xapian.Stem("english")
+        stemmer = xapian.Stem(self.language)
         terms = []
         for term in [word.upper() for word in wordlist
                           if self.minlength <= len(word) <= self.maxlength]:
--- a/roundup/configuration.py	Sat Mar 27 13:05:50 2021 -0400
+++ b/roundup/configuration.py	Sun Mar 28 23:34:43 2021 -0400
@@ -746,6 +746,12 @@
             "If no indexer is supplied, the first available indexer\n"
             "will be used in the following order:\n"
             "Possible values: xapian, whoosh, native (internal)."),
+        (Option, "indexer_language", "english",
+            "Used to determine what language should be used by the\n"
+            "indexer above. Currently only affects Xapian indexer. It\n"
+            "sets the language for the stemmer.\n"
+            "Possible values: must be a valid language for the indexer,\n"
+            "see indexer documentation for details."),
         (WordListOption, "indexer_stopwords", "",
             "Additional stop-words for the full-text indexer specific to\n"
             "your tracker. See the indexer source for the default list of\n"
--- a/test/test_indexer.py	Sat Mar 27 13:05:50 2021 -0400
+++ b/test/test_indexer.py	Sun Mar 28 23:34:43 2021 -0400
@@ -55,6 +55,7 @@
         DATABASE = 'test-index'
     config = config()
     config[('main', 'indexer_stopwords')] = []
+    config[('main', 'indexer_language')] = "english"
 
 class IndexerTest(unittest.TestCase):
     def setUp(self):
@@ -194,7 +195,26 @@
         self.dex = Indexer(db)
     def tearDown(self):
         shutil.rmtree('test-index')
+    def test_invalid_language(self):
+        """ make sure we have a reasonable error message if
+            invalid language is specified """
+        l = db.config[('main', 'indexer_language')]
+        db.config[('main', 'indexer_language')] = "NO_LANG"
+        from roundup.backends.indexer_xapian import Indexer
+        with self.assertRaises(ValueError) as cm:
+            Indexer(db)
+        # Note: if Indexer(db) doesn't raise ValueError,
+        # all Xapian tests after this point will fail
+        # because a valid language will not be set.
+        # reset the valid language.
+        db.config[('main', 'indexer_language')] =  l
 
+        print(cm)
+        self.assertIn("ValueError", repr(cm.exception))
+        # look for failing language
+        self.assertIn("NO_LANG", cm.exception.args[0])
+        # look for supported language
+        self.assertIn("english", cm.exception.args[0])
 
 class RDBMSIndexerTest(object):
     def setUp(self):

Roundup Issue Tracker: http://roundup-tracker.org/