diff roundup/backends/indexer_xapian.py @ 5491:e72573996caf

fixed encoding issues for Xapian indexer
author Christof Meerwald <cmeerw@cmeerw.org>
date Mon, 06 Aug 2018 20:52:15 +0100
parents 93832cec4c31
children 5bf7b5debb09
line wrap: on
line diff
--- a/roundup/backends/indexer_xapian.py	Sun Aug 05 14:04:47 2018 +0100
+++ b/roundup/backends/indexer_xapian.py	Mon Aug 06 20:52:15 2018 +0100
@@ -5,9 +5,15 @@
 import xapian
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import b2s, s2b
 
 # TODO: we need to delete documents when a property is *reindexed*
 
+# Note that Xapian always uses UTF-8 encoded strings, see
+# https://xapian.org/docs/bindings/python3/introduction.html#strings:
+# "Where std::string is returned, it's always mapped to bytes in
+# Python..."
+
 class Indexer(IndexerBase):
     def __init__(self, db):
         IndexerBase.__init__(self, db)
@@ -80,7 +86,7 @@
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
         # indexed so we know what we're matching when we get results
-        identifier = '%s:%s:%s'%identifier
+        identifier = s2b('%s:%s:%s'%identifier)
 
         # create the new document
         doc = xapian.Document()
@@ -93,7 +99,7 @@
             word = match.group(0)
             if self.is_stopword(word):
                 continue
-            term = stemmer(word.lower())
+            term = stemmer(s2b(word.lower()))
             doc.add_posting(term, match.start(0))
 
         database.replace_document(identifier, doc)
@@ -114,12 +120,12 @@
         for term in [word.upper() for word in wordlist
                           if self.minlength <= len(word) <= self.maxlength]:
             if not self.is_stopword(term):
-                terms.append(stemmer(term.lower()))
+                terms.append(stemmer(s2b(term.lower())))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
         matches = enquire.get_mset(0, database.get_doccount())
 
-        return [tuple(m.document.get_data().split(':'))
+        return [tuple(b2s(m.document.get_data()).split(':'))
             for m in matches]
 

Roundup Issue Tracker: http://roundup-tracker.org/