Mercurial > p > roundup > code
diff roundup/backends/indexer_xapian.py @ 5491:e72573996caf
fixed encoding issues for Xapian indexer
| author | Christof Meerwald <cmeerw@cmeerw.org> |
|---|---|
| date | Mon, 06 Aug 2018 20:52:15 +0100 |
| parents | 93832cec4c31 |
| children | 5bf7b5debb09 |
line wrap: on
line diff
--- a/roundup/backends/indexer_xapian.py Sun Aug 05 14:04:47 2018 +0100
+++ b/roundup/backends/indexer_xapian.py Mon Aug 06 20:52:15 2018 +0100
@@ -5,9 +5,15 @@
 import xapian
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import b2s, s2b
 
 # TODO: we need to delete documents when a property is *reindexed*
 
+# Note that Xapian always uses UTF-8 encoded string, see
+# https://xapian.org/docs/bindings/python3/introduction.html#strings:
+# "Where std::string is returned, it's always mapped to bytes in
+# Python..."
+
 class Indexer(IndexerBase):
     def __init__(self, db):
         IndexerBase.__init__(self, db)
@@ -80,7 +86,7 @@
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
         # indexed so we know what we're matching when we get results
-        identifier = '%s:%s:%s'%identifier
+        identifier = s2b('%s:%s:%s'%identifier)
 
         # create the new document
         doc = xapian.Document()
@@ -93,7 +99,7 @@
             word = match.group(0)
             if self.is_stopword(word):
                 continue
-            term = stemmer(word.lower())
+            term = stemmer(s2b(word.lower()))
             doc.add_posting(term, match.start(0))
 
         database.replace_document(identifier, doc)
@@ -114,12 +120,12 @@
         for term in [word.upper() for word in wordlist
                      if self.minlength <= len(word) <= self.maxlength]:
             if not self.is_stopword(term):
-                terms.append(stemmer(term.lower()))
+                terms.append(stemmer(s2b(term.lower())))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
         matches = enquire.get_mset(0, database.get_doccount())
 
-        return [tuple(m.document.get_data().split(':'))
+        return [tuple(b2s(m.document.get_data()).split(':'))
                 for m in matches]
 
