diff roundup/backends/indexer_whoosh.py @ 5096:e74c3611b138

- issue2550636, issue2550909: Added support for the Whoosh indexer. Also adds a new config.ini setting called indexer to select the indexer (a sketch of the setting is shown below); see ``doc/upgrading.txt`` for details. Initial patch by David Wolever. Patch modified (see the ticket or the summary below for changes), docs updated and committed.

  I have an outstanding issue with test/test_indexer.py: I have to comment out all imports and tests for indexers I don't have (e.g. mysql, postgres), otherwise no tests run. With that change made, the dbm, sqlite (rdbms), xapian and whoosh indexers all pass the indexer tests.

  Changes summary:

  1) Support the native back ends dbm and rdbms (the original patch only fell through to dbm).

  2) Developed a Whoosh stop filter so that stopwords, and words outside the maxlength and minlength limits defined in indexer_common.py, are not indexed. Required to pass the extremewords test in test_indexer. Also removed a call to .lower() on the input text, as the tokenizer I chose lowercases automatically.

  3) Added max/min length support to find(). This was needed to pass the extremewords test.

  4) Added back a call to save_index() in add_text(). This allowed all but two tests to pass.

  5) Fixed the call results = searcher.search(query.Term("identifier", identifier)), which had an extra parameter that is an error under current Whoosh.

  6) Set limit=None in the search call in find(), otherwise it only returns 10 items. This allowed it to pass the manyresults test.

  Also, due to changes in the roundup code, removed the "from roundup.anypy.sets_ import set" import in indexer_whoosh, since we now use the Python builtin set.
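  The new config.ini knob would be set roughly like this; the [main] section name and the value names below are assumptions based on the description above, so check ``doc/upgrading.txt`` for the authoritative option name and accepted values:

      [main]
      # choose the full-text indexer back end; assumed values:
      # native (dbm/rdbms), xapian, or whoosh
      indexer = whoosh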
author John Rouillard <rouilj@ieee.org>
date Sat, 25 Jun 2016 20:10:03 -0400
parents
children 56c9bcdea47f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roundup/backends/indexer_whoosh.py	Sat Jun 25 20:10:03 2016 -0400
@@ -0,0 +1,129 @@
+''' This implements the full-text indexer using Whoosh.
+'''
+import re, os
+
+from whoosh import fields, qparser, index, query, analysis
+
+from roundup.backends.indexer_common import Indexer as IndexerBase
+
+class Indexer(IndexerBase):
+    def __init__(self, db):
+        IndexerBase.__init__(self, db)
+        self.db_path = db.config.DATABASE
+        self.reindex = 0
+        self.writer = None
+        self.index = None
+        self.deleted = set()
+
+    def _get_index(self):
+        if self.index is None:
+            path = os.path.join(self.db_path, 'whoosh-index')
+            if not os.path.exists(path):
+                # StandardAnalyzer lowercases all words; configure it to
+                # block stopwords and words with lengths outside the
+                # self.minlength..self.maxlength range from indexer_common
+                stopfilter =  analysis.StandardAnalyzer( #stoplist=self.stopwords,
+                                                        minsize=self.minlength,
+                                                        maxsize=self.maxlength)
+                os.mkdir(path)
+                schema = fields.Schema(identifier=fields.ID(stored=True,
+                                                            unique=True),
+                                       content=fields.TEXT(analyzer=stopfilter))
+                index.create_in(path, schema)
+            self.index = index.open_dir(path)
+        return self.index
+
+    def save_index(self):
+        '''Save the changes to the index.'''
+        if not self.writer:
+            return
+        self.writer.commit()
+        self.deleted = set()
+        self.writer = None
+
+    def close(self):
+        '''close the indexing database'''
+        pass
+
+    def rollback(self):
+        if not self.writer:
+            return
+        self.writer.cancel()
+        self.deleted = set()
+        self.writer = None
+
+    def force_reindex(self):
+        '''Force a reindexing of the database.  This essentially
+        empties the tables ids and index and sets a flag so
+        that the databases are reindexed'''
+        self.reindex = 1
+
+    def should_reindex(self):
+        '''returns True if the indexes need to be rebuilt'''
+        return self.reindex
+
+    def _get_writer(self):
+        if self.writer is None:
+            self.writer = self._get_index().writer()
+        return self.writer
+
+    def _get_searcher(self):
+        return self._get_index().searcher()
+
+    def add_text(self, identifier, text, mime_type='text/plain'):
+        ''' "identifier" is  (classname, itemid, property) '''
+        if mime_type != 'text/plain':
+            return
+
+        if not text:
+            text = u''
+
+        if not isinstance(text, unicode):
+            text = unicode(text, "utf-8", "replace")
+
+        # We use the identifier twice: once in the actual "text" being
+        # indexed so we can search on it, and again as the "data" being
+        # indexed so we know what we're matching when we get results
+        identifier = u"%s:%s:%s"%identifier
+
+        # FIXME need to enhance this to handle the whoosh.store.LockError
+        # that may be raised if another process already holds the lock.
+        writer = self._get_writer()
+
+        # Whoosh gets upset if a document is deleted twice in one transaction,
+        # so we keep a list of the documents we have so far deleted to make
+        # sure that we only delete them once.
+        if identifier not in self.deleted:
+            searcher = self._get_searcher()
+            results = searcher.search(query.Term("identifier", identifier))
+            if len(results) > 0:
+                writer.delete_by_term("identifier", identifier)
+                self.deleted.add(identifier)
+
+        # Note: no explicit .lower() here; the StandardAnalyzer in the
+        # schema already lowercases the text when it is tokenized.
+        writer.add_document(identifier=identifier, content=text)
+        self.save_index()
+
+    def find(self, wordlist):
+        '''Look up all the words in the wordlist and return the matching
+        (classname, itemid, property) identifier tuples. If no usable
+        words remain after filtering, return an empty result.
+        '''
+
+        wordlist = [ word for word in wordlist
+                     if (self.minlength <= len(word) <= self.maxlength) and
+                        not self.is_stopword(word.upper()) ]
+
+        if not wordlist:
+            return {}
+
+        searcher = self._get_searcher()
+        q = query.And([ query.FuzzyTerm("content", word.lower())
+                        for word in wordlist ])
+
+        results = searcher.search(q, limit=None)
+
+        return [tuple(result["identifier"].split(':'))
+                for result in results]
+
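
A small sketch of the stop filter behaviour described in change 2) above: Whoosh's StandardAnalyzer lowercases tokens and, with minsize/maxsize set, drops stopwords and over/under-length words before they reach the index. The 2/25 limits below are illustrative values, not the actual limits defined in indexer_common:

    from whoosh import analysis

    # Same analyzer as in _get_index() above, but with made-up length limits.
    ana = analysis.StandardAnalyzer(minsize=2, maxsize=25)

    tokens = [t.text for t in
              ana(u"The Quick x Supercalifragilisticexpialidocious fox")]
    print(tokens)
    # ['quick', 'fox'] -- 'the' is a stopword, 'x' is shorter than minsize,
    # the long word exceeds maxsize, and the survivors are lowercased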

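For the FIXME about whoosh.store.LockError in add_text() above, one possible shape (not part of this patch) is to retry acquiring the writer a few times before giving up; Whoosh's writing.AsyncWriter is another option. A minimal sketch, assuming a simple sleep-and-retry policy is acceptable; the helper name is ours, not Roundup's:

    import time

    import whoosh.index
    import whoosh.store

    # The FIXME names whoosh.store.LockError; some Whoosh versions define the
    # lock exception in whoosh.index instead, so catch whichever is present.
    _LOCK_ERRORS = tuple(
        exc for exc in (getattr(whoosh.store, "LockError", None),
                        getattr(whoosh.index, "LockError", None))
        if exc is not None)

    def get_writer_with_retry(ix, retries=5, delay=0.5):
        # Opening a writer fails with a LockError when another process
        # already holds the index write lock, so wait briefly and retry.
        for attempt in range(retries):
            try:
                return ix.writer()
            except _LOCK_ERRORS:
                if attempt == retries - 1:
                    raise
                time.sleep(delay)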