view roundup/backends/indexer_whoosh.py @ 6823:fe0091279f50
Refactor session db logging and key generation for sessions/otks
While I was working on the redis sessiondb support, I noticed that
log_warning, get_logger, etc. were duplicated across the session
backends. There was also duplicated code to generate a unique key
for otks.
Changes:
- create a new sessions_common.py with a SessionsCommon class providing
  the methods log_warning, log_info, log_debug, get_logger, and
  getUniqueKey. The getUniqueKey method is closer to the method used to
  make session keys in client.py (see the sketch after this list).
- sessions_common.py now reports when random_.py chooses a weak random
  number generator; the equivalent report was removed from rest.py.
- get_logger reconciles all logging under
  roundup.hyperdb.backends.<name of BasicDatabase class>; previously
  some backends logged to the root logger.
- have BasicDatabase in the other sessions_*.py modules inherit from
  SessionsCommon.
- change logging to use the log_* methods.
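
A minimal sketch of what such a mixin could look like (illustrative
only, not the changeset's code: the method signatures, the 40-character
key length, the alphabet, and the stdlib secrets module standing in for
roundup's random_ helper are all assumptions):

```python
import logging
import secrets
import string


class SessionsCommon:
    """Shared logging and key-generation helpers for session/otk backends."""

    def get_logger(self):
        # One logger per concrete BasicDatabase subclass, under the
        # roundup.hyperdb.backends namespace instead of the root logger.
        return logging.getLogger(
            'roundup.hyperdb.backends.%s' % self.__class__.__name__)

    def log_debug(self, msg, *args):
        self.get_logger().debug(msg, *args)

    def log_info(self, msg, *args):
        self.get_logger().info(msg, *args)

    def log_warning(self, msg, *args):
        self.get_logger().warning(msg, *args)

    def getUniqueKey(self, length=40):
        # client.py-style random session key: a fixed-length string drawn
        # from a lowercase-alphanumeric alphabet.
        chars = string.ascii_lowercase + string.digits
        return ''.join(secrets.choice(chars) for _ in range(length))
```

Routing everything through get_logger is what lets every backend log
under one predictable namespace.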
In addition:
- remove unused imports reported by flake8 and make other formatting
  changes.
- modify actions.py, rest.py, and templating.py to use the getUniqueKey
  method.
- add tests for the new methods.
test_redis_session.py:
- swap out ModuleNotFoundError for ImportError to prevent a crash under
  Python 2 when redis is not present.
- allow injection of username:password or just a password into the redis
  connection URL: set the pytest_redis_pw environment variable to the
  password or user:password when running the tests (see the sketch
  below).
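
For instance, a test helper along these lines could splice those
credentials into the connection URL (the helper name and the URL
handling are illustrative assumptions; only the pytest_redis_pw variable
name comes from the commit message):

```python
import os


def redis_test_url(base='redis://localhost:6379/0'):
    # pytest_redis_pw may hold 'password' or 'user:password'.
    cred = os.environ.get('pytest_redis_pw')
    if not cred:
        return base
    if ':' not in cred:
        cred = ':' + cred  # password only: empty username before the colon
    scheme, rest = base.split('://', 1)
    return '%s://%s@%s' % (scheme, cred, rest)
```

With pytest_redis_pw=secret this yields redis://:secret@localhost:6379/0;
with pytest_redis_pw=user:secret, redis://user:secret@localhost:6379/0.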
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 07 Aug 2022 01:51:11 -0400 |
| parents | cb76bb8bfffd |
| children | |
```python
''' This implements the full-text indexer using Whoosh.
'''
import os

from whoosh import fields, qparser, index, query, analysis

from roundup.backends.indexer_common import Indexer as IndexerBase
from roundup.anypy.strings import us2u


class Indexer(IndexerBase):
    def __init__(self, db):
        IndexerBase.__init__(self, db)
        self.db_path = db.config.DATABASE
        self.reindex = 0
        self.writer = None
        self.index = None
        self.deleted = set()

    def _get_index(self):
        if self.index is None:
            path = os.path.join(self.db_path, 'whoosh-index')
            if not os.path.exists(path):
                # StandardAnalyzer lowercases all words; configure it to
                # block stopwords and words with lengths not between
                # self.minlength and self.maxlength (from indexer_common).
                stopfilter = analysis.StandardAnalyzer(
                    # stoplist=self.stopwords,
                    minsize=self.minlength,
                    maxsize=self.maxlength)
                os.mkdir(path)
                schema = fields.Schema(
                    identifier=fields.ID(stored=True, unique=True),
                    content=fields.TEXT(analyzer=stopfilter))
                index.create_in(path, schema)
            self.index = index.open_dir(path)
        return self.index

    def save_index(self):
        '''Save the changes to the index.'''
        if not self.writer:
            return
        self.writer.commit()
        self.deleted = set()
        self.writer = None

    def close(self):
        '''close the indexing database'''
        pass

    def rollback(self):
        if not self.writer:
            return
        self.writer.cancel()
        self.deleted = set()
        self.writer = None

    def force_reindex(self):
        '''Force a reindexing of the database. This essentially empties
        the tables ids and index and sets a flag so that the databases
        are reindexed'''
        self.reindex = 1

    def should_reindex(self):
        '''returns True if the indexes need to be rebuilt'''
        return self.reindex

    def _get_writer(self):
        if self.writer is None:
            self.writer = self._get_index().writer()
        return self.writer

    def _get_searcher(self):
        return self._get_index().searcher()

    def add_text(self, identifier, text, mime_type='text/plain'):
        ''' "identifier" is (classname, itemid, property) '''
        if mime_type != 'text/plain':
            return
        if not text:
            text = u''
        text = us2u(text, "replace")

        # We use the identifier twice: once in the actual "text" being
        # indexed so we can search on it, and again as the "data" being
        # indexed so we know what we're matching when we get results.
        identifier = u"%s:%s:%s" % identifier

        # FIXME: need to enhance this to handle the whoosh.store.LockError
        # that may be raised if there is already another process holding
        # the lock.
        writer = self._get_writer()

        # Whoosh gets upset if a document is deleted twice in one
        # transaction, so we keep a set of the documents we have deleted
        # so far to make sure that we only delete them once.
        if identifier not in self.deleted:
            searcher = self._get_searcher()
            results = searcher.search(query.Term("identifier", identifier))
            if len(results) > 0:
                writer.delete_by_term("identifier", identifier)
                self.deleted.add(identifier)

        # Note: use '.lower()' because it seems like Whoosh gets
        # better results that way.
        writer.add_document(identifier=identifier, content=text)
        self.save_index()

    def find(self, wordlist):
        '''look up all the words in the wordlist.
        If none are found return an empty dictionary
        * more rules here
        '''
        wordlist = [word for word in wordlist
                    if (self.minlength <= len(word) <= self.maxlength)
                    and not self.is_stopword(word.upper())]
        if not wordlist:
            return {}

        searcher = self._get_searcher()
        q = query.And([query.FuzzyTerm("content", word.lower())
                       for word in wordlist])
        results = searcher.search(q, limit=None)
        return [tuple(result["identifier"].split(':'))
                for result in results]
```
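
As a rough usage sketch of the indexer above (assuming an existing
tracker configured with the Whoosh backend; the tracker path and user
name are placeholders):

```python
from roundup import instance

# Open a tracker and its database.
tracker = instance.open('/path/to/tracker')
db = tracker.open('admin')

indexer = db.indexer  # the Indexer above when Whoosh is the backend

# Index some text for a (classname, itemid, property) triple; note that
# this backend commits the change itself, calling save_index() at the
# end of add_text().
indexer.add_text(('issue', '1', 'title'), u'whoosh indexing example')

# find() drops too-short/too-long words and stopwords, fuzzy-matches
# the rest, and returns (classname, itemid, property) tuples.
hits = indexer.find(['whoosh'])
```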
