Mercurial > p > roundup > code
view roundup/backends/indexer_dbm.py @ 7752:b2dbab2b34bc
fix(refactor): multiple fixups using ruff linter; more testing.
Converting to using the ruff linter and its rulesets. Fixed a number
of issues.
admin.py:
sort imports
use immutable tuples as default value markers for parameters where a
None value is valid.
reduced some loops to list comprehensions for performance
used ternary to simplify some if statements
named some variables to make them less magic
(e.g. _default_savepoint_setting = 1000)
fixed some tests for argument counts: '< 2' becomes '!= 2' so that 3 is
an error.
moved exception handlers outside of loops for performance where
exception handler will abort loop anyway.
renamed variables called 'id' or 'dir' as they shadow builtin
functions.
fix translations of form _("string %s" % value) -> _("string %s") %
value so translation will be looked up with the key before
substitution.
end dicts, tuples with a trailing comma to reduce missing comma
errors if modified
simplified sorted(list(self.setting.keys())) to
sorted(self.setting.keys()) as sorted consumes whole list.
in if conditions put compared variable on left and threshold condition
on right. (no yoda conditions)
multiple noqa: suppression
removed unneeded noqa as lint rulesets are a bit different
do_get - refactor output printing logic: Use fast return if no
special formatting is requested; use isinstance with a tuple
rather than two isinstance calls; cleaned up flow and removed
comments on algorithm as it can be easily read from the code.
do_filter, do_find - refactor output printing logic. Reduce
duplicate code.
do_find - renamed variable 'value' that was set inside a loop. The
loop index variable was also named 'value'.
do_pragma - added hint to use list subcommand if setting was not
found. Replaced condition 'type(x) is bool' with 'isinstance(x,
bool)' for various types.
test_admin.py
added testing for do_list
better test coverage for do_get includes: -S and -d for multilinks,
error case for -d with non-link.
better testing for do_find including all output modes
better testing for do_filter including all output modes
fixed expected output for do_pragma that now includes hint to use
pragma list if setting not found.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Fri, 01 Mar 2024 14:53:18 -0500 |
| parents | d17e57220a62 |
| children |
line wrap: on
line source
#
# This module is derived from the module described at:
#   http://gnosis.cx/publish/programming/charming_python_15.txt
#
# Author: David Mertz (mertz@gnosis.cx)
# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
#            Gregory Popovitch (greg@gpy.com)
#
# The original module was released under this license, and remains under
# it:
#
#       This file is released to the public domain.  I (dqm) would
#       appreciate it if you choose to keep derived works under terms
#       that promote freedom, but obviously am giving up any rights
#       to compel such.
#
'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance.  This class makes searching the content of
messages, string properties and text files possible.
'''
__docformat__ = 'restructuredtext'

import errno
import marshal
import os
import re
import shutil
import zlib

from roundup.backends.indexer_common import Indexer as IndexerBase


class Indexer(IndexerBase):
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

          files   {identifier: (fileid, wordcount)}
          words   {word: {fileid: count}}
          fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)

    The structures are persisted as zlib-compressed marshal dumps in
    files named ``index.db<segment>`` inside the ``indexes`` directory
    of the tracker database directory.
    '''

    # Suffixes of the on-disk segment files.  save_index() writes the
    # 'files'/'fileids' dictionaries to the '-' segment and the word
    # dictionary to one segment per initial character; words whose
    # initial character is not listed go to the '_' segment.
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"

    def __init__(self, db):
        '''Set up index paths and force a reindex if the on-disk index
        is missing or was written by a different index version.

        db -- the database; ``db.config.DATABASE`` names the directory
              under which the ``indexes`` directory is kept.
        '''
        IndexerBase.__init__(self, db)
        self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        elif os.path.exists(version):
            with open(version) as fd:
                version = fd.read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition

        Removes the whole index directory (if present), recreates it
        with a fresh ``version`` marker file and flags the indexer as
        needing a reindex and a save.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)  # nosec - allow group write
        with open(os.path.join(self.indexdb_path, 'version'), 'w') as fd:
            fd.write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?

        Returns the flag set by force_reindex() (0 or 1).
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.

        Any existing index entries for the identifier are replaced.
        Only 'text/plain' content yields words (see splitter()).
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each indexable (non-stop) word
        filedict = {}
        for word in words:
            if self.is_stopword(word):
                continue
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}; create it for
            # words we have not seen before
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'

        Only the 'text/plain' type is handled; any other ftype yields
        an empty list.
        '''
        if ftype != 'text/plain':
            return []
        return self.text_splitter(text)

    def text_splitter(self, text):
        """Split text/plain string into a list of words

        Words are upper-cased runs of word characters whose length lies
        between self.minlength and self.maxlength (inclusive).  Empty
        or None text yields an empty list.
        """
        if not text:
            return []

        # case insensitive
        text = text.upper()

        # Split the raw text
        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength,
                                              self.maxlength),
                          text, re.UNICODE)

    # we override this to ignore too short and too long words
    # and also to fix a bug - the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist

        Words outside the indexed length bounds and stopwords are
        ignored.  Returns a list of identifiers on success.  If any
        remaining word has no index entry, or no word was usable, an
        empty dict is returned (kept as a dict, rather than a list,
        for compatibility with existing callers).

        Raises ValueError if the word index references a file id that
        is missing from the fileids table.
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        hits = None
        for word in wordlist:
            if not self.minlength <= len(word) <= self.maxlength:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            if self.is_stopword(word):
                continue
            entry = self.words.get(word)    # For each word, get index
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first usable word: seed the candidate set
                hits = {}
                for k in entry:
                    if k not in self.fileids:
                        raise ValueError(
                            'Index is corrupted: re-generate it')
                    hits[k] = self.fileids[k]
            else:
                # Eliminate hits for every non-match
                for fileid in list(hits):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return list(hits.values())

    def load_index(self, reload=0, wordlist=None):
        '''Load the index from its segment files into memory.

        reload   -- when true, load even if an index is already loaded.
        wordlist -- optional list of words; when given, only the '-'
                    and '#' segments plus the segments matching the
                    words' initial characters are read.

        Returns 0 without doing anything if the index is already loaded
        and reload is false; otherwise returns None.  Missing segment
        files are skipped; other I/O errors are re-raised.  Resets
        self.changed to 0.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                if not word:
                    # an empty word has no initial character (and is
                    # never indexed), so there is no segment to load
                    continue
                initchar = word[0].upper()
                if initchar not in self.segments:
                    initchar = '_'
                # avoid reading the same segment file more than once
                if initchar not in segments:
                    segments.append(initchar)

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
                continue
            # close the file even if reading fails
            with f:
                raw = f.read()
            dbslice = marshal.loads(zlib.decompress(raw))
            if dbslice.get('WORDS'):
                # if it has some words, add them
                db['WORDS'].update(dbslice['WORDS'])
            if dbslice.get('FILES'):
                # if it has some files, add them
                db['FILES'] = dbslice['FILES']
            if dbslice.get('FILEIDS'):
                # if it has fileids, add them
                db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index to its segment files.

        Does nothing unless the index is loaded and has changed.  All
        existing segment files are removed first, then the
        files/fileids dictionaries are written to the '-' segment and
        the word dictionary is split by initial character into one
        segment file each.  Resets self.changed to 0.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files,
                 'FILEIDS': self.fileids}
        with open(self.indexdb + '-', 'wb') as marshal_fh:
            marshal_fh.write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            if initchar not in letters:
                # if it's a unicode character, add it to the '_' segment
                initchar = '_'
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None,
                  'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            with open(filename, 'wb') as pickle_fh:
                pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index

        Does nothing if the identifier is not indexed.
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index
        for occurs in self.words.values():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true if all three index structures are in memory.'''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
                hasattr(self, 'words'))

    def rollback(self):
        ''' load last saved index info. '''
        self.load_index(reload=1)

    def close(self):
        '''Nothing to release; changes are written by save_index().'''
        pass

# vim: set filetype=python ts=4 sw=4 et si
