Mercurial > p > roundup > code
view roundup/backends/indexer_dbm.py @ 3192:eb00a2fa0e0e maint-0.8 0.8.0
pre-release stuff
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Wed, 16 Feb 2005 00:29:18 +0000 |
| parents | d530b68e4b42 |
| children | 1c063814d567 |
line wrap: on
line source
# # This module is derived from the module described at: # http://gnosis.cx/publish/programming/charming_python_15.txt # # Author: David Mertz (mertz@gnosis.cx) # Thanks to: Pat Knight (p.knight@ktgroup.co.uk) # Gregory Popovitch (greg@gpy.com) # # The original module was released under this license, and remains under # it: # # This file is released to the public domain. I (dqm) would # appreciate it if you choose to keep derived works under terms # that promote freedom, but obviously am giving up any rights # to compel such. # #$Id: indexer_dbm.py,v 1.2 2004-11-05 05:10:07 richard Exp $ '''This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of messages, string properties and text files possible. ''' __docformat__ = 'restructuredtext' import os, shutil, re, mimetypes, marshal, zlib, errno from roundup.hyperdb import Link, Multilink stopwords = [ "A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY", "FOR", "IF", "IN", "INTO", "IS", "IT", "NO", "NOT", "OF", "ON", "OR", "SUCH", "THAT", "THE", "THEIR", "THEN", "THERE", "THESE", "THEY", "THIS", "TO", "WAS", "WILL", "WITH" ] is_stopword = {} for word in stopwords: is_stopword[word] = None is_stopword = is_stopword.has_key class Indexer: '''Indexes information from roundup's hyperdb to allow efficient searching. Three structures are created by the indexer:: files {identifier: (fileid, wordcount)} words {word: {fileid: count}} fileids {fileid: identifier} where identifier is (classname, nodeid, propertyname) ''' def __init__(self, db_path): self.indexdb_path = os.path.join(db_path, 'indexes') self.indexdb = os.path.join(self.indexdb_path, 'index.db') self.reindex = 0 self.quiet = 9 self.changed = 0 # see if we need to reindex because of a change in code version = os.path.join(self.indexdb_path, 'version') if (not os.path.exists(self.indexdb_path) or not os.path.exists(version)): # for now the file itself is a flag self.force_reindex() elif os.path.exists(version): version = open(version).read() # check the value and reindex if it's not the latest if version.strip() != '1': self.force_reindex() def force_reindex(self): '''Force a reindex condition ''' if os.path.exists(self.indexdb_path): shutil.rmtree(self.indexdb_path) os.makedirs(self.indexdb_path) os.chmod(self.indexdb_path, 0775) open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') self.reindex = 1 self.changed = 1 def should_reindex(self): '''Should we reindex? ''' return self.reindex def add_text(self, identifier, text, mime_type='text/plain'): '''Add some text associated with the (classname, nodeid, property) identifier. ''' # make sure the index is loaded self.load_index() # remove old entries for this identifier if self.files.has_key(identifier): self.purge_entry(identifier) # split into words words = self.splitter(text, mime_type) # Find new file index, and assign it to identifier # (_TOP uses trick of negative to avoid conflict with file index) self.files['_TOP'] = (self.files['_TOP'][0]-1, None) file_index = abs(self.files['_TOP'][0]) self.files[identifier] = (file_index, len(words)) self.fileids[file_index] = identifier # find the unique words filedict = {} for word in words: if is_stopword(word): continue if filedict.has_key(word): filedict[word] = filedict[word]+1 else: filedict[word] = 1 # now add to the totals for word in filedict.keys(): # each word has a dict of {identifier: count} if self.words.has_key(word): entry = self.words[word] else: # new word entry = {} self.words[word] = entry # make a reference to the file for this word entry[file_index] = filedict[word] # save needed self.changed = 1 def splitter(self, text, ftype): '''Split the contents of a text string into a list of 'words' ''' if ftype == 'text/plain': words = self.text_splitter(text) else: return [] return words def text_splitter(self, text): """Split text/plain string into a list of words """ # case insensitive text = str(text).upper() # Split the raw text, losing anything longer than 25 characters # since that'll be gibberish (encoded text or somesuch) or shorter # than 3 characters since those short words appear all over the # place return re.findall(r'\b\w{2,25}\b', text) def search(self, search_terms, klass, ignore={}, dre=re.compile(r'([^\d]+)(\d+)')): '''Display search results looking for [search, terms] associated with the hyperdb Class "klass". Ignore hits on {class: property}. "dre" is a helper, not an argument. ''' # do the index lookup hits = self.find(search_terms) if not hits: return {} designator_propname = {} for nm, propclass in klass.getprops().items(): if isinstance(propclass, Link) or isinstance(propclass, Multilink): designator_propname[propclass.classname] = nm # build a dictionary of nodes and their associated messages # and files nodeids = {} # this is the answer propspec = {} # used to do the klass.find for propname in designator_propname.values(): propspec[propname] = {} # used as a set (value doesn't matter) for classname, nodeid, property in hits.values(): # skip this result if we don't care about this class/property if ignore.has_key((classname, property)): continue # if it's a property on klass, it's easy if classname == klass.classname: if not nodeids.has_key(nodeid): nodeids[nodeid] = {} continue # make sure the class is a linked one, otherwise ignore if not designator_propname.has_key(classname): continue # it's a linked class - set up to do the klass.find linkprop = designator_propname[classname] # eg, msg -> messages propspec[linkprop][nodeid] = 1 # retain only the meaningful entries for propname, idset in propspec.items(): if not idset: del propspec[propname] # klass.find tells me the klass nodeids the linked nodes relate to for resid in klass.find(**propspec): resid = str(resid) if not nodeids.has_key(id): nodeids[resid] = {} node_dict = nodeids[resid] # now figure out where it came from for linkprop in propspec.keys(): for nodeid in klass.get(resid, linkprop): if propspec[linkprop].has_key(nodeid): # OK, this node[propname] has a winner if not node_dict.has_key(linkprop): node_dict[linkprop] = [nodeid] else: node_dict[linkprop].append(nodeid) return nodeids # we override this to ignore not 2 < word < 25 and also to fix a bug - # the (fail) case. def find(self, wordlist): '''Locate files that match ALL the words in wordlist ''' if not hasattr(self, 'words'): self.load_index() self.load_index(wordlist=wordlist) entries = {} hits = None for word in wordlist: if not 2 < len(word) < 25: # word outside the bounds of what we index - ignore continue word = word.upper() entry = self.words.get(word) # For each word, get index entries[word] = entry # of matching files if not entry: # Nothing for this one word (fail) return {} if hits is None: hits = {} for k in entry.keys(): if not self.fileids.has_key(k): raise ValueError, 'Index is corrupted: re-generate it' hits[k] = self.fileids[k] else: # Eliminate hits for every non-match for fileid in hits.keys(): if not entry.has_key(fileid): del hits[fileid] if hits is None: return {} return hits segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!" def load_index(self, reload=0, wordlist=None): # Unless reload is indicated, do not load twice if self.index_loaded() and not reload: return 0 # Ok, now let's actually load it db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}} # Identify the relevant word-dictionary segments if not wordlist: segments = self.segments else: segments = ['-','#'] for word in wordlist: segments.append(word[0].upper()) # Load the segments for segment in segments: try: f = open(self.indexdb + segment, 'rb') except IOError, error: # probably just nonexistent segment index file if error.errno != errno.ENOENT: raise else: pickle_str = zlib.decompress(f.read()) f.close() dbslice = marshal.loads(pickle_str) if dbslice.get('WORDS'): # if it has some words, add them for word, entry in dbslice['WORDS'].items(): db['WORDS'][word] = entry if dbslice.get('FILES'): # if it has some files, add them db['FILES'] = dbslice['FILES'] if dbslice.get('FILEIDS'): # if it has fileids, add them db['FILEIDS'] = dbslice['FILEIDS'] self.words = db['WORDS'] self.files = db['FILES'] self.fileids = db['FILEIDS'] self.changed = 0 def save_index(self): # only save if the index is loaded and changed if not self.index_loaded() or not self.changed: return # brutal space saver... delete all the small segments for segment in self.segments: try: os.remove(self.indexdb + segment) except OSError, error: # probably just nonexistent segment index file if error.errno != errno.ENOENT: raise # First write the much simpler filename/fileid dictionaries dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids} open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil))) # The hard part is splitting the word dictionary up, of course letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_" segdicts = {} # Need batch of empty dicts for segment in letters: segdicts[segment] = {} for word, entry in self.words.items(): # Split into segment dicts initchar = word[0].upper() segdicts[initchar][word] = entry # save for initchar in letters: db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None} pickle_str = marshal.dumps(db) filename = self.indexdb + initchar pickle_fh = open(filename, 'wb') pickle_fh.write(zlib.compress(pickle_str)) os.chmod(filename, 0664) # save done self.changed = 0 def purge_entry(self, identifier): '''Remove a file from file index and word index ''' self.load_index() if not self.files.has_key(identifier): return file_index = self.files[identifier][0] del self.files[identifier] del self.fileids[file_index] # The much harder part, cleanup the word index for key, occurs in self.words.items(): if occurs.has_key(file_index): del occurs[file_index] # save needed self.changed = 1 def index_loaded(self): return (hasattr(self,'fileids') and hasattr(self,'files') and hasattr(self,'words')) def rollback(self): ''' load last saved index info. ''' self.load_index(reload=1) # vim: set filetype=python ts=4 sw=4 et si
