diff roundup/backends/indexer_dbm.py @ 2089:93f03c6714d8

A few big changes in this commit: 1. The current indexer has been moved to backends/indexer_dbm in anticipation of my writing an indexer_rdbms, 2. Changed indexer invocation during create / set to follow the pattern set by the metakit backend, which was much cleaner, and 3. The "content" property of FileClass is now mutable in all but the metakit backend. Metakit needs to be changed to support the editing of "content". Hey, and I learnt today that the metakit backend implements its own indexer. How about that... :)
author Richard Jones <richard@users.sourceforge.net>
date Fri, 19 Mar 2004 04:47:59 +0000
parents
children d530b68e4b42 49d1fd44881a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roundup/backends/indexer_dbm.py	Fri Mar 19 04:47:59 2004 +0000
@@ -0,0 +1,349 @@
+#
+# This module is derived from the module described at:
+#   http://gnosis.cx/publish/programming/charming_python_15.txt
+# 
+# Author: David Mertz (mertz@gnosis.cx)
+# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
+#            Gregory Popovitch (greg@gpy.com)
+# 
+# The original module was released under this license, and remains under
+# it:
+#
+#     This file is released to the public domain.  I (dqm) would
+#     appreciate it if you choose to keep derived works under terms
+#     that promote freedom, but obviously am giving up any rights
+#     to compel such.
+# 
+#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $
+'''This module provides an indexer class, RoundupIndexer, that stores text
+indices in a roundup instance.  This class makes searching the content of
+messages, string properties and text files possible.
+'''
+__docformat__ = 'restructuredtext'
+
+import os, shutil, re, mimetypes, marshal, zlib, errno
+from roundup.hyperdb import Link, Multilink
+
class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

          files   {identifier: (fileid, wordcount)}
          words   {word: {fileid: count}}
          fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)
    '''
    def __init__(self, db_path):
        '''Prepare index storage under db_path and detect whether a full
        reindex is required (missing index directory or outdated version
        flag file).
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9          # verbosity knob (legacy; higher == quieter)
        self.changed = 0        # true when in-memory index differs from disk

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        else:
            f = open(version)
            try:
                value = f.read()
            finally:
                f.close()
            # check the value and reindex if it's not the latest
            if value.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition: wipe the index directory, recreate
        it with a fresh version flag and mark everything as needing
        re-adding.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        f = open(os.path.join(self.indexdb_path, 'version'), 'w')
        try:
            f.write('1\n')
        finally:
            f.close()
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Return true if a full reindex has been flagged as necessary.
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.  Any text previously indexed under the same
        identifier is replaced.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique word
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}; make a reference
            # to this file for the word
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'.

        Only text/plain is handled; any other MIME type yields no words.
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        '''Split text/plain string into a list of words.
        '''
        # case insensitive
        text = str(text).upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        '''Display search results looking for [search, terms] associated
        with the hyperdb Class "klass". Ignore hits on {class: property}.

        "dre" is a helper, not an argument (currently unused; retained
        for signature compatibility).
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map linked classname -> the property on klass linking to it
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, (Link, Multilink)):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}      # this is the answer
        propspec = {}     # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}   # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # make sure the class is a linked one, otherwise ignore
            if classname not in designator_propname:
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate a copy since we
        # delete from propspec as we go)
        for propname, idset in list(propspec.items()):
            if not idset:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # BUG FIX: this formerly tested "nodeids.has_key(id)" -- the
            # builtin "id" is never a key, so every find result clobbered
            # any previously collected entry with an empty dict
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist.

        Words outside the indexed length bounds (3..24 chars) are ignored;
        any word with no hits at all means no results (fail fast).
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first word seeds the hit set
                hits = {}
                for fileid in entry.keys():
                    if fileid not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[fileid] = self.fileids[fileid]
            else:
                # Eliminate hits for every non-match (iterate a copy of
                # the keys since we delete from hits as we go)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return hits

    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
    def load_index(self, reload=0, wordlist=None):
        '''Load the index from disk.

        If wordlist is given, only the segments holding those words'
        initial characters (plus '-' and '#') are loaded.  Unless reload
        is true, an already-loaded index is not loaded again.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
                continue
            try:
                pickle_str = zlib.decompress(f.read())
            finally:
                f.close()
            dbslice = marshal.loads(pickle_str)
            if dbslice.get('WORDS'):
                # if it has some words, add them
                for word, entry in dbslice['WORDS'].items():
                    db['WORDS'][word] = entry
            if dbslice.get('FILES'):
                # if it has some files, add them
                db['FILES'] = dbslice['FILES']
            if dbslice.get('FILEIDS'):
                # if it has fileids, add them
                db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index to disk: one compressed marshal per
        word segment plus one ('-') for the file tables.  No-op if the
        index is not loaded or unchanged.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        f = open(self.indexdb + '-', 'wb')
        try:
            f.write(zlib.compress(marshal.dumps(dbfil)))
        finally:
            f.close()

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            try:
                pickle_fh.write(zlib.compress(pickle_str))
            finally:
                pickle_fh.close()
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index, dropping word
        entries that no longer reference any file.
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index (iterate a copy
        # since we may delete emptied entries as we go)
        for word, occurs in list(self.words.items()):
            if file_index in occurs:
                del occurs[file_index]
                if not occurs:
                    # no file references this word any more
                    del self.words[word]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true if the in-memory index structures exist.
        '''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
            hasattr(self, 'words'))

    def rollback(self):
        '''Discard unsaved changes by reloading the last saved index.
        '''
        self.load_index(reload=1)
+
+# vim: set filetype=python ts=4 sw=4 et si

Roundup Issue Tracker: http://roundup-tracker.org/