diff roundup/backends/indexer_dbm.py @ 2089:93f03c6714d8

A few big changes in this commit: 1. The current indexer has been moved to backends/indexer_dbm in anticipation of my writing an indexer_rdbms, 2. Changed indexer invocation during create / set to follow the pattern set by the metakit backend, which was much cleaner, and 3. The "content" property of FileClass is now mutable in all but the metakit backend. Metakit needs to be changed to support the editing of "content". Hey, and I learnt today that the metakit backend implements its own indexer. How about that... :)
author Richard Jones <richard@users.sourceforge.net>
date Fri, 19 Mar 2004 04:47:59 +0000
parents
children d530b68e4b42 49d1fd44881a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roundup/backends/indexer_dbm.py	Fri Mar 19 04:47:59 2004 +0000
@@ -0,0 +1,349 @@
+#
+# This module is derived from the module described at:
+#   http://gnosis.cx/publish/programming/charming_python_15.txt
+# 
+# Author: David Mertz (mertz@gnosis.cx)
+# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
+#            Gregory Popovitch (greg@gpy.com)
+# 
+# The original module was released under this license, and remains under
+# it:
+#
+#     This file is released to the public domain.  I (dqm) would
+#     appreciate it if you choose to keep derived works under terms
+#     that promote freedom, but obviously am giving up any rights
+#     to compel such.
+# 
+#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $
+'''This module provides an indexer class, RoundupIndexer, that stores text
+indices in a roundup instance.  This class makes searching the content of
+messages, string properties and text files possible.
+'''
+__docformat__ = 'restructuredtext'
+
+import os, shutil, re, mimetypes, marshal, zlib, errno
+from roundup.hyperdb import Link, Multilink
+
class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer::

          files   {identifier: (fileid, wordcount)}
          words   {word: {fileid: count}}
          fileids {fileid: identifier}

    where identifier is (classname, nodeid, propertyname)
    '''
    def __init__(self, db_path):
        '''Prepare index storage under db_path and detect whether a full
        reindex is required (missing index directory or outdated version
        flag file).
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        self.quiet = 9          # verbosity knob (legacy; higher == quieter)
        self.changed = 0        # true when in-memory index differs from disk

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the file itself is a flag
            self.force_reindex()
        else:
            f = open(version)
            try:
                value = f.read()
            finally:
                f.close()
            # check the value and reindex if it's not the latest
            if value.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Force a reindex condition: wipe the index directory, recreate
        it with a fresh version flag and mark everything as needing
        re-adding.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        f = open(os.path.join(self.indexdb_path, 'version'), 'w')
        try:
            f.write('1\n')
        finally:
            f.close()
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Return true if a full reindex has been flagged as necessary.
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.  Any text previously indexed under the same
        identifier is replaced.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique word
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals
        for word, count in filedict.items():
            # each word has a dict of {fileid: count}; make a reference
            # to this file for the word
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'.

        Only text/plain is handled; any other MIME type yields no words.
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        '''Split text/plain string into a list of words.
        '''
        # case insensitive
        text = str(text).upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        '''Display search results looking for [search, terms] associated
        with the hyperdb Class "klass". Ignore hits on {class: property}.

        "dre" is a helper, not an argument (currently unused; retained
        for signature compatibility).
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map linked classname -> the property on klass linking to it
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, (Link, Multilink)):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}      # this is the answer
        propspec = {}     # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}   # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # make sure the class is a linked one, otherwise ignore
            if classname not in designator_propname:
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate a copy since we
        # delete from propspec as we go)
        for propname, idset in list(propspec.items()):
            if not idset:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # BUG FIX: this formerly tested "nodeids.has_key(id)" -- the
            # builtin "id" is never a key, so every find result clobbered
            # any previously collected entry with an empty dict
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        '''Locate files that match ALL the words in wordlist.

        Words outside the indexed length bounds (3..24 chars) are ignored;
        any word with no hits at all means no results (fail fast).
        '''
        if not hasattr(self, 'words'):
            self.load_index()
        self.load_index(wordlist=wordlist)
        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first word seeds the hit set
                hits = {}
                for fileid in entry.keys():
                    if fileid not in self.fileids:
                        raise ValueError('Index is corrupted: re-generate it')
                    hits[fileid] = self.fileids[fileid]
            else:
                # Eliminate hits for every non-match (iterate a copy of
                # the keys since we delete from hits as we go)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            return {}
        return hits

    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
    def load_index(self, reload=0, wordlist=None):
        '''Load the index from disk.

        If wordlist is given, only the segments holding those words'
        initial characters (plus '-' and '#') are loaded.  Unless reload
        is true, an already-loaded index is not loaded again.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
                continue
            try:
                pickle_str = zlib.decompress(f.read())
            finally:
                f.close()
            dbslice = marshal.loads(pickle_str)
            if dbslice.get('WORDS'):
                # if it has some words, add them
                for word, entry in dbslice['WORDS'].items():
                    db['WORDS'][word] = entry
            if dbslice.get('FILES'):
                # if it has some files, add them
                db['FILES'] = dbslice['FILES']
            if dbslice.get('FILEIDS'):
                # if it has fileids, add them
                db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index to disk: one compressed marshal per
        word segment plus one ('-') for the file tables.  No-op if the
        index is not loaded or unchanged.
        '''
        # only save if the index is loaded and changed
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        f = open(self.indexdb + '-', 'wb')
        try:
            f.write(zlib.compress(marshal.dumps(dbfil)))
        finally:
            f.close()

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            pickle_fh = open(filename, 'wb')
            try:
                pickle_fh.write(zlib.compress(pickle_str))
            finally:
                pickle_fh.close()
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        '''Remove a file from file index and word index, dropping word
        entries that no longer reference any file.
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index (iterate a copy
        # since we may delete emptied entries as we go)
        for word, occurs in list(self.words.items()):
            if file_index in occurs:
                del occurs[file_index]
                if not occurs:
                    # no file references this word any more
                    del self.words[word]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true if the in-memory index structures exist.
        '''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
            hasattr(self, 'words'))

    def rollback(self):
        '''Discard unsaved changes by reloading the last saved index.
        '''
        self.load_index(reload=1)
+
+# vim: set filetype=python ts=4 sw=4 et si

Roundup Issue Tracker: http://roundup-tracker.org/