Mercurial > p > roundup > code
diff roundup/indexer.py @ 825:0779ea9f1f18
More indexer work:
- all String properties may now be indexed too. Currently there's a bit of
"issue"-specific code in the actual searching which needs to be
addressed. In a nutshell:
+ pass 'indexme="yes"' as a String() property initialisation arg, e.g.:
    file = FileClass(db, "file", name=String(), type=String(),
        comment=String(indexme="yes"))
+ the comment will then be indexed and be searchable, with the results
related back to the issue that the file is linked to
- as a result of this work, the FileClass has a default MIME type that may
be overridden in a subclass, or by the use of a "type" property as is
done in the default templates.
- the regeneration of the indexes (if necessary) is done once the schema is
set up in the dbinit.
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Tue, 09 Jul 2002 03:02:53 +0000 |
| parents | 254b8d112eec |
| children | 6d7a45c8464a |
line wrap: on
line diff
--- a/roundup/indexer.py Tue Jul 09 01:21:24 2002 +0000 +++ b/roundup/indexer.py Tue Jul 09 03:02:53 2002 +0000 @@ -14,7 +14,7 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $ +#$Id: indexer.py,v 1.4 2002-07-09 03:02:52 richard Exp $ ''' This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of @@ -23,112 +23,44 @@ import os, shutil, re, mimetypes, marshal, zlib, errno class Indexer: - ''' Indexes messages and files. - - This implements a new splitter based on re.findall '\w+' and the - add_othertext method. + ''' Indexes information from roundup's hyperdb to allow efficient + searching. ''' def __init__(self, db_path): indexdb_path = os.path.join(db_path, 'indexes') - - # see if we need to reindex because of a change in code - if (os.path.exists(indexdb_path) and - not os.path.exists(os.path.join(indexdb_path, 'version'))): - shutil.rmtree(indexdb_path) - - # see if the index exists - index_exists = 0 - if not os.path.exists(indexdb_path): - os.makedirs(indexdb_path) - os.chmod(indexdb_path, 0775) - open(os.path.join(indexdb_path, 'version'), 'w').write('1\n') - else: - index_exists = 1 - - # save off the path to the indexdb self.indexdb = os.path.join(indexdb_path, 'index.db') self.reindex = 0 self.casesensitive = 0 self.quiet = 9 - if not index_exists: - # index everything - files_path = os.path.join(db_path, 'files') - self.add_files(dir=files_path) - self.save_index() + # see if we need to reindex because of a change in code + if (not os.path.exists(indexdb_path) or + not os.path.exists(os.path.join(indexdb_path, 'version'))): + # TODO: if the version file exists (in the future) we'll want to + # check the value in it - for now the file itself is a flag + if os.path.exists(indexdb_path): + shutil.rmtree(indexdb_path) + os.makedirs(indexdb_path) + os.chmod(indexdb_path, 0775) + 
open(os.path.join(indexdb_path, 'version'), 'w').write('1\n') - # override add_files so it's a little smarter about file types - def add_files(self, dir): - if not hasattr(self, 'files'): - self.load_index() - os.path.walk(dir, self.walk_add_file, None) - # Rebuild the fileid index - self.fileids = {} - for fname in self.files.keys(): - fileid = self.files[fname][0] - self.fileids[fileid] = fname + # we need to reindex + self.reindex = 1 + else: + self.reindex = 0 - # override add_file so it can be a little smarter about determining the - # file type - def walk_add_file(self, arg, dname, names, ftype=None): - for name in names: - name = os.path.join(dname, name) - if os.path.isfile(name): - self.add_file(name) - elif os.path.isdir(name): - os.path.walk(name, self.walk_add_file, None) - def add_file(self, fname, ftype=None): - ''' Index the contents of a regular file + def should_reindex(self): + '''Should we reindex? ''' - if not hasattr(self, 'files'): - self.load_index() - # Is file eligible for (re)indexing? 
- if self.files.has_key(fname): - if self.reindex: - # Reindexing enabled, cleanup dicts - self.purge_entry(fname, self.files, self.words) - else: - # DO NOT reindex this file - if self.quiet < 5: - print "Skipping", fname - return 0 - - # guess the file type - if ftype is None: - ftype = mimetypes.guess_type(fname) - - # read in the file - text = open(fname).read() - if self.quiet < 5: print "Indexing", fname - words = self.splitter(text, ftype) + return self.reindex - # Find new file index, and assign it to filename - # (_TOP uses trick of negative to avoid conflict with file index) - self.files['_TOP'] = (self.files['_TOP'][0]-1, None) - file_index = abs(self.files['_TOP'][0]) - self.files[fname] = (file_index, len(words)) - - filedict = {} - for word in words: - if filedict.has_key(word): - filedict[word] = filedict[word]+1 - else: - filedict[word] = 1 + def add_text(self, identifier, text, mime_type='text/plain'): + ''' Add some text associated with the (classname, nodeid, property) + identifier. + ''' + # make sure the index is loaded + self.load_index() - for word in filedict.keys(): - if self.words.has_key(word): - entry = self.words[word] - else: - entry = {} - entry[file_index] = filedict[word] - self.words[word] = entry - - # NOTE: this method signature deviates from the one specified in - # indexer - I'm not entirely sure where it was expected to the text - # from otherwise... - def add_othertext(self, identifier, text): - ''' Add some text associated with the identifier - ''' # Is file eligible for (re)indexing? 
if self.files.has_key(identifier): # Reindexing enabled, cleanup dicts @@ -141,7 +73,7 @@ return 0 # split into words - words = self.splitter(text, 'text/plain') + words = self.splitter(text, mime_type) # Find new file index, and assign it to identifier # (_TOP uses trick of negative to avoid conflict with file index) @@ -174,7 +106,7 @@ def splitter(self, text, ftype): ''' Split the contents of a text string into a list of 'words' ''' - if ftype in ('text/plain', 'message/rfc822'): + if ftype == 'text/plain': words = self.text_splitter(text, self.casesensitive) else: return [] @@ -193,37 +125,49 @@ # place return re.findall(r'\b\w{2,25}\b', text) - def search(self, search_terms, klass): - ''' display search results + def search(self, search_terms, klass, ignore={}, + dre=re.compile(r'([^\d]+)(\d+)')): + ''' Display search results looking for [search, terms] associated + with the hyperdb Class "klass". Ignore hits on {class: property}. + + "dre" is a helper, not an argument. ''' + # do the index lookup hits = self.find(search_terms) - links = [] - nodeids = {} + if not hits: + return {} + + # this is specific to "issue" klass ... 
eugh designator_propname = {'msg': 'messages', 'file': 'files'} - if hits: - hitcount = len(hits) - # build a dictionary of nodes and their associated messages - # and files - for hit in hits.keys(): - filename = hits[hit].split('/')[-1] - for designator, propname in designator_propname.items(): - if not filename.startswith(designator): - continue - nodeid = filename[len(designator):] - result = apply(klass.find, (), {propname:nodeid}) - if not result: - continue + + # build a dictionary of nodes and their associated messages + # and files + nodeids = {} + for classname, nodeid, property in hits.values(): + # skip this result if we don't care about this class/property + if ignore.has_key((classname, property)): + continue + + # if it's a property on klass, it's easy + if classname == klass.classname: + if not nodeids.has_key(nodeid): + nodeids[nodeid] = {} + continue - id = str(result[0]) - if not nodeids.has_key(id): - nodeids[id] = {} + # it's a linked class - find the klass entries that are + # linked to it + linkprop = designator_propname[classname] + for resid in klass.find(**{linkprop: nodeid}): + resid = str(resid) + if not nodeids.has_key(id): + nodeids[resid] = {} - node_dict = nodeids[id] - if not node_dict.has_key(propname): - node_dict[propname] = [nodeid] - elif node_dict.has_key(propname): - node_dict[propname].append(nodeid) - + # update the links for this klass nodeid + node_dict = nodeids[resid] + if not node_dict.has_key(linkprop): + node_dict[linkprop] = [nodeid] + elif node_dict.has_key(linkprop): + node_dict[linkprop].append(nodeid) return nodeids # we override this to ignore not 2 < word < 25 and also to fix a bug - @@ -303,6 +247,9 @@ self.fileids = db['FILEIDS'] def save_index(self): + # make sure we're loaded + self.load_index() + # brutal space saver... 
delete all the small segments for segment in self.segments: try: @@ -354,6 +301,15 @@ # #$Log: not supported by cvs2svn $ +#Revision 1.3 2002/07/08 06:58:15 richard +#cleaned up the indexer code: +# - it splits more words out (much simpler, faster splitter) +# - removed code we'll never use (roundup.roundup_indexer has the full +# implementation, and replaces roundup.indexer) +# - only index text/plain and rfc822/message (ideas for other text formats to +# index are welcome) +# - added simple unit test for indexer. Needs more tests for regression. +# #Revision 1.2 2002/05/25 07:16:24 rochecompaan #Merged search_indexing-branch with HEAD #
