Mercurial > p > roundup > code
changeset 683:7f5b51ffe92d search_indexing-0-4-2-branch
Removed dos control characters.
| author | Roche Compaan <rochecompaan@users.sourceforge.net> |
|---|---|
| date | Wed, 03 Apr 2002 12:05:15 +0000 |
| parents | b4d13f7cc6c4 |
| children | 3d8ce8e2dcee |
| files | roundup/indexer.py |
| diffstat | 1 files changed, 787 insertions(+), 784 deletions(-) [+] |
line wrap: on
line diff
--- a/roundup/indexer.py Wed Apr 03 12:01:55 2002 +0000 +++ b/roundup/indexer.py Wed Apr 03 12:05:15 2002 +0000 @@ -1,784 +1,787 @@ -#!/usr/bin/env python - -"""Create full-text indexes and search them - -Notes: - - See http://gnosis.cx/publish/programming/charming_python_15.txt - for a detailed discussion of this module. - - This version requires Python 1.6+. It turns out that the use - of string methods rather than [string] module functions is - enough faster in a tight loop so as to provide a quite - remarkable 25% speedup in overall indexing. However, only FOUR - lines in TextSplitter.text_splitter() were changed away from - Python 1.5 compatibility. Those lines are followed by comments - beginning with "# 1.52: " that show the old forms. Python - 1.5 users can restore these lines, and comment out those just - above them. - -Classes: - - GenericIndexer -- Abstract class - TextSplitter -- Mixin class - Index - ShelveIndexer - FlatIndexer - XMLPickleIndexer - PickleIndexer - ZPickleIndexer - SlicedZPickleIndexer - -Functions: - - echo_fname(fname) - recurse_files(...) - -Index Formats: - - *Indexer.files: filename --> (fileid, wordcount) - *Indexer.fileids: fileid --> filename - *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...} - -Module Usage: - - There are a few ways to use this module. Just to utilize existing - functionality, something like the following is a likely - pattern: - - import gnosis.indexer as indexer - index = indexer.MyFavoriteIndexer() # For some concrete Indexer - index.load_index('myIndex.db') - index.add_files(dir='/this/that/otherdir', pattern='*.txt') - hits = index.find(['spam','eggs','bacon']) - index.print_report(hits) - - To customize the basic classes, something like the following is likely: - - class MySplitter: - def splitter(self, text, ftype): - "Peform much better splitting than default (for filetypes)" - # ... - return words - - class MyIndexer(indexer.GenericIndexer, MySplitter): - def load_index(self, INDEXDB=None): - "Retrieve three dictionaries from clever storage method" - # ... - self.words, self.files, self.fileids = WORDS, FILES, FILEIDS - def save_index(self, INDEXDB=None): - "Save three dictionaries to clever storage method" - - index = MyIndexer() - # ...etc... - -Benchmarks: - - As we know, there are lies, damn lies, and benchmarks. Take - the below with an adequate dose of salt. In version 0.10 of - the concrete indexers, some performance was tested. The - test case was a set of mail/news archives, that were about - 43 mB, and 225 files. In each case, an index was generated - (if possible), and a search for the words "xml python" was - performed. - - - Index w/ PickleIndexer: 482s, 2.4 mB - - Search w/ PickleIndexer: 1.74s - - Index w/ ZPickleIndexer: 484s, 1.2 mB - - Search w/ ZPickleIndexer: 1.77s - - Index w/ FlatIndexer: 492s, 2.6 mB - - Search w/ FlatIndexer: 53s - - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs - - Search w/ ShelveIndexer: Aborted before completely indexed - - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB - - Search w/ ShelveIndexer: N/A. Too many glitches - - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string - composition for large output) - - Search w/ XMLPickleIndexer: N/A - - grep search (xml|python): 20s (cached: <5s) - - 'srch' utility (python): 12s -""" -#$Id: indexer.py,v 1.1.2.2 2002-04-03 12:01:55 rochecompaan Exp $ - -__shell_usage__ = """ -Shell Usage: [python] indexer.py [options] [search_words] - - -h, /h, -?, /?, ?, --help: Show this help screen - -index: Add files to index - -reindex: Refresh files already in the index - (can take much more time) - -casesensitive: Maintain the case of indexed words - (can lead to MUCH larger indices) - -norecurse, -local: Only index starting dir, not subdirs - -dir=<directory>: Starting directory for indexing - (default is current directory) - -indexdb=<database>: Use specified index database - (environ variable INDEXER_DB is preferred) - -regex=<pattern>: Index files matching regular expression - -glob=<pattern>: Index files matching glob pattern - -filter=<pattern> Only display results matching pattern - -output=<op>, -format=<opt>: How much detail on matches? - -<digit>: Quiet level (0=verbose ... 9=quiet) - -Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES, -FILENAMES/NAMES/FILES, SUMMARY/REPORT""" - -__version__ = "$Revision: 1.1.2.2 $" -__author__=["David Mertz (mertz@gnosis.cx)",] -__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)", - "Gregory Popovitch (greg@gpy.com)", ] -__copyright__=""" - This file is released to the public domain. I (dqm) would - appreciate it if you choose to keep derived works under terms - that promote freedom, but obviously am giving up any rights - to compel such. -""" - -__history__=""" - 0.1 Initial version. - - 0.11 Tweaked TextSplitter after some random experimentation. - - 0.12 Added SlicedZPickleIndexer (best choice, so far). - - 0.13 Pat Knight pointed out need for binary open()'s of - certain files under Windows. - - 0.14 Added '-filter' switch to search results. - - 0.15 Added direct read of gzip files - - 0.20 Gregory Popovitch did some profiling on TextSplitter, - and provided both huge speedups to the Python version - and hooks to a C extension class (ZopeTextSplitter). - A little refactoring by he and I (dqm) has nearly - doubled the speed of indexing - - 0.30 Module refactored into gnosis package. This is a - first pass, and various documentation and test cases - should be added later. -""" -import string, re, os, fnmatch, sys, copy, gzip -from types import * - -#-- Silly "do nothing" default recursive file processor -def echo_fname(fname): print fname - -#-- "Recurse and process files" utility function -def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw): - "Recursively process file pattern" - subdirs, files = [],[] - level = kw.get('level',0) - - for name in os.listdir(curdir): - fname = os.path.join(curdir, name) - if name[-4:] in exclusions: - pass # do not include binary file type - elif os.path.isdir(fname) and not os.path.islink(fname): - subdirs.append(fname) - # kludge to detect a regular expression across python versions - elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject): - if pattern.match(name): - files.append(fname) - elif sys.version[0]=='2' and type(pattern)==type(re.compile('')): - if pattern.match(name): - files.append(fname) - elif type(pattern) is StringType: - if fnmatch.fnmatch(name, pattern): - files.append(fname) - - for fname in files: - apply(func, (fname,)+args) - for subdir in subdirs: - recurse_files(subdir, pattern, exclusions, func, level=level+1) - -#-- Data bundle for index dictionaries -class Index: - def __init__(self, words, files, fileids): - if words is not None: self.WORDS = words - if files is not None: self.FILES = files - if fileids is not None: self.FILEIDS = fileids - -#-- "Split plain text into words" utility function -class TextSplitter: - def initSplitter(self): - prenum = string.join(map(chr, range(0,48)), '') - num2cap = string.join(map(chr, range(58,65)), '') - cap2low = string.join(map(chr, range(91,97)), '') - postlow = string.join(map(chr, range(123,256)), '') - nonword = prenum + num2cap + cap2low + postlow - self.word_only = string.maketrans(nonword, " "*len(nonword)) - self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '') - self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '') - self.ident = string.join(map(chr, range(256)), '') - self.init = 1 - - def splitter(self, text, ftype): - "Split the contents of a text string into a list of 'words'" - if ftype == 'text/plain': - words = self.text_splitter(text, self.casesensitive) - else: - raise NotImplementedError - return words - - def text_splitter(self, text, casesensitive=0): - """Split text/plain string into a list of words - - In version 0.20 this function is still fairly weak at - identifying "real" words, and excluding gibberish - strings. As long as the indexer looks at "real" text - files, it does pretty well; but if indexing of binary - data is attempted, a lot of gibberish gets indexed. - Suggestions on improving this are GREATLY APPRECIATED. - """ - # Initialize some constants - if not hasattr(self,'init'): self.initSplitter() - - # Speedup trick: attributes into local scope - word_only = self.word_only - ident = self.ident - alpha = self.alpha - nondigits = self.nondigits - translate = string.translate - - # Let's adjust case if not case-sensitive - if not casesensitive: text = string.upper(text) - - # Split the raw text - allwords = string.split(text) - - # Finally, let's skip some words not worth indexing - words = [] - for word in allwords: - if len(word) > 25: continue # too long (probably gibberish) - - # Identify common patterns in non-word data (binary, UU/MIME, etc) - num_nonalpha = len(word.translate(ident, alpha)) - numdigits = len(word.translate(ident, nondigits)) - # 1.52: num_nonalpha = len(translate(word, ident, alpha)) - # 1.52: numdigits = len(translate(word, ident, nondigits)) - if numdigits > len(word)-2: # almost all digits - if numdigits > 5: # too many digits is gibberish - continue # a moderate number is year/zipcode/etc - elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish - continue - - word = word.translate(word_only) # Let's strip funny byte values - # 1.52: word = translate(word, word_only) - subwords = word.split() # maybe embedded non-alphanumeric - # 1.52: subwords = string.split(word) - for subword in subwords: # ...so we might have subwords - if len(subword) <= 2: continue # too short a subword - words.append(subword) - return words - -class ZopeTextSplitter: - def initSplitter(self): - import Splitter - stop_words=( - 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across', - 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', - 'along', 'already', 'also', 'although', 'always', 'am', 'among', - 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', - 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', - 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', - 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', - 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', - 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could', - 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due', - 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', - 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', - 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', - 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', - 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', - 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', - 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', - 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', - 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', - 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', - 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', - 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', - 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', - 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', - 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', - 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', - 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps', - 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem', - 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', - 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', - 'somehow', 'someone', 'something', 'sometime', 'sometimes', - 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', - 'their', 'them', 'themselves', 'then', 'thence', 'there', - 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', - 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', - 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', - 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', - 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', - 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', - 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', - 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', - 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', - 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', - ) - self.stop_word_dict={} - for word in stop_words: self.stop_word_dict[word]=None - self.splitterobj = Splitter.getSplitter() - self.init = 1 - - def goodword(self, word): - return len(word) < 25 - - def splitter(self, text, ftype): - """never case-sensitive""" - if not hasattr(self,'init'): self.initSplitter() - return filter(self.goodword, self.splitterobj(text, self.stop_word_dict)) - - -#-- "Abstract" parent class for inherited indexers -# (does not handle storage in parent, other methods are primitive) - -class GenericIndexer: - def __init__(self, **kw): - apply(self.configure, (), kw) - - def whoami(self): - return self.__class__.__name__ - - def configure(self, REINDEX=0, CASESENSITIVE=0, - INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'), - ADD_PATTERN='*', QUIET=5): - "Configure settings used by indexing and storage/retrieval" - self.indexdb = INDEXDB - self.reindex = REINDEX - self.casesensitive = CASESENSITIVE - self.add_pattern = ADD_PATTERN - self.quiet = QUIET - self.filter = None - - def add_files(self, dir=os.getcwd(), pattern=None, descend=1): - self.load_index() - exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir') - if not pattern: - pattern = self.add_pattern - recurse_files(dir, pattern, exclusions, self.add_file) - # Rebuild the fileid index - self.fileids = {} - for fname in self.files.keys(): - fileid = self.files[fname][0] - self.fileids[fileid] = fname - - def add_file(self, fname, ftype='text/plain'): - "Index the contents of a regular file" - if self.files.has_key(fname): # Is file eligible for (re)indexing? - if self.reindex: # Reindexing enabled, cleanup dicts - self.purge_entry(fname, self.files, self.words) - else: # DO NOT reindex this file - if self.quiet < 5: print "Skipping", fname - return 0 - - # Read in the file (if possible) - try: - if fname[-3:] == '.gz': - text = gzip.open(fname).read() - else: - text = open(fname).read() - if self.quiet < 5: print "Indexing", fname - except IOError: - return 0 - words = self.splitter(text, ftype) - - # Find new file index, and assign it to filename - # (_TOP uses trick of negative to avoid conflict with file index) - self.files['_TOP'] = (self.files['_TOP'][0]-1, None) - file_index = abs(self.files['_TOP'][0]) - self.files[fname] = (file_index, len(words)) - - filedict = {} - for word in words: - if filedict.has_key(word): - filedict[word] = filedict[word]+1 - else: - filedict[word] = 1 - - for word in filedict.keys(): - if self.words.has_key(word): - entry = self.words[word] - else: - entry = {} - entry[file_index] = filedict[word] - self.words[word] = entry - - def add_othertext(self, identifier): - """Index a textual source other than a plain file - - A child class might want to implement this method (or a similar one) - in order to index textual sources such as SQL tables, URLs, clay - tablets, or whatever else. The identifier should uniquely pick out - the source of the text (whatever it is) - """ - raise NotImplementedError - - def save_index(self, INDEXDB=None): - raise NotImplementedError - - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - raise NotImplementedError - - def find(self, wordlist, print_report=0): - "Locate files that match ALL the words in wordlist" - self.load_index(wordlist=wordlist) - entries = {} - hits = copy.copy(self.fileids) # Copy of fileids index - for word in wordlist: - if not self.casesensitive: - word = string.upper(word) - entry = self.words.get(word) # For each word, get index - entries[word] = entry # of matching files - if not entry: # Nothing for this one word (fail) - return 0 - for fileid in hits.keys(): # Eliminate hits for every non-match - if not entry.has_key(fileid): - del hits[fileid] - if print_report: - self.print_report(hits, wordlist, entries) - return hits - - def print_report(self, hits={}, wordlist=[], entries={}): - # Figure out what to actually print (based on QUIET level) - output = [] - for fileid,fname in hits.items(): - message = fname - if self.quiet <= 3: - wordcount = self.files[fname][1] - matches = 0 - countmess = '\n'+' '*13+`wordcount`+' words; ' - for word in wordlist: - if not self.casesensitive: - word = string.upper(word) - occurs = entries[word][fileid] - matches = matches+occurs - countmess = countmess +`occurs`+' '+word+'; ' - message = string.ljust('[RATING: ' - +`1000*matches/wordcount`+']',13)+message - if self.quiet <= 2: message = message +countmess +'\n' - if self.filter: # Using an output filter - if fnmatch.fnmatch(message, self.filter): - output.append(message) - else: - output.append(message) - - if self.quiet <= 5: - print string.join(output,'\n') - sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+ - `wordlist`+'\n') - return output - - def purge_entry(self, fname, file_dct, word_dct): - "Remove a file from file index and word index" - try: # The easy part, cleanup the file index - file_index = file_dct[fname] - del file_dct[fname] - except KeyError: - pass # We'll assume we only encounter KeyError's - # The much harder part, cleanup the word index - for word, occurs in word_dct.items(): - if occurs.has_key(file_index): - del occurs[file_index] - word_dct[word] = occurs - - def index_loaded(self): - return ( hasattr(self,'fileids') and - hasattr(self,'files') and - hasattr(self,'words') ) - -#-- Provide an actual storage facility for the indexes (i.e. shelve) -class ShelveIndexer(GenericIndexer, TextSplitter): - """Concrete Indexer utilizing [shelve] for storage - - Unfortunately, [shelve] proves far too slow in indexing, while - creating monstrously large indexes. Not recommend, at least under - the default dbm's tested. Also, class may be broken because - shelves do not, apparently, support the .values() and .items() - methods. Fixing this is a low priority, but the sample code is - left here. - """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - INDEXDB = INDEXDB or self.indexdb - import shelve - self.words = shelve.open(INDEXDB+".WORDS") - self.files = shelve.open(INDEXDB+".FILES") - self.fileids = shelve.open(INDEXDB+".FILEIDS") - if not FILES: # New index - self.files['_TOP'] = (0,None) - - def save_index(self, INDEXDB=None): - INDEXDB = INDEXDB or self.indexdb - pass - -class FlatIndexer(GenericIndexer, TextSplitter): - """Concrete Indexer utilizing flat-file for storage - - See the comments in the referenced article for details; in - brief, this indexer has about the same timing as the best in - -creating- indexes and the storage requirements are - reasonable. However, actually -using- a flat-file index is - more than an order of magnitude worse than the best indexer - (ZPickleIndexer wins overall). - - On the other hand, FlatIndexer creates a wonderfully easy to - parse database format if you have a reason to transport the - index to a different platform or programming language. And - should you perform indexing as part of a long-running - process, the overhead of initial file parsing becomes - irrelevant. - """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - INDEXDB = INDEXDB or self.indexdb - self.words = {} - self.files = {'_TOP':(0,None)} - self.fileids = {} - try: # Read index contents - for line in open(INDEXDB).readlines(): - fields = string.split(line) - if fields[0] == '-': # Read a file/fileid line - fileid = eval(fields[2]) - wordcount = eval(fields[3]) - fname = fields[1] - self.files[fname] = (fileid, wordcount) - self.fileids[fileid] = fname - else: # Read a word entry (dict of hits) - entries = {} - word = fields[0] - for n in range(1,len(fields),2): - fileid = eval(fields[n]) - occurs = eval(fields[n+1]) - entries[fileid] = occurs - self.words[word] = entries - except: - pass # New index - - def save_index(self, INDEXDB=None): - INDEXDB = INDEXDB or self.indexdb - tab, lf, sp = '\t','\n',' ' - indexdb = open(INDEXDB,'w') - for fname,entry in self.files.items(): - indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf) - for word,entry in self.words.items(): - indexdb.write(word +tab+tab) - for fileid,occurs in entry.items(): - indexdb.write(`fileid` +sp +`occurs` +sp) - indexdb.write(lf) - -class PickleIndexer(GenericIndexer, TextSplitter): - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - import cPickle - INDEXDB = INDEXDB or self.indexdb - try: - pickle_str = open(INDEXDB,'rb').read() - db = cPickle.loads(pickle_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - import cPickle - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - open(INDEXDB,'wb').write(cPickle.dumps(db, 1)) - -class XMLPickleIndexer(PickleIndexer): - """Concrete Indexer utilizing XML for storage - - While this is, as expected, a verbose format, the possibility - of using XML as a transport format for indexes might be - useful. However, [xml_pickle] is in need of some redesign to - avoid gross inefficiency when creating very large - (multi-megabyte) output files (fixed in [xml_pickle] version - 0.48 or above) - """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - from gnosis.xml.pickle import XML_Pickler - INDEXDB = INDEXDB or self.indexdb - try: # XML file exists - xml_str = open(INDEXDB).read() - db = XML_Pickler().loads(xml_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - from gnosis.xml.pickle import XML_Pickler - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - open(INDEXDB,'w').write(XML_Pickler(db).dumps()) - -class ZPickleIndexer(PickleIndexer): - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - try: - pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read()) - db = cPickle.loads(pickle_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - pickle_fh = open(INDEXDB+'!','wb') - pickle_fh.write(zlib.compress(cPickle.dumps(db, 1))) - - -class SlicedZPickleIndexer(ZPickleIndexer): - segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!" - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - db = Index({}, {'_TOP':(0,None)}, {}) - # Identify the relevant word-dictionary segments - if not wordlist: - segments = self.segments - else: - segments = ['-','#'] - for word in wordlist: - segments.append(string.upper(word[0])) - # Load the segments - for segment in segments: - try: - pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read()) - dbslice = cPickle.loads(pickle_str) - if dbslice.__dict__.get('WORDS'): # If it has some words, add them - for word,entry in dbslice.WORDS.items(): - db.WORDS[word] = entry - if dbslice.__dict__.get('FILES'): # If it has some files, add them - db.FILES = dbslice.FILES - if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them - db.FILEIDS = dbslice.FILEIDS - except: - pass # No biggie, couldn't find this segment - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def julienne(self, INDEXDB=None): - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - segments = self.segments # all the (little) indexes - for segment in segments: - try: # brutal space saver... delete all the small segments - os.remove(INDEXDB+segment) - except OSError: - pass # probably just nonexistent segment index file - # First write the much simpler filename/fileid dictionaries - dbfil = Index(None, self.files, self.fileids) - open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1))) - # The hard part is splitting the word dictionary up, of course - letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - segdicts = {} # Need batch of empty dicts - for segment in letters+'#': - segdicts[segment] = {} - for word, entry in self.words.items(): # Split into segment dicts - initchar = string.upper(word[0]) - if initchar in letters: - segdicts[initchar][word] = entry - else: - segdicts['#'][word] = entry - for initchar in letters+'#': - db = Index(segdicts[initchar], None, None) - pickle_str = cPickle.dumps(db, 1) - filename = INDEXDB+initchar - pickle_fh = open(filename,'wb') - pickle_fh.write(zlib.compress(pickle_str)) - os.chmod(filename,0664) - - save_index = julienne - -PreferredIndexer = SlicedZPickleIndexer - -#-- If called from command-line, parse arguments and take actions -if __name__ == '__main__': - import time - start = time.time() - search_words = [] # Word search list (if specified) - opts = 0 # Any options specified? - if len(sys.argv) < 2: - pass # No options given - else: - upper = string.upper - dir = os.getcwd() # Default to indexing from current directory - descend = 1 # Default to recursive indexing - ndx = PreferredIndexer() - for opt in sys.argv[1:]: - if opt in ('-h','/h','-?','/?','?','--help'): # help screen - print __shell_usage__ - opts = -1 - break - elif opt[0] in '/-': # a switch! - opts = opts+1 - if upper(opt[1:]) == 'INDEX': # Index files - ndx.quiet = 0 - pass # Use defaults if no other options - elif upper(opt[1:]) == 'REINDEX': # Reindex - ndx.reindex = 1 - elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive - ndx.casesensitive = 1 - elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion - descend = 0 - elif upper(opt[1:4]) == 'DIR': # Dir to index - dir = opt[5:] - elif upper(opt[1:8]) == 'INDEXDB': # Index specified - ndx.indexdb = opt[9:] - sys.stderr.write( - "Use of INDEXER_DB environment variable is STRONGLY recommended.\n") - elif upper(opt[1:6]) == 'REGEX': # RegEx files to index - ndx.add_pattern = re.compile(opt[7:]) - elif upper(opt[1:5]) == 'GLOB': # Glob files to index - ndx.add_pattern = opt[6:] - elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look? - opts = opts-1 # this is not an option for indexing purposes - level = upper(opt[8:]) - if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'): - ndx.quiet = 0 - elif level in ('RATINGS','SCORES','HIGH'): - ndx.quiet = 3 - elif level in ('FILENAMES','NAMES','FILES','MID'): - ndx.quiet = 5 - elif level in ('SUMMARY','MIN'): - ndx.quiet = 9 - elif upper(opt[1:7]) == 'FILTER': # Regex filter output - opts = opts-1 # this is not an option for indexing purposes - ndx.filter = opt[8:] - elif opt[1:] in string.digits: - opts = opts-1 - ndx.quiet = eval(opt[1]) - else: - search_words.append(opt) # Search words - - if opts > 0: - ndx.add_files(dir=dir) - ndx.save_index() - if search_words: - ndx.find(search_words, print_report=1) - if not opts and not search_words: - sys.stderr.write("Perhaps you would like to use the --help option?\n") - else: - sys.stderr.write('Processed in %.3f seconds (%s)' - % (time.time()-start, ndx.whoami())) - -# -#$Log: not supported by cvs2svn $ +#!/usr/bin/env python + +"""Create full-text indexes and search them + +Notes: + + See http://gnosis.cx/publish/programming/charming_python_15.txt + for a detailed discussion of this module. + + This version requires Python 1.6+. It turns out that the use + of string methods rather than [string] module functions is + enough faster in a tight loop so as to provide a quite + remarkable 25% speedup in overall indexing. However, only FOUR + lines in TextSplitter.text_splitter() were changed away from + Python 1.5 compatibility. Those lines are followed by comments + beginning with "# 1.52: " that show the old forms. Python + 1.5 users can restore these lines, and comment out those just + above them. + +Classes: + + GenericIndexer -- Abstract class + TextSplitter -- Mixin class + Index + ShelveIndexer + FlatIndexer + XMLPickleIndexer + PickleIndexer + ZPickleIndexer + SlicedZPickleIndexer + +Functions: + + echo_fname(fname) + recurse_files(...) + +Index Formats: + + *Indexer.files: filename --> (fileid, wordcount) + *Indexer.fileids: fileid --> filename + *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...} + +Module Usage: + + There are a few ways to use this module. Just to utilize existing + functionality, something like the following is a likely + pattern: + + import gnosis.indexer as indexer + index = indexer.MyFavoriteIndexer() # For some concrete Indexer + index.load_index('myIndex.db') + index.add_files(dir='/this/that/otherdir', pattern='*.txt') + hits = index.find(['spam','eggs','bacon']) + index.print_report(hits) + + To customize the basic classes, something like the following is likely: + + class MySplitter: + def splitter(self, text, ftype): + "Peform much better splitting than default (for filetypes)" + # ... + return words + + class MyIndexer(indexer.GenericIndexer, MySplitter): + def load_index(self, INDEXDB=None): + "Retrieve three dictionaries from clever storage method" + # ... + self.words, self.files, self.fileids = WORDS, FILES, FILEIDS + def save_index(self, INDEXDB=None): + "Save three dictionaries to clever storage method" + + index = MyIndexer() + # ...etc... + +Benchmarks: + + As we know, there are lies, damn lies, and benchmarks. Take + the below with an adequate dose of salt. In version 0.10 of + the concrete indexers, some performance was tested. The + test case was a set of mail/news archives, that were about + 43 mB, and 225 files. In each case, an index was generated + (if possible), and a search for the words "xml python" was + performed. + + - Index w/ PickleIndexer: 482s, 2.4 mB + - Search w/ PickleIndexer: 1.74s + - Index w/ ZPickleIndexer: 484s, 1.2 mB + - Search w/ ZPickleIndexer: 1.77s + - Index w/ FlatIndexer: 492s, 2.6 mB + - Search w/ FlatIndexer: 53s + - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs + - Search w/ ShelveIndexer: Aborted before completely indexed + - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB + - Search w/ ShelveIndexer: N/A. Too many glitches + - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string + composition for large output) + - Search w/ XMLPickleIndexer: N/A + - grep search (xml|python): 20s (cached: <5s) + - 'srch' utility (python): 12s +""" +#$Id: indexer.py,v 1.1.2.3 2002-04-03 12:05:15 rochecompaan Exp $ + +__shell_usage__ = """ +Shell Usage: [python] indexer.py [options] [search_words] + + -h, /h, -?, /?, ?, --help: Show this help screen + -index: Add files to index + -reindex: Refresh files already in the index + (can take much more time) + -casesensitive: Maintain the case of indexed words + (can lead to MUCH larger indices) + -norecurse, -local: Only index starting dir, not subdirs + -dir=<directory>: Starting directory for indexing + (default is current directory) + -indexdb=<database>: Use specified index database + (environ variable INDEXER_DB is preferred) + -regex=<pattern>: Index files matching regular expression + -glob=<pattern>: Index files matching glob pattern + -filter=<pattern> Only display results matching pattern + -output=<op>, -format=<opt>: How much detail on matches? + -<digit>: Quiet level (0=verbose ... 9=quiet) + +Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES, +FILENAMES/NAMES/FILES, SUMMARY/REPORT""" + +__version__ = "$Revision: 1.1.2.3 $" +__author__=["David Mertz (mertz@gnosis.cx)",] +__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)", + "Gregory Popovitch (greg@gpy.com)", ] +__copyright__=""" + This file is released to the public domain. I (dqm) would + appreciate it if you choose to keep derived works under terms + that promote freedom, but obviously am giving up any rights + to compel such. +""" + +__history__=""" + 0.1 Initial version. + + 0.11 Tweaked TextSplitter after some random experimentation. + + 0.12 Added SlicedZPickleIndexer (best choice, so far). + + 0.13 Pat Knight pointed out need for binary open()'s of + certain files under Windows. + + 0.14 Added '-filter' switch to search results. + + 0.15 Added direct read of gzip files + + 0.20 Gregory Popovitch did some profiling on TextSplitter, + and provided both huge speedups to the Python version + and hooks to a C extension class (ZopeTextSplitter). + A little refactoring by he and I (dqm) has nearly + doubled the speed of indexing + + 0.30 Module refactored into gnosis package. This is a + first pass, and various documentation and test cases + should be added later. +""" +import string, re, os, fnmatch, sys, copy, gzip +from types import * + +#-- Silly "do nothing" default recursive file processor +def echo_fname(fname): print fname + +#-- "Recurse and process files" utility function +def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw): + "Recursively process file pattern" + subdirs, files = [],[] + level = kw.get('level',0) + + for name in os.listdir(curdir): + fname = os.path.join(curdir, name) + if name[-4:] in exclusions: + pass # do not include binary file type + elif os.path.isdir(fname) and not os.path.islink(fname): + subdirs.append(fname) + # kludge to detect a regular expression across python versions + elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject): + if pattern.match(name): + files.append(fname) + elif sys.version[0]=='2' and type(pattern)==type(re.compile('')): + if pattern.match(name): + files.append(fname) + elif type(pattern) is StringType: + if fnmatch.fnmatch(name, pattern): + files.append(fname) + + for fname in files: + apply(func, (fname,)+args) + for subdir in subdirs: + recurse_files(subdir, pattern, exclusions, func, level=level+1) + +#-- Data bundle for index dictionaries +class Index: + def __init__(self, words, files, fileids): + if words is not None: self.WORDS = words + if files is not None: self.FILES = files + if fileids is not None: self.FILEIDS = fileids + +#-- "Split plain text into words" utility function +class TextSplitter: + def initSplitter(self): + prenum = string.join(map(chr, range(0,48)), '') + num2cap = string.join(map(chr, range(58,65)), '') + cap2low = string.join(map(chr, range(91,97)), '') + postlow = string.join(map(chr, range(123,256)), '') + nonword = prenum + num2cap + cap2low + postlow + self.word_only = string.maketrans(nonword, " "*len(nonword)) + self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '') + self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '') + self.ident = string.join(map(chr, range(256)), '') + self.init = 1 + + def splitter(self, text, ftype): + "Split the contents of a text string into a list of 'words'" + if ftype == 'text/plain': + words = self.text_splitter(text, self.casesensitive) + else: + raise NotImplementedError + return words + + def text_splitter(self, text, casesensitive=0): + """Split text/plain string into a list of words + + In version 0.20 this function is still fairly weak at + identifying "real" words, and excluding gibberish + strings. As long as the indexer looks at "real" text + files, it does pretty well; but if indexing of binary + data is attempted, a lot of gibberish gets indexed. + Suggestions on improving this are GREATLY APPRECIATED. + """ + # Initialize some constants + if not hasattr(self,'init'): self.initSplitter() + + # Speedup trick: attributes into local scope + word_only = self.word_only + ident = self.ident + alpha = self.alpha + nondigits = self.nondigits + translate = string.translate + + # Let's adjust case if not case-sensitive + if not casesensitive: text = string.upper(text) + + # Split the raw text + allwords = string.split(text) + + # Finally, let's skip some words not worth indexing + words = [] + for word in allwords: + if len(word) > 25: continue # too long (probably gibberish) + + # Identify common patterns in non-word data (binary, UU/MIME, etc) + num_nonalpha = len(word.translate(ident, alpha)) + numdigits = len(word.translate(ident, nondigits)) + # 1.52: num_nonalpha = len(translate(word, ident, alpha)) + # 1.52: numdigits = len(translate(word, ident, nondigits)) + if numdigits > len(word)-2: # almost all digits + if numdigits > 5: # too many digits is gibberish + continue # a moderate number is year/zipcode/etc + elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish + continue + + word = word.translate(word_only) # Let's strip funny byte values + # 1.52: word = translate(word, word_only) + subwords = word.split() # maybe embedded non-alphanumeric + # 1.52: subwords = string.split(word) + for subword in subwords: # ...so we might have subwords + if len(subword) <= 2: continue # too short a subword + words.append(subword) + return words + +class ZopeTextSplitter: + def initSplitter(self): + import Splitter + stop_words=( + 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across', + 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', + 'along', 'already', 'also', 'although', 'always', 'am', 'among', + 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', + 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', + 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', + 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', + 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', + 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could', + 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due', + 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', + 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', + 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', + 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', + 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', + 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', + 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', + 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', + 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', + 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', + 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', + 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', + 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', + 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', + 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', + 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', + 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps', + 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem', + 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', + 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', + 'somehow', 'someone', 'something', 'sometime', 'sometimes', + 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', + 'their', 'them', 'themselves', 'then', 'thence', 'there', + 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', + 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', + 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', + 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', + 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', + 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', + 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', + 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', + 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', + 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', + ) + self.stop_word_dict={} + for word in stop_words: self.stop_word_dict[word]=None + self.splitterobj = Splitter.getSplitter() + self.init = 1 + + def goodword(self, word): + return len(word) < 25 + + def splitter(self, text, ftype): + """never case-sensitive""" + if not hasattr(self,'init'): self.initSplitter() + return filter(self.goodword, self.splitterobj(text, self.stop_word_dict)) + + +#-- "Abstract" parent class for inherited indexers +# (does not handle storage in parent, other methods are primitive) + +class GenericIndexer: + def __init__(self, **kw): + apply(self.configure, (), kw) + + def whoami(self): + return self.__class__.__name__ + + def configure(self, REINDEX=0, CASESENSITIVE=0, + INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'), + ADD_PATTERN='*', QUIET=5): + "Configure settings used by indexing and storage/retrieval" + self.indexdb = INDEXDB + self.reindex = REINDEX + self.casesensitive = CASESENSITIVE + self.add_pattern = ADD_PATTERN + self.quiet = QUIET + self.filter = None + + def add_files(self, dir=os.getcwd(), pattern=None, descend=1): + self.load_index() + exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir') + if not pattern: + pattern = self.add_pattern + recurse_files(dir, pattern, exclusions, self.add_file) + # Rebuild the fileid index + self.fileids = {} + for fname in self.files.keys(): + fileid = self.files[fname][0] + self.fileids[fileid] = fname + + def add_file(self, fname, ftype='text/plain'): + "Index the contents of a regular file" + if self.files.has_key(fname): # Is file eligible for (re)indexing? + if self.reindex: # Reindexing enabled, cleanup dicts + self.purge_entry(fname, self.files, self.words) + else: # DO NOT reindex this file + if self.quiet < 5: print "Skipping", fname + return 0 + + # Read in the file (if possible) + try: + if fname[-3:] == '.gz': + text = gzip.open(fname).read() + else: + text = open(fname).read() + if self.quiet < 5: print "Indexing", fname + except IOError: + return 0 + words = self.splitter(text, ftype) + + # Find new file index, and assign it to filename + # (_TOP uses trick of negative to avoid conflict with file index) + self.files['_TOP'] = (self.files['_TOP'][0]-1, None) + file_index = abs(self.files['_TOP'][0]) + self.files[fname] = (file_index, len(words)) + + filedict = {} + for word in words: + if filedict.has_key(word): + filedict[word] = filedict[word]+1 + else: + filedict[word] = 1 + + for word in filedict.keys(): + if self.words.has_key(word): + entry = self.words[word] + else: + entry = {} + entry[file_index] = filedict[word] + self.words[word] = entry + + def add_othertext(self, identifier): + """Index a textual source other than a plain file + + A child class might want to implement this method (or a similar one) + in order to index textual sources such as SQL tables, URLs, clay + tablets, or whatever else. The identifier should uniquely pick out + the source of the text (whatever it is) + """ + raise NotImplementedError + + def save_index(self, INDEXDB=None): + raise NotImplementedError + + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + raise NotImplementedError + + def find(self, wordlist, print_report=0): + "Locate files that match ALL the words in wordlist" + self.load_index(wordlist=wordlist) + entries = {} + hits = copy.copy(self.fileids) # Copy of fileids index + for word in wordlist: + if not self.casesensitive: + word = string.upper(word) + entry = self.words.get(word) # For each word, get index + entries[word] = entry # of matching files + if not entry: # Nothing for this one word (fail) + return 0 + for fileid in hits.keys(): # Eliminate hits for every non-match + if not entry.has_key(fileid): + del hits[fileid] + if print_report: + self.print_report(hits, wordlist, entries) + return hits + + def print_report(self, hits={}, wordlist=[], entries={}): + # Figure out what to actually print (based on QUIET level) + output = [] + for fileid,fname in hits.items(): + message = fname + if self.quiet <= 3: + wordcount = self.files[fname][1] + matches = 0 + countmess = '\n'+' '*13+`wordcount`+' words; ' + for word in wordlist: + if not self.casesensitive: + word = string.upper(word) + occurs = entries[word][fileid] + matches = matches+occurs + countmess = countmess +`occurs`+' '+word+'; ' + message = string.ljust('[RATING: ' + +`1000*matches/wordcount`+']',13)+message + if self.quiet <= 2: message = message +countmess +'\n' + if self.filter: # Using an output filter + if fnmatch.fnmatch(message, self.filter): + output.append(message) + else: + output.append(message) + + if self.quiet <= 5: + print string.join(output,'\n') + sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+ + `wordlist`+'\n') + return output + + def purge_entry(self, fname, file_dct, word_dct): + "Remove a file from file index and word index" + try: # The easy part, cleanup the file index + file_index = file_dct[fname] + del file_dct[fname] + except KeyError: + pass # We'll assume we only encounter KeyError's + # The much harder part, cleanup the word index + for word, occurs in word_dct.items(): + if occurs.has_key(file_index): + del occurs[file_index] + word_dct[word] = occurs + + def index_loaded(self): + return ( hasattr(self,'fileids') and + hasattr(self,'files') and + hasattr(self,'words') ) + +#-- Provide an actual storage facility for the indexes (i.e. shelve) +class ShelveIndexer(GenericIndexer, TextSplitter): + """Concrete Indexer utilizing [shelve] for storage + + Unfortunately, [shelve] proves far too slow in indexing, while + creating monstrously large indexes. Not recommend, at least under + the default dbm's tested. Also, class may be broken because + shelves do not, apparently, support the .values() and .items() + methods. Fixing this is a low priority, but the sample code is + left here. + """ + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + INDEXDB = INDEXDB or self.indexdb + import shelve + self.words = shelve.open(INDEXDB+".WORDS") + self.files = shelve.open(INDEXDB+".FILES") + self.fileids = shelve.open(INDEXDB+".FILEIDS") + if not FILES: # New index + self.files['_TOP'] = (0,None) + + def save_index(self, INDEXDB=None): + INDEXDB = INDEXDB or self.indexdb + pass + +class FlatIndexer(GenericIndexer, TextSplitter): + """Concrete Indexer utilizing flat-file for storage + + See the comments in the referenced article for details; in + brief, this indexer has about the same timing as the best in + -creating- indexes and the storage requirements are + reasonable. However, actually -using- a flat-file index is + more than an order of magnitude worse than the best indexer + (ZPickleIndexer wins overall). + + On the other hand, FlatIndexer creates a wonderfully easy to + parse database format if you have a reason to transport the + index to a different platform or programming language. And + should you perform indexing as part of a long-running + process, the overhead of initial file parsing becomes + irrelevant. + """ + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + INDEXDB = INDEXDB or self.indexdb + self.words = {} + self.files = {'_TOP':(0,None)} + self.fileids = {} + try: # Read index contents + for line in open(INDEXDB).readlines(): + fields = string.split(line) + if fields[0] == '-': # Read a file/fileid line + fileid = eval(fields[2]) + wordcount = eval(fields[3]) + fname = fields[1] + self.files[fname] = (fileid, wordcount) + self.fileids[fileid] = fname + else: # Read a word entry (dict of hits) + entries = {} + word = fields[0] + for n in range(1,len(fields),2): + fileid = eval(fields[n]) + occurs = eval(fields[n+1]) + entries[fileid] = occurs + self.words[word] = entries + except: + pass # New index + + def save_index(self, INDEXDB=None): + INDEXDB = INDEXDB or self.indexdb + tab, lf, sp = '\t','\n',' ' + indexdb = open(INDEXDB,'w') + for fname,entry in self.files.items(): + indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf) + for word,entry in self.words.items(): + indexdb.write(word +tab+tab) + for fileid,occurs in entry.items(): + indexdb.write(`fileid` +sp +`occurs` +sp) + indexdb.write(lf) + +class PickleIndexer(GenericIndexer, TextSplitter): + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + import cPickle + INDEXDB = INDEXDB or self.indexdb + try: + pickle_str = open(INDEXDB,'rb').read() + db = cPickle.loads(pickle_str) + except: # New index + db = Index({}, {'_TOP':(0,None)}, {}) + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def save_index(self, INDEXDB=None): + import cPickle + INDEXDB = INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + open(INDEXDB,'wb').write(cPickle.dumps(db, 1)) + +class XMLPickleIndexer(PickleIndexer): + """Concrete Indexer utilizing XML for storage + + While this is, as expected, a verbose format, the possibility + of using XML as a transport format for indexes might be + useful. However, [xml_pickle] is in need of some redesign to + avoid gross inefficiency when creating very large + (multi-megabyte) output files (fixed in [xml_pickle] version + 0.48 or above) + """ + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + from gnosis.xml.pickle import XML_Pickler + INDEXDB = INDEXDB or self.indexdb + try: # XML file exists + xml_str = open(INDEXDB).read() + db = XML_Pickler().loads(xml_str) + except: # New index + db = Index({}, {'_TOP':(0,None)}, {}) + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def save_index(self, INDEXDB=None): + from gnosis.xml.pickle import XML_Pickler + INDEXDB = INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + open(INDEXDB,'w').write(XML_Pickler(db).dumps()) + +class ZPickleIndexer(PickleIndexer): + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + try: + pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read()) + db = cPickle.loads(pickle_str) + except: # New index + db = Index({}, {'_TOP':(0,None)}, {}) + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def save_index(self, INDEXDB=None): + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + pickle_fh = open(INDEXDB+'!','wb') + pickle_fh.write(zlib.compress(cPickle.dumps(db, 1))) + + +class SlicedZPickleIndexer(ZPickleIndexer): + segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!" + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + db = Index({}, {'_TOP':(0,None)}, {}) + # Identify the relevant word-dictionary segments + if not wordlist: + segments = self.segments + else: + segments = ['-','#'] + for word in wordlist: + segments.append(string.upper(word[0])) + # Load the segments + for segment in segments: + try: + pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read()) + dbslice = cPickle.loads(pickle_str) + if dbslice.__dict__.get('WORDS'): # If it has some words, add them + for word,entry in dbslice.WORDS.items(): + db.WORDS[word] = entry + if dbslice.__dict__.get('FILES'): # If it has some files, add them + db.FILES = dbslice.FILES + if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them + db.FILEIDS = dbslice.FILEIDS + except: + pass # No biggie, couldn't find this segment + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def julienne(self, INDEXDB=None): + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + segments = self.segments # all the (little) indexes + for segment in segments: + try: # brutal space saver... delete all the small segments + os.remove(INDEXDB+segment) + except OSError: + pass # probably just nonexistent segment index file + # First write the much simpler filename/fileid dictionaries + dbfil = Index(None, self.files, self.fileids) + open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1))) + # The hard part is splitting the word dictionary up, of course + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + segdicts = {} # Need batch of empty dicts + for segment in letters+'#': + segdicts[segment] = {} + for word, entry in self.words.items(): # Split into segment dicts + initchar = string.upper(word[0]) + if initchar in letters: + segdicts[initchar][word] = entry + else: + segdicts['#'][word] = entry + for initchar in letters+'#': + db = Index(segdicts[initchar], None, None) + pickle_str = cPickle.dumps(db, 1) + filename = INDEXDB+initchar + pickle_fh = open(filename,'wb') + pickle_fh.write(zlib.compress(pickle_str)) + os.chmod(filename,0664) + + save_index = julienne + +PreferredIndexer = SlicedZPickleIndexer + +#-- If called from command-line, parse arguments and take actions +if __name__ == '__main__': + import time + start = time.time() + search_words = [] # Word search list (if specified) + opts = 0 # Any options specified? + if len(sys.argv) < 2: + pass # No options given + else: + upper = string.upper + dir = os.getcwd() # Default to indexing from current directory + descend = 1 # Default to recursive indexing + ndx = PreferredIndexer() + for opt in sys.argv[1:]: + if opt in ('-h','/h','-?','/?','?','--help'): # help screen + print __shell_usage__ + opts = -1 + break + elif opt[0] in '/-': # a switch! + opts = opts+1 + if upper(opt[1:]) == 'INDEX': # Index files + ndx.quiet = 0 + pass # Use defaults if no other options + elif upper(opt[1:]) == 'REINDEX': # Reindex + ndx.reindex = 1 + elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive + ndx.casesensitive = 1 + elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion + descend = 0 + elif upper(opt[1:4]) == 'DIR': # Dir to index + dir = opt[5:] + elif upper(opt[1:8]) == 'INDEXDB': # Index specified + ndx.indexdb = opt[9:] + sys.stderr.write( + "Use of INDEXER_DB environment variable is STRONGLY recommended.\n") + elif upper(opt[1:6]) == 'REGEX': # RegEx files to index + ndx.add_pattern = re.compile(opt[7:]) + elif upper(opt[1:5]) == 'GLOB': # Glob files to index + ndx.add_pattern = opt[6:] + elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look? + opts = opts-1 # this is not an option for indexing purposes + level = upper(opt[8:]) + if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'): + ndx.quiet = 0 + elif level in ('RATINGS','SCORES','HIGH'): + ndx.quiet = 3 + elif level in ('FILENAMES','NAMES','FILES','MID'): + ndx.quiet = 5 + elif level in ('SUMMARY','MIN'): + ndx.quiet = 9 + elif upper(opt[1:7]) == 'FILTER': # Regex filter output + opts = opts-1 # this is not an option for indexing purposes + ndx.filter = opt[8:] + elif opt[1:] in string.digits: + opts = opts-1 + ndx.quiet = eval(opt[1]) + else: + search_words.append(opt) # Search words + + if opts > 0: + ndx.add_files(dir=dir) + ndx.save_index() + if search_words: + ndx.find(search_words, print_report=1) + if not opts and not search_words: + sys.stderr.write("Perhaps you would like to use the --help option?\n") + else: + sys.stderr.write('Processed in %.3f seconds (%s)' + % (time.time()-start, ndx.whoami())) + +# +#$Log: not supported by cvs2svn $ +#Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan +#Oops. Forgot to include cvs keywords in file. +#
