Mercurial > p > roundup > code
comparison roundup/indexer.py @ 833:b80aaedba3db
Only save the index if the thing is loaded and changed.
Also, don't load the index just for a save.
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Tue, 09 Jul 2002 21:38:43 +0000 |
| parents | 0a2c1f5e0e5a |
| children | 568eed5fb4fd |
comparison
equal
deleted
inserted
replaced
| 832:d46eab51b536 | 833:b80aaedba3db |
|---|---|
| 12 # This file is released to the public domain. I (dqm) would | 12 # This file is released to the public domain. I (dqm) would |
| 13 # appreciate it if you choose to keep derived works under terms | 13 # appreciate it if you choose to keep derived works under terms |
| 14 # that promote freedom, but obviously am giving up any rights | 14 # that promote freedom, but obviously am giving up any rights |
| 15 # to compel such. | 15 # to compel such. |
| 16 # | 16 # |
| 17 #$Id: indexer.py,v 1.6 2002-07-09 04:26:44 richard Exp $ | 17 #$Id: indexer.py,v 1.7 2002-07-09 21:38:43 richard Exp $ |
| 18 ''' | 18 ''' |
| 19 This module provides an indexer class, RoundupIndexer, that stores text | 19 This module provides an indexer class, RoundupIndexer, that stores text |
| 20 indices in a roundup instance. This class makes searching the content of | 20 indices in a roundup instance. This class makes searching the content of |
| 21 messages and text files possible. | 21 messages and text files possible. |
| 22 ''' | 22 ''' |
| 33 ''' | 33 ''' |
| 34 def __init__(self, db_path): | 34 def __init__(self, db_path): |
| 35 self.indexdb_path = os.path.join(db_path, 'indexes') | 35 self.indexdb_path = os.path.join(db_path, 'indexes') |
| 36 self.indexdb = os.path.join(self.indexdb_path, 'index.db') | 36 self.indexdb = os.path.join(self.indexdb_path, 'index.db') |
| 37 self.reindex = 0 | 37 self.reindex = 0 |
| 38 self.casesensitive = 0 | |
| 39 self.quiet = 9 | 38 self.quiet = 9 |
| 39 self.changed = 0 | |
| 40 | 40 |
| 41 # see if we need to reindex because of a change in code | 41 # see if we need to reindex because of a change in code |
| 42 if (not os.path.exists(self.indexdb_path) or | 42 if (not os.path.exists(self.indexdb_path) or |
| 43 not os.path.exists(os.path.join(self.indexdb_path, 'version'))): | 43 not os.path.exists(os.path.join(self.indexdb_path, 'version'))): |
| 44 # TODO: if the version file exists (in the future) we'll want to | 44 # TODO: if the version file exists (in the future) we'll want to |
| 52 shutil.rmtree(self.indexdb_path) | 52 shutil.rmtree(self.indexdb_path) |
| 53 os.makedirs(self.indexdb_path) | 53 os.makedirs(self.indexdb_path) |
| 54 os.chmod(self.indexdb_path, 0775) | 54 os.chmod(self.indexdb_path, 0775) |
| 55 open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') | 55 open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') |
| 56 self.reindex = 1 | 56 self.reindex = 1 |
| 57 self.changed = 1 | |
| 57 | 58 |
| 58 def should_reindex(self): | 59 def should_reindex(self): |
| 59 '''Should we reindex? | 60 '''Should we reindex? |
| 60 ''' | 61 ''' |
| 61 return self.reindex | 62 return self.reindex |
| 100 self.words[word] = entry | 101 self.words[word] = entry |
| 101 | 102 |
| 102 # make a reference to the file for this word | 103 # make a reference to the file for this word |
| 103 entry[file_index] = filedict[word] | 104 entry[file_index] = filedict[word] |
| 104 | 105 |
| 106 # save needed | |
| 107 self.changed = 1 | |
| 108 | |
| 105 def splitter(self, text, ftype): | 109 def splitter(self, text, ftype): |
| 106 ''' Split the contents of a text string into a list of 'words' | 110 ''' Split the contents of a text string into a list of 'words' |
| 107 ''' | 111 ''' |
| 108 if ftype == 'text/plain': | 112 if ftype == 'text/plain': |
| 109 words = self.text_splitter(text, self.casesensitive) | 113 words = self.text_splitter(text) |
| 110 else: | 114 else: |
| 111 return [] | 115 return [] |
| 112 return words | 116 return words |
| 113 | 117 |
| 114 def text_splitter(self, text, casesensitive=0): | 118 def text_splitter(self, text): |
| 115 """Split text/plain string into a list of words | 119 """Split text/plain string into a list of words |
| 116 """ | 120 """ |
| 117 # Let's adjust case if not case-sensitive | 121 # case insensitive |
| 118 if not casesensitive: | 122 text = text.upper() |
| 119 text = text.upper() | |
| 120 | 123 |
| 121 # Split the raw text, losing anything longer than 25 characters | 124 # Split the raw text, losing anything longer than 25 characters |
| 122 # since that'll be gibberish (encoded text or somesuch) or shorter | 125 # since that'll be gibberish (encoded text or somesuch) or shorter |
| 123 # than 3 characters since those short words appear all over the | 126 # than 3 characters since those short words appear all over the |
| 124 # place | 127 # place |
| 181 hits = None | 184 hits = None |
| 182 for word in wordlist: | 185 for word in wordlist: |
| 183 if not 2 < len(word) < 25: | 186 if not 2 < len(word) < 25: |
| 184 # word outside the bounds of what we index - ignore | 187 # word outside the bounds of what we index - ignore |
| 185 continue | 188 continue |
| 186 if not self.casesensitive: | 189 word = word.upper() |
| 187 word = word.upper() | |
| 188 entry = self.words.get(word) # For each word, get index | 190 entry = self.words.get(word) # For each word, get index |
| 189 entries[word] = entry # of matching files | 191 entries[word] = entry # of matching files |
| 190 if not entry: # Nothing for this one word (fail) | 192 if not entry: # Nothing for this one word (fail) |
| 191 return {} | 193 return {} |
| 192 if hits is None: | 194 if hits is None: |
| 242 db['FILEIDS'] = dbslice['FILEIDS'] | 244 db['FILEIDS'] = dbslice['FILEIDS'] |
| 243 | 245 |
| 244 self.words = db['WORDS'] | 246 self.words = db['WORDS'] |
| 245 self.files = db['FILES'] | 247 self.files = db['FILES'] |
| 246 self.fileids = db['FILEIDS'] | 248 self.fileids = db['FILEIDS'] |
| 249 self.changed = 0 | |
| 247 | 250 |
| 248 def save_index(self): | 251 def save_index(self): |
| 249 # make sure we're loaded | 252 # only save if the index is loaded and changed |
| 250 self.load_index() | 253 if not self.index_loaded() or not self.changed: |
| 254 return | |
| 251 | 255 |
| 252 # brutal space saver... delete all the small segments | 256 # brutal space saver... delete all the small segments |
| 253 for segment in self.segments: | 257 for segment in self.segments: |
| 254 try: | 258 try: |
| 255 os.remove(self.indexdb + segment) | 259 os.remove(self.indexdb + segment) |
| 278 filename = self.indexdb + initchar | 282 filename = self.indexdb + initchar |
| 279 pickle_fh = open(filename, 'wb') | 283 pickle_fh = open(filename, 'wb') |
| 280 pickle_fh.write(zlib.compress(pickle_str)) | 284 pickle_fh.write(zlib.compress(pickle_str)) |
| 281 os.chmod(filename, 0664) | 285 os.chmod(filename, 0664) |
| 282 | 286 |
| 287 # save done | |
| 288 self.changed = 0 | |
| 289 | |
| 283 def purge_entry(self, identifier): | 290 def purge_entry(self, identifier): |
| 284 ''' Remove a file from file index and word index | 291 ''' Remove a file from file index and word index |
| 285 ''' | 292 ''' |
| 286 if not self.files.has_key(identifier): | 293 if not self.files.has_key(identifier): |
| 287 return | 294 return |
| 293 # The much harder part, cleanup the word index | 300 # The much harder part, cleanup the word index |
| 294 for key, occurs in self.words.items(): | 301 for key, occurs in self.words.items(): |
| 295 if occurs.has_key(file_index): | 302 if occurs.has_key(file_index): |
| 296 del occurs[file_index] | 303 del occurs[file_index] |
| 297 | 304 |
| 305 # save needed | |
| 306 self.changed = 1 | |
| 307 | |
| 298 def index_loaded(self): | 308 def index_loaded(self): |
| 299 return (hasattr(self,'fileids') and hasattr(self,'files') and | 309 return (hasattr(self,'fileids') and hasattr(self,'files') and |
| 300 hasattr(self,'words')) | 310 hasattr(self,'words')) |
| 301 | 311 |
| 302 # | 312 # |
| 303 #$Log: not supported by cvs2svn $ | 313 #$Log: not supported by cvs2svn $ |
| 314 #Revision 1.6 2002/07/09 04:26:44 richard | |
| 315 #We're indexing numbers now, and _underscore words | |
| 316 # | |
| 304 #Revision 1.5 2002/07/09 04:19:09 richard | 317 #Revision 1.5 2002/07/09 04:19:09 richard |
| 305 #Added reindex command to roundup-admin. | 318 #Added reindex command to roundup-admin. |
| 306 #Fixed reindex on first access. | 319 #Fixed reindex on first access. |
| 307 #Also fixed reindexing of entries that change. | 320 #Also fixed reindexing of entries that change. |
| 308 # | 321 # |
