comparison roundup/indexer.py @ 833:b80aaedba3db

Only save the index if it has been loaded and has changed. Also, don't load the index just to save it.
author Richard Jones <richard@users.sourceforge.net>
date Tue, 09 Jul 2002 21:38:43 +0000
parents 0a2c1f5e0e5a
children 568eed5fb4fd
comparison
equal deleted inserted replaced
832:d46eab51b536 833:b80aaedba3db
12 # This file is released to the public domain. I (dqm) would 12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms 13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights 14 # that promote freedom, but obviously am giving up any rights
15 # to compel such. 15 # to compel such.
16 # 16 #
17 #$Id: indexer.py,v 1.6 2002-07-09 04:26:44 richard Exp $ 17 #$Id: indexer.py,v 1.7 2002-07-09 21:38:43 richard Exp $
18 ''' 18 '''
19 This module provides an indexer class, RoundupIndexer, that stores text 19 This module provides an indexer class, RoundupIndexer, that stores text
20 indices in a roundup instance. This class makes searching the content of 20 indices in a roundup instance. This class makes searching the content of
21 messages and text files possible. 21 messages and text files possible.
22 ''' 22 '''
33 ''' 33 '''
34 def __init__(self, db_path): 34 def __init__(self, db_path):
35 self.indexdb_path = os.path.join(db_path, 'indexes') 35 self.indexdb_path = os.path.join(db_path, 'indexes')
36 self.indexdb = os.path.join(self.indexdb_path, 'index.db') 36 self.indexdb = os.path.join(self.indexdb_path, 'index.db')
37 self.reindex = 0 37 self.reindex = 0
38 self.casesensitive = 0
39 self.quiet = 9 38 self.quiet = 9
39 self.changed = 0
40 40
41 # see if we need to reindex because of a change in code 41 # see if we need to reindex because of a change in code
42 if (not os.path.exists(self.indexdb_path) or 42 if (not os.path.exists(self.indexdb_path) or
43 not os.path.exists(os.path.join(self.indexdb_path, 'version'))): 43 not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
44 # TODO: if the version file exists (in the future) we'll want to 44 # TODO: if the version file exists (in the future) we'll want to
52 shutil.rmtree(self.indexdb_path) 52 shutil.rmtree(self.indexdb_path)
53 os.makedirs(self.indexdb_path) 53 os.makedirs(self.indexdb_path)
54 os.chmod(self.indexdb_path, 0775) 54 os.chmod(self.indexdb_path, 0775)
55 open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') 55 open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
56 self.reindex = 1 56 self.reindex = 1
57 self.changed = 1
57 58
58 def should_reindex(self): 59 def should_reindex(self):
59 '''Should we reindex? 60 '''Should we reindex?
60 ''' 61 '''
61 return self.reindex 62 return self.reindex
100 self.words[word] = entry 101 self.words[word] = entry
101 102
102 # make a reference to the file for this word 103 # make a reference to the file for this word
103 entry[file_index] = filedict[word] 104 entry[file_index] = filedict[word]
104 105
106 # save needed
107 self.changed = 1
108
105 def splitter(self, text, ftype): 109 def splitter(self, text, ftype):
106 ''' Split the contents of a text string into a list of 'words' 110 ''' Split the contents of a text string into a list of 'words'
107 ''' 111 '''
108 if ftype == 'text/plain': 112 if ftype == 'text/plain':
109 words = self.text_splitter(text, self.casesensitive) 113 words = self.text_splitter(text)
110 else: 114 else:
111 return [] 115 return []
112 return words 116 return words
113 117
114 def text_splitter(self, text, casesensitive=0): 118 def text_splitter(self, text):
115 """Split text/plain string into a list of words 119 """Split text/plain string into a list of words
116 """ 120 """
117 # Let's adjust case if not case-sensitive 121 # case insensitive
118 if not casesensitive: 122 text = text.upper()
119 text = text.upper()
120 123
121 # Split the raw text, losing anything longer than 25 characters 124 # Split the raw text, losing anything longer than 25 characters
122 # since that'll be gibberish (encoded text or somesuch) or shorter 125 # since that'll be gibberish (encoded text or somesuch) or shorter
123 # than 3 characters since those short words appear all over the 126 # than 3 characters since those short words appear all over the
124 # place 127 # place
181 hits = None 184 hits = None
182 for word in wordlist: 185 for word in wordlist:
183 if not 2 < len(word) < 25: 186 if not 2 < len(word) < 25:
184 # word outside the bounds of what we index - ignore 187 # word outside the bounds of what we index - ignore
185 continue 188 continue
186 if not self.casesensitive: 189 word = word.upper()
187 word = word.upper()
188 entry = self.words.get(word) # For each word, get index 190 entry = self.words.get(word) # For each word, get index
189 entries[word] = entry # of matching files 191 entries[word] = entry # of matching files
190 if not entry: # Nothing for this one word (fail) 192 if not entry: # Nothing for this one word (fail)
191 return {} 193 return {}
192 if hits is None: 194 if hits is None:
242 db['FILEIDS'] = dbslice['FILEIDS'] 244 db['FILEIDS'] = dbslice['FILEIDS']
243 245
244 self.words = db['WORDS'] 246 self.words = db['WORDS']
245 self.files = db['FILES'] 247 self.files = db['FILES']
246 self.fileids = db['FILEIDS'] 248 self.fileids = db['FILEIDS']
249 self.changed = 0
247 250
248 def save_index(self): 251 def save_index(self):
249 # make sure we're loaded 252 # only save if the index is loaded and changed
250 self.load_index() 253 if not self.index_loaded() or not self.changed:
254 return
251 255
252 # brutal space saver... delete all the small segments 256 # brutal space saver... delete all the small segments
253 for segment in self.segments: 257 for segment in self.segments:
254 try: 258 try:
255 os.remove(self.indexdb + segment) 259 os.remove(self.indexdb + segment)
278 filename = self.indexdb + initchar 282 filename = self.indexdb + initchar
279 pickle_fh = open(filename, 'wb') 283 pickle_fh = open(filename, 'wb')
280 pickle_fh.write(zlib.compress(pickle_str)) 284 pickle_fh.write(zlib.compress(pickle_str))
281 os.chmod(filename, 0664) 285 os.chmod(filename, 0664)
282 286
287 # save done
288 self.changed = 0
289
283 def purge_entry(self, identifier): 290 def purge_entry(self, identifier):
284 ''' Remove a file from file index and word index 291 ''' Remove a file from file index and word index
285 ''' 292 '''
286 if not self.files.has_key(identifier): 293 if not self.files.has_key(identifier):
287 return 294 return
293 # The much harder part, cleanup the word index 300 # The much harder part, cleanup the word index
294 for key, occurs in self.words.items(): 301 for key, occurs in self.words.items():
295 if occurs.has_key(file_index): 302 if occurs.has_key(file_index):
296 del occurs[file_index] 303 del occurs[file_index]
297 304
305 # save needed
306 self.changed = 1
307
298 def index_loaded(self): 308 def index_loaded(self):
299 return (hasattr(self,'fileids') and hasattr(self,'files') and 309 return (hasattr(self,'fileids') and hasattr(self,'files') and
300 hasattr(self,'words')) 310 hasattr(self,'words'))
301 311
302 # 312 #
303 #$Log: not supported by cvs2svn $ 313 #$Log: not supported by cvs2svn $
314 #Revision 1.6 2002/07/09 04:26:44 richard
315 #We're indexing numbers now, and _underscore words
316 #
304 #Revision 1.5 2002/07/09 04:19:09 richard 317 #Revision 1.5 2002/07/09 04:19:09 richard
305 #Added reindex command to roundup-admin. 318 #Added reindex command to roundup-admin.
306 #Fixed reindex on first access. 319 #Fixed reindex on first access.
307 #Also fixed reindexing of entries that change. 320 #Also fixed reindexing of entries that change.
308 # 321 #

Roundup Issue Tracker: http://roundup-tracker.org/