Mercurial > p > roundup > code
diff roundup/backends/indexer_dbm.py @ 4252:2ff6f39aa391
Indexers behaviour made more consistent regarding length of indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter) (issue 2550584)
| author | Bernhard Reiter <Bernhard.Reiter@intevation.de> |
|---|---|
| date | Fri, 11 Sep 2009 15:55:11 +0000 |
| parents | 5f4db2650da3 |
| children | 13b3155869e0 |
line wrap: on
line diff
```diff
--- a/roundup/backends/indexer_dbm.py Fri Sep 11 15:37:24 2009 +0000
+++ b/roundup/backends/indexer_dbm.py Fri Sep 11 15:55:11 2009 +0000
@@ -135,14 +135,12 @@
         # case insensitive
         text = str(text).upper()
 
-        # Split the raw text, losing anything longer than 25 characters
-        # since that'll be gibberish (encoded text or somesuch) or shorter
-        # than 3 characters since those short words appear all over the
-        # place
-        return re.findall(r'\b\w{2,25}\b', text)
+        # Split the raw text
+        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+                          text)
 
-    # we override this to ignore not 2 < word < 25 and also to fix a bug -
-    # the (fail) case.
+    # we override this to ignore too short and too long words
+    # and also to fix a bug - the (fail) case.
     def find(self, wordlist):
         '''Locate files that match ALL the words in wordlist
         '''
@@ -152,10 +150,12 @@
         entries = {}
         hits = None
         for word in wordlist:
-            if not 2 < len(word) < 25:
+            if not self.minlength <= len(word) <= self.maxlength:
                 # word outside the bounds of what we index - ignore
                 continue
             word = word.upper()
+            if self.is_stopword(word):
+                continue
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
```
