diff roundup/backends/indexer_dbm.py @ 4252:2ff6f39aa391

Indexers' behaviour made more consistent regarding length of indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter) (issue 2550584)
author Bernhard Reiter <Bernhard.Reiter@intevation.de>
date Fri, 11 Sep 2009 15:55:11 +0000
parents 5f4db2650da3
children 13b3155869e0
line wrap: on
line diff
--- a/roundup/backends/indexer_dbm.py	Fri Sep 11 15:37:24 2009 +0000
+++ b/roundup/backends/indexer_dbm.py	Fri Sep 11 15:55:11 2009 +0000
@@ -135,14 +135,12 @@
         # case insensitive
         text = str(text).upper()
 
-        # Split the raw text, losing anything longer than 25 characters
-        # since that'll be gibberish (encoded text or somesuch) or shorter
-        # than 3 characters since those short words appear all over the
-        # place
-        return re.findall(r'\b\w{2,25}\b', text)
+        # Split the raw text
+        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+                          text)
 
-    # we override this to ignore not 2 < word < 25 and also to fix a bug -
-    # the (fail) case.
+    # we override this to ignore too short and too long words
+    # and also to fix a bug - the (fail) case.
     def find(self, wordlist):
         '''Locate files that match ALL the words in wordlist
         '''
@@ -152,10 +150,12 @@
         entries = {}
         hits = None
         for word in wordlist:
-            if not 2 < len(word) < 25:
+            if not self.minlength <= len(word) <= self.maxlength:
                 # word outside the bounds of what we index - ignore
                 continue
             word = word.upper()
+            if self.is_stopword(word):
+                continue
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)

Roundup Issue Tracker: http://roundup-tracker.org/