diff roundup/backends/indexer_rdbms.py @ 4252:2ff6f39aa391

Indexers' behaviour made more consistent regarding length of indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter) (issue 2550584)
author Bernhard Reiter <Bernhard.Reiter@intevation.de>
date Fri, 11 Sep 2009 15:55:11 +0000
parents 09e79cbeb827
children 8081d34fefa5
line wrap: on
line diff
--- a/roundup/backends/indexer_rdbms.py	Fri Sep 11 15:37:24 2009 +0000
+++ b/roundup/backends/indexer_rdbms.py	Fri Sep 11 15:55:11 2009 +0000
@@ -66,11 +66,11 @@
         # ok, find all the unique words in the text
         text = unicode(text, "utf-8", "replace").upper()
         wordlist = [w.encode("utf-8")
-            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+            for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+                                % (self.minlength, self.maxlength), text)]
         words = set()
         for word in wordlist:
             if self.is_stopword(word): continue
-            if len(word) > 25: continue
             words.add(word)
 
         # for each word, add an entry in the db
@@ -86,7 +86,9 @@
         if not wordlist:
             return []
 
-        l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+        l = [word.upper() for word in wordlist
+             if self.minlength <= len(word) <= self.maxlength]
+        l = [word for word in l if not self.is_stopword(word)]
 
         if not l:
             return []

Roundup Issue Tracker: http://roundup-tracker.org/