diff roundup/backends/indexer_xapian.py @ 4252:2ff6f39aa391

Indexers' behaviour made more consistent regarding length of indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter) (issue 2550584)
author Bernhard Reiter <Bernhard.Reiter@intevation.de>
date Fri, 11 Sep 2009 15:55:11 +0000
parents 7518a8ec1d55
children 477f2a47cbca
line wrap: on
line diff
--- a/roundup/backends/indexer_xapian.py	Fri Sep 11 15:37:24 2009 +0000
+++ b/roundup/backends/indexer_xapian.py	Fri Sep 11 15:55:11 2009 +0000
@@ -88,7 +88,9 @@
         doc.set_data(identifier)
         doc.add_posting(identifier, 0)
 
-        for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+        for match in re.finditer(r'\b\w{%d,%d}\b'
+                                 % (self.minlength, self.maxlength),
+                                 text.upper()):
             word = match.group(0)
             if self.is_stopword(word):
                 continue
@@ -112,8 +114,10 @@
         enquire = xapian.Enquire(database)
         stemmer = xapian.Stem("english")
         terms = []
-        for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
-            terms.append(stemmer(term.upper()))
+        for term in [word.upper() for word in wordlist
+                          if self.minlength <= len(word) <= self.maxlength]:
+            if not self.is_stopword(term):
+                terms.append(stemmer(term))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)

Roundup Issue Tracker: http://roundup-tracker.org/