changeset 2872:d530b68e4b42

don't index common words [SF#1046612]
author Richard Jones <richard@users.sourceforge.net>
date Fri, 05 Nov 2004 05:10:07 +0000
parents 795cdba40c05
children 407f1f17079d
files CHANGES.txt roundup/backends/indexer_dbm.py roundup/backends/indexer_rdbms.py test/db_test_base.py
diffstat 4 files changed, 25 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/CHANGES.txt	Fri Nov 05 04:55:52 2004 +0000
+++ b/CHANGES.txt	Fri Nov 05 05:10:07 2004 +0000
@@ -50,6 +50,7 @@
 - s/Modifed/Modified (thanks donfu)
 - applied patch fixing some form handling issues in ZRoundup (sf bug 995565)
 - enforce View Permission when serving file content (sf bug 1050470)
+- don't index common words (sf bug 1046612)
 
 
 2004-10-15 0.7.8
--- a/roundup/backends/indexer_dbm.py	Fri Nov 05 04:55:52 2004 +0000
+++ b/roundup/backends/indexer_dbm.py	Fri Nov 05 05:10:07 2004 +0000
@@ -14,7 +14,7 @@
 #     that promote freedom, but obviously am giving up any rights
 #     to compel such.
 # 
-#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $
+#$Id: indexer_dbm.py,v 1.2 2004-11-05 05:10:07 richard Exp $
 '''This module provides an indexer class, RoundupIndexer, that stores text
 indices in a roundup instance.  This class makes searching the content of
 messages, string properties and text files possible.
@@ -24,6 +24,18 @@
 import os, shutil, re, mimetypes, marshal, zlib, errno
 from roundup.hyperdb import Link, Multilink
 
+stopwords = [
+"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY",
+"FOR", "IF", "IN", "INTO", "IS", "IT",
+"NO", "NOT", "OF", "ON", "OR", "SUCH",
+"THAT", "THE", "THEIR", "THEN", "THERE", "THESE",
+"THEY", "THIS", "TO", "WAS", "WILL", "WITH" 
+]
+is_stopword = {}
+for word in stopwords:
+    is_stopword[word] = None
+is_stopword = is_stopword.has_key
+
 class Indexer:
     '''Indexes information from roundup's hyperdb to allow efficient
     searching.
@@ -95,6 +107,8 @@
         # find the unique words
         filedict = {}
         for word in words:
+            if is_stopword(word):
+                continue
             if filedict.has_key(word):
                 filedict[word] = filedict[word]+1
             else:
--- a/roundup/backends/indexer_rdbms.py	Fri Nov 05 04:55:52 2004 +0000
+++ b/roundup/backends/indexer_rdbms.py	Fri Nov 05 05:10:07 2004 +0000
@@ -4,10 +4,9 @@
 '''
 import re
 
-from indexer_dbm import Indexer
+from indexer_dbm import Indexer, is_stopword
 
 class Indexer(Indexer):
-    disallows = {'THE':1, 'THIS':1, 'ZZZ':1, 'THAT':1, 'WITH':1}
     def __init__(self, db):
         self.db = db
         self.reindex = 0
@@ -55,8 +54,9 @@
         wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper())
         words = {}
         for word in wordlist:
-            if not self.disallows.has_key(word):
-                words[word] = 1
+            if is_stopword(word):
+                continue
+            words[word] = 1
         words = words.keys()
 
         # for each word, add an entry in the db
--- a/test/db_test_base.py	Fri Nov 05 04:55:52 2004 +0000
+++ b/test/db_test_base.py	Fri Nov 05 05:10:07 2004 +0000
@@ -15,7 +15,7 @@
 # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
 # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 #
-# $Id: db_test_base.py,v 1.51 2004-10-24 09:57:32 a1s Exp $
+# $Id: db_test_base.py,v 1.52 2004-11-05 05:10:07 richard Exp $
 
 import unittest, os, shutil, errno, imp, sys, time, pprint
 
@@ -684,7 +684,7 @@
         f2 = self.db.file.create(content='world', type="text/frozz",
             comment='blah blah')
         i1 = self.db.issue.create(files=[f1, f2], title="flebble plop")
-        i2 = self.db.issue.create(title="flebble frooz")
+        i2 = self.db.issue.create(title="flebble the frooz")
         self.db.commit()
         self.assertEquals(self.db.indexer.search(['hello'], self.db.issue),
             {i1: {'files': [f1]}})
@@ -694,6 +694,9 @@
         self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue),
             {i1: {}, i2: {}})
 
+        # unindexed stopword
+        self.assertEquals(self.db.indexer.search(['the'], self.db.issue), {})
+
     def testReindexing(self):
         search = self.db.indexer.search
         issue = self.db.issue

Roundup Issue Tracker: http://roundup-tracker.org/