Mercurial > p > roundup > code
changeset 2872:d530b68e4b42
don't index common words [SF#1046612]
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Fri, 05 Nov 2004 05:10:07 +0000 |
| parents | 795cdba40c05 |
| children | 407f1f17079d |
| files | CHANGES.txt roundup/backends/indexer_dbm.py roundup/backends/indexer_rdbms.py test/db_test_base.py |
| diffstat | 4 files changed, 25 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/CHANGES.txt Fri Nov 05 04:55:52 2004 +0000
+++ b/CHANGES.txt Fri Nov 05 05:10:07 2004 +0000
@@ -50,6 +50,7 @@
 - s/Modifed/Modified (thanks donfu)
 - applied patch fixing some form handling issues in ZRoundup (sf bug 995565)
 - enforce View Permission when serving file content (sf bug 1050470)
+- don't index common words (sf bug 1046612)
 
 
 2004-10-15 0.7.8
```
--- a/roundup/backends/indexer_dbm.py Fri Nov 05 04:55:52 2004 +0000 +++ b/roundup/backends/indexer_dbm.py Fri Nov 05 05:10:07 2004 +0000 @@ -14,7 +14,7 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer_dbm.py,v 1.1 2004-03-19 04:47:59 richard Exp $ +#$Id: indexer_dbm.py,v 1.2 2004-11-05 05:10:07 richard Exp $ '''This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of messages, string properties and text files possible. @@ -24,6 +24,18 @@ import os, shutil, re, mimetypes, marshal, zlib, errno from roundup.hyperdb import Link, Multilink +stopwords = [ +"A", "AND", "ARE", "AS", "AT", "BE", "BUT", "BY", +"FOR", "IF", "IN", "INTO", "IS", "IT", +"NO", "NOT", "OF", "ON", "OR", "SUCH", +"THAT", "THE", "THEIR", "THEN", "THERE", "THESE", +"THEY", "THIS", "TO", "WAS", "WILL", "WITH" +] +is_stopword = {} +for word in stopwords: + is_stopword[word] = None +is_stopword = is_stopword.has_key + class Indexer: '''Indexes information from roundup's hyperdb to allow efficient searching. @@ -95,6 +107,8 @@ # find the unique words filedict = {} for word in words: + if is_stopword(word): + continue if filedict.has_key(word): filedict[word] = filedict[word]+1 else:
--- a/roundup/backends/indexer_rdbms.py Fri Nov 05 04:55:52 2004 +0000 +++ b/roundup/backends/indexer_rdbms.py Fri Nov 05 05:10:07 2004 +0000 @@ -4,10 +4,9 @@ ''' import re -from indexer_dbm import Indexer +from indexer_dbm import Indexer, is_stopword class Indexer(Indexer): - disallows = {'THE':1, 'THIS':1, 'ZZZ':1, 'THAT':1, 'WITH':1} def __init__(self, db): self.db = db self.reindex = 0 @@ -55,8 +54,9 @@ wordlist = re.findall(r'\b\w{2,25}\b', str(text).upper()) words = {} for word in wordlist: - if not self.disallows.has_key(word): - words[word] = 1 + if is_stopword(word): + continue + words[word] = 1 words = words.keys() # for each word, add an entry in the db
--- a/test/db_test_base.py Fri Nov 05 04:55:52 2004 +0000 +++ b/test/db_test_base.py Fri Nov 05 05:10:07 2004 +0000 @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: db_test_base.py,v 1.51 2004-10-24 09:57:32 a1s Exp $ +# $Id: db_test_base.py,v 1.52 2004-11-05 05:10:07 richard Exp $ import unittest, os, shutil, errno, imp, sys, time, pprint @@ -684,7 +684,7 @@ f2 = self.db.file.create(content='world', type="text/frozz", comment='blah blah') i1 = self.db.issue.create(files=[f1, f2], title="flebble plop") - i2 = self.db.issue.create(title="flebble frooz") + i2 = self.db.issue.create(title="flebble the frooz") self.db.commit() self.assertEquals(self.db.indexer.search(['hello'], self.db.issue), {i1: {'files': [f1]}}) @@ -694,6 +694,9 @@ self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue), {i1: {}, i2: {}}) + # unindexed stopword + self.assertEquals(self.db.indexer.search(['the'], self.db.issue), {}) + def testReindexing(self): search = self.db.indexer.search issue = self.db.issue
