annotate roundup/backends/indexer_whoosh.py @ 5191:fe52cab8f5b5

issue2550932 - html_calendar produces templating errors for bad date strings Changed code to trap and ignore unparsible dates. The calendar starts with today highlighted.
author John Rouillard <rouilj@ieee.org>
date Sat, 25 Feb 2017 22:21:15 -0500
parents e74c3611b138
children 56c9bcdea47f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
1 ''' This implements the full-text indexer using Whoosh.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
2 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
3 import re, os
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
4
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
5 from whoosh import fields, qparser, index, query, analysis
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
6
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
7 from roundup.backends.indexer_common import Indexer as IndexerBase
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
8
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
9 class Indexer(IndexerBase):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
10 def __init__(self, db):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
11 IndexerBase.__init__(self, db)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
12 self.db_path = db.config.DATABASE
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
13 self.reindex = 0
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
14 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
15 self.index = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
16 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
17
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
18 def _get_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
19 if self.index is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
20 path = os.path.join(self.db_path, 'whoosh-index')
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
21 if not os.path.exists(path):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
22 # StandardAnalyzer lowercases all words and configure it to
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
23 # block stopwords and words with lengths not between
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
24 # self.minlength and self.maxlength from indexer_common
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
25 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
26 minsize=self.minlength,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
27 maxsize=self.maxlength)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
28 os.mkdir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
29 schema = fields.Schema(identifier=fields.ID(stored=True,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
30 unique=True),
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
31 content=fields.TEXT(analyzer=stopfilter))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
32 index.create_in(path, schema)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
33 self.index = index.open_dir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
34 return self.index
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
35
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
36 def save_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
37 '''Save the changes to the index.'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
38 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
39 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
40 self.writer.commit()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
41 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
42 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
43
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
44 def close(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
45 '''close the indexing database'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
46 pass
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
47
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
48 def rollback(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
49 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
50 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
51 self.writer.cancel()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
52 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
53 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
54
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
55 def force_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
56 '''Force a reindexing of the database. This essentially
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
57 empties the tables ids and index and sets a flag so
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
58 that the databases are reindexed'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
59 self.reindex = 1
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
60
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
61 def should_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
62 '''returns True if the indexes need to be rebuilt'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
63 return self.reindex
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
64
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
65 def _get_writer(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
66 if self.writer is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
67 self.writer = self._get_index().writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
68 return self.writer
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
69
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
70 def _get_searcher(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
71 return self._get_index().searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
72
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
73 def add_text(self, identifier, text, mime_type='text/plain'):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
74 ''' "identifier" is (classname, itemid, property) '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
75 if mime_type != 'text/plain':
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
76 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
77
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
78 if not text:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
79 text = u''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
80
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
81 if not isinstance(text, unicode):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
82 text = unicode(text, "utf-8", "replace")
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
83
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
84 # We use the identifier twice: once in the actual "text" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
85 # indexed so we can search on it, and again as the "data" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
86 # indexed so we know what we're matching when we get results
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
87 identifier = u"%s:%s:%s"%identifier
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
88
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
89 # FIXME need to enhance this to handle the whoosh.store.LockError
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
90 # that maybe raised if there is already another process with a lock.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
91 writer = self._get_writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
92
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
93 # Whoosh gets upset if a document is deleted twice in one transaction,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
94 # so we keep a list of the documents we have so far deleted to make
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
95 # sure that we only delete them once.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
96 if identifier not in self.deleted:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
97 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
98 results = searcher.search(query.Term("identifier", identifier))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
99 if len(results) > 0:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
100 writer.delete_by_term("identifier", identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
101 self.deleted.add(identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
102
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
103 # Note: use '.lower()' because it seems like Whoosh gets
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
104 # better results that way.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
105 writer.add_document(identifier=identifier, content=text)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
106 self.save_index()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
107
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
108 def find(self, wordlist):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
109 '''look up all the words in the wordlist.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
110 If none are found return an empty dictionary
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
111 * more rules here
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
112 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
113
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
114 wordlist = [ word for word in wordlist
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
115 if (self.minlength <= len(word) <= self.maxlength) and
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
116 not self.is_stopword(word.upper()) ]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
117
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
118 if not wordlist:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
119 return {}
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
120
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
121 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
122 q = query.And([ query.FuzzyTerm("content", word.lower())
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
123 for word in wordlist ])
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
124
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
125 results = searcher.search(q, limit=None)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
126
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
127 return [tuple(result["identifier"].split(':'))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
128 for result in results]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
129

Roundup Issue Tracker: http://roundup-tracker.org/