annotate roundup/backends/indexer_whoosh.py @ 5852:44b6a79f4e70 maint-1.6

pyme no longer exists at pypi. Try using the git repo of record so we can run the tests on this year old tree. Right after this release in mid 2018, pyme was depricated on 2018-10-16.
author John Rouillard <rouilj@ieee.org>
date Wed, 21 Aug 2019 20:59:32 -0400
parents e74c3611b138
children 56c9bcdea47f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
1 ''' This implements the full-text indexer using Whoosh.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
2 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
3 import re, os
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
4
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
5 from whoosh import fields, qparser, index, query, analysis
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
6
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
7 from roundup.backends.indexer_common import Indexer as IndexerBase
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
8
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
9 class Indexer(IndexerBase):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
10 def __init__(self, db):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
11 IndexerBase.__init__(self, db)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
12 self.db_path = db.config.DATABASE
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
13 self.reindex = 0
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
14 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
15 self.index = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
16 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
17
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
18 def _get_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
19 if self.index is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
20 path = os.path.join(self.db_path, 'whoosh-index')
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
21 if not os.path.exists(path):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
22 # StandardAnalyzer lowercases all words and configure it to
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
23 # block stopwords and words with lengths not between
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
24 # self.minlength and self.maxlength from indexer_common
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
25 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
26 minsize=self.minlength,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
27 maxsize=self.maxlength)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
28 os.mkdir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
29 schema = fields.Schema(identifier=fields.ID(stored=True,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
30 unique=True),
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
31 content=fields.TEXT(analyzer=stopfilter))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
32 index.create_in(path, schema)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
33 self.index = index.open_dir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
34 return self.index
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
35
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
36 def save_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
37 '''Save the changes to the index.'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
38 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
39 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
40 self.writer.commit()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
41 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
42 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
43
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
44 def close(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
45 '''close the indexing database'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
46 pass
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
47
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
48 def rollback(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
49 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
50 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
51 self.writer.cancel()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
52 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
53 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
54
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
55 def force_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
56 '''Force a reindexing of the database. This essentially
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
57 empties the tables ids and index and sets a flag so
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
58 that the databases are reindexed'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
59 self.reindex = 1
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
60
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
61 def should_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
62 '''returns True if the indexes need to be rebuilt'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
63 return self.reindex
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
64
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
65 def _get_writer(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
66 if self.writer is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
67 self.writer = self._get_index().writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
68 return self.writer
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
69
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
70 def _get_searcher(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
71 return self._get_index().searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
72
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
73 def add_text(self, identifier, text, mime_type='text/plain'):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
74 ''' "identifier" is (classname, itemid, property) '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
75 if mime_type != 'text/plain':
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
76 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
77
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
78 if not text:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
79 text = u''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
80
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
81 if not isinstance(text, unicode):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
82 text = unicode(text, "utf-8", "replace")
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
83
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
84 # We use the identifier twice: once in the actual "text" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
85 # indexed so we can search on it, and again as the "data" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
86 # indexed so we know what we're matching when we get results
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
87 identifier = u"%s:%s:%s"%identifier
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
88
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
89 # FIXME need to enhance this to handle the whoosh.store.LockError
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
90 # that maybe raised if there is already another process with a lock.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
91 writer = self._get_writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
92
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
93 # Whoosh gets upset if a document is deleted twice in one transaction,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
94 # so we keep a list of the documents we have so far deleted to make
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
95 # sure that we only delete them once.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
96 if identifier not in self.deleted:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
97 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
98 results = searcher.search(query.Term("identifier", identifier))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
99 if len(results) > 0:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
100 writer.delete_by_term("identifier", identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
101 self.deleted.add(identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
102
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
103 # Note: use '.lower()' because it seems like Whoosh gets
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
104 # better results that way.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
105 writer.add_document(identifier=identifier, content=text)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
106 self.save_index()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
107
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
108 def find(self, wordlist):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
109 '''look up all the words in the wordlist.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
110 If none are found return an empty dictionary
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
111 * more rules here
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
112 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
113
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
114 wordlist = [ word for word in wordlist
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
115 if (self.minlength <= len(word) <= self.maxlength) and
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
116 not self.is_stopword(word.upper()) ]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
117
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
118 if not wordlist:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
119 return {}
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
120
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
121 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
122 q = query.And([ query.FuzzyTerm("content", word.lower())
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
123 for word in wordlist ])
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
124
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
125 results = searcher.search(q, limit=None)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
126
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
127 return [tuple(result["identifier"].split(':'))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
128 for result in results]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
129

Roundup Issue Tracker: http://roundup-tracker.org/