annotate roundup/backends/indexer_whoosh.py @ 5746:fea2b6e54492

Check /rest/data.
author John Rouillard <rouilj@ieee.org>
date Wed, 29 May 2019 22:56:50 -0400
parents 56c9bcdea47f
children cb76bb8bfffd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
1 ''' This implements the full-text indexer using Whoosh.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
2 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
3 import re, os
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
4
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
5 from whoosh import fields, qparser, index, query, analysis
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
6
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
7 from roundup.backends.indexer_common import Indexer as IndexerBase
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
8 from roundup.anypy.strings import us2u
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
9
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
10 class Indexer(IndexerBase):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
11 def __init__(self, db):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
12 IndexerBase.__init__(self, db)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
13 self.db_path = db.config.DATABASE
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
14 self.reindex = 0
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
15 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
16 self.index = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
17 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
18
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
19 def _get_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
20 if self.index is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
21 path = os.path.join(self.db_path, 'whoosh-index')
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
22 if not os.path.exists(path):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
23 # StandardAnalyzer lowercases all words and configure it to
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
24 # block stopwords and words with lengths not between
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
25 # self.minlength and self.maxlength from indexer_common
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
26 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
27 minsize=self.minlength,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
28 maxsize=self.maxlength)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
29 os.mkdir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
30 schema = fields.Schema(identifier=fields.ID(stored=True,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
31 unique=True),
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
32 content=fields.TEXT(analyzer=stopfilter))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
33 index.create_in(path, schema)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
34 self.index = index.open_dir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
35 return self.index
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
36
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
37 def save_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
38 '''Save the changes to the index.'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
39 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
40 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
41 self.writer.commit()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
42 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
43 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
44
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
45 def close(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
46 '''close the indexing database'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
47 pass
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
48
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
49 def rollback(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
50 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
51 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
52 self.writer.cancel()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
53 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
54 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
55
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
56 def force_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
57 '''Force a reindexing of the database. This essentially
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
58 empties the tables ids and index and sets a flag so
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
59 that the databases are reindexed'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
60 self.reindex = 1
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
61
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
62 def should_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
63 '''returns True if the indexes need to be rebuilt'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
64 return self.reindex
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
65
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
66 def _get_writer(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
67 if self.writer is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
68 self.writer = self._get_index().writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
69 return self.writer
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
70
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
71 def _get_searcher(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
72 return self._get_index().searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
73
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
74 def add_text(self, identifier, text, mime_type='text/plain'):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
75 ''' "identifier" is (classname, itemid, property) '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
76 if mime_type != 'text/plain':
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
77 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
78
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
79 if not text:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
80 text = u''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
81
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
82 text = us2u(text, "replace")
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
83
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
84 # We use the identifier twice: once in the actual "text" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
85 # indexed so we can search on it, and again as the "data" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
86 # indexed so we know what we're matching when we get results
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
87 identifier = u"%s:%s:%s"%identifier
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
88
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
89 # FIXME need to enhance this to handle the whoosh.store.LockError
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
90 # that maybe raised if there is already another process with a lock.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
91 writer = self._get_writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
92
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
93 # Whoosh gets upset if a document is deleted twice in one transaction,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
94 # so we keep a list of the documents we have so far deleted to make
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
95 # sure that we only delete them once.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
96 if identifier not in self.deleted:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
97 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
98 results = searcher.search(query.Term("identifier", identifier))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
99 if len(results) > 0:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
100 writer.delete_by_term("identifier", identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
101 self.deleted.add(identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
102
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
103 # Note: use '.lower()' because it seems like Whoosh gets
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
104 # better results that way.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
105 writer.add_document(identifier=identifier, content=text)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
106 self.save_index()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
107
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
108 def find(self, wordlist):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
109 '''look up all the words in the wordlist.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
110 If none are found return an empty dictionary
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
111 * more rules here
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
112 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
113
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
114 wordlist = [ word for word in wordlist
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
115 if (self.minlength <= len(word) <= self.maxlength) and
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
116 not self.is_stopword(word.upper()) ]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
117
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
118 if not wordlist:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
119 return {}
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
120
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
121 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
122 q = query.And([ query.FuzzyTerm("content", word.lower())
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
123 for word in wordlist ])
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
124
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
125 results = searcher.search(q, limit=None)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
126
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
127 return [tuple(result["identifier"].split(':'))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
128 for result in results]
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
129

Roundup Issue Tracker: http://roundup-tracker.org/