annotate roundup/backends/indexer_whoosh.py @ 7402:bca4eaffb212

Rename GNUMakefile to Makefile
author John Rouillard <rouilj@ieee.org>
date Wed, 24 May 2023 20:40:31 -0400
parents cb76bb8bfffd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
1 ''' This implements the full-text indexer using Whoosh.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
2 '''
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
3 import os
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
4
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
5 from whoosh import fields, qparser, index, query, analysis
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
6
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
7 from roundup.backends.indexer_common import Indexer as IndexerBase
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
8 from roundup.anypy.strings import us2u
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
9
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
10
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
11 class Indexer(IndexerBase):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
12 def __init__(self, db):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
13 IndexerBase.__init__(self, db)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
14 self.db_path = db.config.DATABASE
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
15 self.reindex = 0
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
16 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
17 self.index = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
18 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
19
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
20 def _get_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
21 if self.index is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
22 path = os.path.join(self.db_path, 'whoosh-index')
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
23 if not os.path.exists(path):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
24 # StandardAnalyzer lowercases all words and configure it to
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
25 # block stopwords and words with lengths not between
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
26 # self.minlength and self.maxlength from indexer_common
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
27 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
28 minsize=self.minlength,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
29 maxsize=self.maxlength)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
30 os.mkdir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
31 schema = fields.Schema(identifier=fields.ID(stored=True,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
32 unique=True),
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
33 content=fields.TEXT(analyzer=stopfilter))
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
34 index.create_in(path, schema)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
35 self.index = index.open_dir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
36 return self.index
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
37
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
38 def save_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
39 '''Save the changes to the index.'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
40 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
41 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
42 self.writer.commit()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
43 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
44 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
45
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
46 def close(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
47 '''close the indexing database'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
48 pass
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
49
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
50 def rollback(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
51 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
52 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
53 self.writer.cancel()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
54 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
55 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
56
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
57 def force_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
58 '''Force a reindexing of the database. This essentially
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
59 empties the tables ids and index and sets a flag so
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
60 that the databases are reindexed'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
61 self.reindex = 1
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
62
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
63 def should_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
64 '''returns True if the indexes need to be rebuilt'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
65 return self.reindex
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
66
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
67 def _get_writer(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
68 if self.writer is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
69 self.writer = self._get_index().writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
70 return self.writer
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
71
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
72 def _get_searcher(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
73 return self._get_index().searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
74
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
75 def add_text(self, identifier, text, mime_type='text/plain'):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
76 ''' "identifier" is (classname, itemid, property) '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
77 if mime_type != 'text/plain':
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
78 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
79
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
80 if not text:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
81 text = u''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
82
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
83 text = us2u(text, "replace")
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
84
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
85 # We use the identifier twice: once in the actual "text" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
86 # indexed so we can search on it, and again as the "data" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
87 # indexed so we know what we're matching when we get results
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
88 identifier = u"%s:%s:%s" % identifier
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
89
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
90 # FIXME need to enhance this to handle the whoosh.store.LockError
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
91 # that maybe raised if there is already another process with a lock.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
92 writer = self._get_writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
93
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
94 # Whoosh gets upset if a document is deleted twice in one transaction,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
95 # so we keep a list of the documents we have so far deleted to make
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
96 # sure that we only delete them once.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
97 if identifier not in self.deleted:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
98 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
99 results = searcher.search(query.Term("identifier", identifier))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
100 if len(results) > 0:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
101 writer.delete_by_term("identifier", identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
102 self.deleted.add(identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
103
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
104 # Note: use '.lower()' because it seems like Whoosh gets
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
105 # better results that way.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
106 writer.add_document(identifier=identifier, content=text)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
107 self.save_index()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
108
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
109 def find(self, wordlist):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
110 '''look up all the words in the wordlist.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
111 If none are found return an empty dictionary
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
112 * more rules here
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
113 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
114
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
115 wordlist = [word for word in wordlist
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
116 if (self.minlength <= len(word) <= self.maxlength) and
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
117 not self.is_stopword(word.upper())]
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
118
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
119 if not wordlist:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
120 return {}
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
121
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
122 searcher = self._get_searcher()
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
123 q = query.And([query.FuzzyTerm("content", word.lower())
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
124 for word in wordlist])
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
125
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
126 results = searcher.search(q, limit=None)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
127
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
128 return [tuple(result["identifier"].split(':'))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
129 for result in results]

Roundup Issue Tracker: http://roundup-tracker.org/