annotate roundup/backends/indexer_whoosh.py @ 7800:2d4684e4702d

fix: enhancement to history command output and % template fix. Rather than using the key field, use the label field for descriptions. Call cls.labelprop(default_to_id=True) so it returns id rather than the first sorted property name. If labelprop() returns 'id' or 'title', we return nothing. 'id' means there is no label set and no properties named 'name' or 'title'. So have the caller do whatever it wants (prepend classname for example) when there is no human readable name. This prevents %(name)s%(key)s from producing: 23(23). Also don't accept the 'title' property. Titles can be too long. Arguably we could: '%(name)20s' to limit the title length. However without ellipses or something truncating the title might be confusing. So again pretend there is no human readable name.
author John Rouillard <rouilj@ieee.org>
date Tue, 12 Mar 2024 11:52:17 -0400
parents cb76bb8bfffd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
1 ''' This implements the full-text indexer using Whoosh.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
2 '''
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
3 import os
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
4
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
5 from whoosh import fields, qparser, index, query, analysis
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
6
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
7 from roundup.backends.indexer_common import Indexer as IndexerBase
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
8 from roundup.anypy.strings import us2u
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
9
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
10
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
11 class Indexer(IndexerBase):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
12 def __init__(self, db):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
13 IndexerBase.__init__(self, db)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
14 self.db_path = db.config.DATABASE
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
15 self.reindex = 0
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
16 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
17 self.index = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
18 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
19
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
20 def _get_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
21 if self.index is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
22 path = os.path.join(self.db_path, 'whoosh-index')
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
23 if not os.path.exists(path):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
24 # StandardAnalyzer lowercases all words and configure it to
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
25 # block stopwords and words with lengths not between
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
26 # self.minlength and self.maxlength from indexer_common
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
27 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
28 minsize=self.minlength,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
29 maxsize=self.maxlength)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
30 os.mkdir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
31 schema = fields.Schema(identifier=fields.ID(stored=True,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
32 unique=True),
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
33 content=fields.TEXT(analyzer=stopfilter))
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
34 index.create_in(path, schema)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
35 self.index = index.open_dir(path)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
36 return self.index
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
37
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
38 def save_index(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
39 '''Save the changes to the index.'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
40 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
41 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
42 self.writer.commit()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
43 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
44 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
45
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
46 def close(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
47 '''close the indexing database'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
48 pass
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
49
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
50 def rollback(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
51 if not self.writer:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
52 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
53 self.writer.cancel()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
54 self.deleted = set()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
55 self.writer = None
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
56
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
57 def force_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
58 '''Force a reindexing of the database. This essentially
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
59 empties the tables ids and index and sets a flag so
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
60 that the databases are reindexed'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
61 self.reindex = 1
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
62
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
63 def should_reindex(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
64 '''returns True if the indexes need to be rebuilt'''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
65 return self.reindex
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
66
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
67 def _get_writer(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
68 if self.writer is None:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
69 self.writer = self._get_index().writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
70 return self.writer
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
71
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
72 def _get_searcher(self):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
73 return self._get_index().searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
74
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
75 def add_text(self, identifier, text, mime_type='text/plain'):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
76 ''' "identifier" is (classname, itemid, property) '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
77 if mime_type != 'text/plain':
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
78 return
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
79
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
80 if not text:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
81 text = u''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
82
5416
56c9bcdea47f Python 3 preparation: unicode.
Joseph Myers <jsm@polyomino.org.uk>
parents: 5096
diff changeset
83 text = us2u(text, "replace")
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
84
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
85 # We use the identifier twice: once in the actual "text" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
86 # indexed so we can search on it, and again as the "data" being
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
87 # indexed so we know what we're matching when we get results
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
88 identifier = u"%s:%s:%s" % identifier
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
89
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
90 # FIXME need to enhance this to handle the whoosh.store.LockError
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
91 # that maybe raised if there is already another process with a lock.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
92 writer = self._get_writer()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
93
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
94 # Whoosh gets upset if a document is deleted twice in one transaction,
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
95 # so we keep a list of the documents we have so far deleted to make
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
96 # sure that we only delete them once.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
97 if identifier not in self.deleted:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
98 searcher = self._get_searcher()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
99 results = searcher.search(query.Term("identifier", identifier))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
100 if len(results) > 0:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
101 writer.delete_by_term("identifier", identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
102 self.deleted.add(identifier)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
103
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
104 # Note: use '.lower()' because it seems like Whoosh gets
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
105 # better results that way.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
106 writer.add_document(identifier=identifier, content=text)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
107 self.save_index()
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
108
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
109 def find(self, wordlist):
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
110 '''look up all the words in the wordlist.
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
111 If none are found return an empty dictionary
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
112 * more rules here
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
113 '''
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
114
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
115 wordlist = [word for word in wordlist
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
116 if (self.minlength <= len(word) <= self.maxlength) and
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
117 not self.is_stopword(word.upper())]
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
118
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
119 if not wordlist:
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
120 return {}
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
121
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
122 searcher = self._get_searcher()
6034
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
123 q = query.And([query.FuzzyTerm("content", word.lower())
cb76bb8bfffd flake8 whitespace fixes.
John Rouillard <rouilj@ieee.org>
parents: 5416
diff changeset
124 for word in wordlist])
5096
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
125
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
126 results = searcher.search(q, limit=None)
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
127
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
128 return [tuple(result["identifier"].split(':'))
e74c3611b138 - issue2550636, issue2550909: Added support for Whoosh indexer.
John Rouillard <rouilj@ieee.org>
parents:
diff changeset
129 for result in results]

Roundup Issue Tracker: http://roundup-tracker.org/