view roundup/backends/indexer_sqlite_fts.py @ 8600:892533a65881 default tip

chore(ruff): cleanup indexer_* code after ruff analysis. indexer_common.py: use 'for x, y in adict.items() rather than 'for x in adict' and later reference 'adict[x]'. indexer_dbm.py: use 'with open() ... as fd: <stuff>' rather than 'fd = open()<stuff>close()'. 4 instances. whitespace fixes around operators use 'for x, y in adict.items() rather than 'for x in adict' and later reference 'adict[x]'. replace 'return 0' with 'return' as ruff was reporting function didn't have explicit return at end when internal return returned a not None value. AFICT load_index's return value is never checked. replace 'for _key, value in x.items()' with 'for value in x.values()'. _key is never used. indexer_postgresql_fts.py: import sorting. symbolic string for server version needed for FTS support. rename variable id. remove unneeded flow control (else after raise) indexer_rdbms.py: import sorting. rename variable id 2 places. add variable 'a = self.db.arg' to make scanning SQL code that uses % to insert placeholders easier. Also reduces dereferencing. replaced 'tuple([list comprehension])' with tuple(same comprehension but is a generator) indexer_sqlite_fts.py: import sorting. rename variable id. remove unneeded flow control (else after raise). indexer_whoosh.py: import sorting. remove qparser from whoosh import. Unused andnot needed until we implement using whoosh query language in roundup searches. indexer_xapian.py: import sorting. rewrite loop that appended to a list as a list comprehension
author John Rouillard <rouilj@ieee.org>
date Sun, 26 Apr 2026 19:26:54 -0400
parents 9ff091537f43
children
line wrap: on
line source

""" This implements the full-text indexer using fts5 in sqlite.
The table consists of (Class, propname, itemid) instances as columns
along with a textblob column. The textblob column is searched using
MATCH and the instances returned.

sqlite test commands to manage schema version change required by
this update.

-- check length before and after
select length(schema) from schema;

-- reset from version 7 (with fts index) to version 6
 update schema set schema = (select replace(schema,
   '''version'': 7','''version'': 6') as new_schema from schema);

-- check version. Good thing it's at the front of the schema
 select substr(schema,0,15) from schema;
 {'version': 6,
"""

from roundup.backends.indexer_common import Indexer as IndexerBase
from roundup.cgi.exceptions import IndexerQueryError
from roundup.i18n import _

try:
    import sqlite3 as sqlite
    if sqlite.sqlite_version_info < (3, 9, 0):
        raise ValueError('sqlite minimum version for FTS5 is 3.9.0+ '
                         '- %s found' % sqlite.sqlite_version)
except ImportError:
    raise ValueError('Unable to import sqlite3 to support FTS.')


class Indexer(IndexerBase):
    def __init__(self, db):
        IndexerBase.__init__(self, db)
        self.db = db
        self.reindex = 0
        self.query_language = True

    def close(self):
        """close the indexing database"""
        # just nuke the circular reference
        self.db = None

    def save_index(self):
        """Save the changes to the index."""
        # not necessary - the RDBMS connection will handle this for us
        pass

    def force_reindex(self):
        """Force a reindexing of the database.  This essentially
        empties the __fts table and sets a flag so
        that the databases are reindexed"""
        self.reindex = 1

    def should_reindex(self):
        """returns True if the indexes need to be rebuilt"""
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        """ "identifier" is  (classname, itemid, property) """
        if mime_type != 'text/plain':
            return

        # Ensure all elements of the identifier are strings 'cos the itemid
        # column is varchar even if item ids may be numbers elsewhere in the
        # code. ugh.
        identifier = tuple(map(str, identifier))

        # removed pre-processing of text that incudes only words with:
        # self.minlength <= len(word) <= self.maxlength
        # Not sure if that is correct.

        # first, find the rowid of the (classname, itemid, property)
        a = self.db.arg  # arg is the token for positional parameters
        sql = 'select rowid from __fts where _class=%s and '\
            '_itemid=%s and _prop=%s' % (a, a, a)
        self.db.cursor.execute(sql, identifier)
        r = self.db.cursor.fetchone()
        if not r:
            # not previously indexed
            sql = 'insert into __fts (_class, _itemid, _prop, _textblob)'\
                ' values (%s, %s, %s, %s)' % (a, a, a, a)
            self.db.cursor.execute(sql, identifier + (text,))
        else:
            text_id = int(r[0])
            sql = 'update __fts set _textblob=%s where rowid=%s' % \
                  (a, a)
            self.db.cursor.execute(sql, (text, text_id))

    def find(self, wordlist):
        """look up all the words in the wordlist.
           For testing wordlist is actually a list.
           In production, wordlist is a list of a single string
           that is a sqlite MATCH query.

           https://www.sqlite.org/fts5.html#full_text_query_syntax
        """

        # Filter out stopwords. Other searches tokenize the user query
        # into an list of simple word tokens. For fTS, query
        # tokenization doesn't occur.

        # A user's FTS query is a wordlist with one element.  The
        # element is a string to parse and will probably not match a
        # stop word.
        #
        # However the generic indexer search tests pass in a list of
        # word tokens. We filter the word tokens so it behaves like
        # other backends.  This means that a search for a simple word
        # like 'the' (without quotes) will return no hits, as the test
        # expects.
        wordlist = [w for w in wordlist if not self.is_stopword(w.upper())]

        if not wordlist:
            return []

        a = self.db.arg  # arg is the token for positional parameters

        # removed filtering of word in wordlist to include only
        # words with:  self.minlength <= len(word) <= self.maxlength

        sql = 'select _class, _itemid, _prop from __fts '\
              'where _textblob MATCH %s' % a

        try:
            # tests supply a multi element word list. Join them.
            self.db.cursor.execute(sql, (" ".join(wordlist),))
        except sqlite.OperationalError as e:
            if 'no such column' in e.args[0]:
                raise IndexerQueryError(
                    _("Search failed. Try quoting any terms that "
                      "include a '-' and retry the search."))

            raise IndexerQueryError(e.args[0].replace("fts5:",
                                                      "Query error:"))

        return self.db.cursor.fetchall()

Roundup Issue Tracker: http://roundup-tracker.org/