Mercurial > p > roundup > code
changeset 5867:ee2e8f8d6648
Implement exact string search
.. in the 'filter' method of hyperdb.Class (and the corresponding
backend implementations).
| author | Ralf Schlatterbeck <rsc@runtux.com> |
|---|---|
| date | Mon, 26 Aug 2019 18:18:02 +0200 |
| parents | 04deafac71ab |
| children | d2fac1069028 |
| files | CHANGES.txt doc/design.txt roundup/backends/back_anydbm.py roundup/backends/back_mysql.py roundup/backends/back_sqlite.py roundup/backends/rdbms_common.py roundup/hyperdb.py test/db_test_base.py |
| diffstat | 8 files changed, 278 insertions(+), 132 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGES.txt Mon Aug 26 09:56:20 2019 +0200 +++ b/CHANGES.txt Mon Aug 26 18:18:02 2019 +0200 @@ -85,6 +85,7 @@ - issue2551043: Add X-Roundup-issue-id email header. Add a new header to make it easier to filter notification emails without having to parse the subject line. (John Rouillard) +- The database filter function now can also do an exact string search. Fixed:
--- a/doc/design.txt Mon Aug 26 09:56:20 2019 +0200 +++ b/doc/design.txt Mon Aug 26 18:18:02 2019 +0200 @@ -515,26 +515,45 @@ db.issue.find(messages={'1':1,'3':1}, files={'7':1}) """ - def filter(self, search_matches, filterspec, sort, group): + def filter(self, search_matches, filterspec, sort, group, + retired, exact_match_spec): """Return a list of the ids of the active nodes in this class that match the 'filter' spec, sorted by the group spec and then the - sort spec. + sort spec. The arguments sort, group, retired, and + exact_match_spec are optional. - "search_matches" is a container type + "search_matches" is a container type which by default is + None and optionally contains IDs of items to match. If + non-empty only IDs of the initial set are returned. "filterspec" is {propname: value(s)} + "exact_match_spec" is the same format as "filterspec" but + specifies exact match for the given propnames. This only + makes a difference for String properties, these specify case + insensitive substring search when in "filterspec" and exact + match when in exact_match_spec. "sort" and "group" are [(dir, prop), ...] where dir is '+', '-' or None and prop is a prop name or None. Note that for backward-compatibility reasons a single (dir, prop) tuple is also allowed. + The parameter retired when set to False, returns only live + (un-retired) results. When setting it to True, only retired + items are returned. If None, both retired and unretired + items are returned. The default is False, i.e. only live + items are returned by default. + The filter must match all properties specificed. If the property value to match is a list: 1. String properties must match all elements in the list, and 2. Other properties must match any of the elements in the list. + This also means that for strings in exact_match_spec it + doesn't make sense to specify multiple values because those + cannot all be matched. + The propname in filterspec and prop in a sort/group spec may be transitive, i.e., it may contain properties of the form link.link.link.name, e.g. you can search for all issues where
--- a/roundup/backends/back_anydbm.py Mon Aug 26 09:56:20 2019 +0200 +++ b/roundup/backends/back_anydbm.py Mon Aug 26 18:18:02 2019 +0200 @@ -25,7 +25,7 @@ import os, marshal, re, weakref, string, copy, time, shutil, logging from roundup.anypy.dbm_ import anydbm, whichdb -from roundup.anypy.strings import b2s, bs2b, repr_export, eval_import +from roundup.anypy.strings import b2s, bs2b, repr_export, eval_import, is_us from roundup import hyperdb, date, password, roundupdb, security, support from roundup.backends import locking @@ -1690,12 +1690,16 @@ return res def _filter(self, search_matches, filterspec, proptree, - num_re = re.compile(r'^\d+$'), retired=False): + num_re = re.compile(r'^\d+$'), retired=False, + exact_match_spec={}): """Return a list of the ids of the nodes in this class that match the 'filter' spec, sorted by the group spec and then the sort spec. "filterspec" is {propname: value(s)} + same for "exact_match_spec". The latter specifies exact matching + for String type while String types in "filterspec" are searched + for as case insensitive substring match. "sort" and "group" are (dir, prop) where dir is '+', '-' or None and prop is a prop name or None @@ -1726,82 +1730,86 @@ INTERVAL = 'spec:interval' OTHER = 'spec:other' - for k, v in filterspec.items(): - propclass = props[k] - if isinstance(propclass, hyperdb.Link): - if type(v) is not type([]): - v = [v] - u = [] - for entry in v: + for exact, filtertype in enumerate((filterspec, exact_match_spec)): + for k, v in filtertype.items(): + propclass = props[k] + if isinstance(propclass, hyperdb.Link): + if type(v) is not type([]): + v = [v] + u = [] + for entry in v: + # the value -1 is a special "not set" sentinel + if entry == '-1': + entry = None + u.append(entry) + l.append((LINK, k, u)) + elif isinstance(propclass, hyperdb.Multilink): # the value -1 is a special "not set" sentinel - if entry == '-1': - entry = None - u.append(entry) - l.append((LINK, k, u)) - elif isinstance(propclass, hyperdb.Multilink): - # the value -1 is a special "not set" sentinel - if v in ('-1', ['-1']): - v = [] - elif type(v) is not type([]): - v = [v] - l.append((MULTILINK, k, v)) - elif isinstance(propclass, hyperdb.String) and k != 'id': - if type(v) is not type([]): - v = [v] - for v in v: - # simple glob searching - v = re.sub(r'([\|\{\}\\\.\+\[\]\(\)])', r'\\\1', v) - v = v.replace('?', '.') - v = v.replace('*', '.*?') - l.append((STRING, k, re.compile(v, re.I))) - elif isinstance(propclass, hyperdb.Date): - try: - date_rng = propclass.range_from_raw(v, self.db) - l.append((DATE, k, date_rng)) - except ValueError: - # If range creation fails - ignore that search parameter - pass - elif isinstance(propclass, hyperdb.Interval): - try: - intv_rng = date.Range(v, date.Interval) - l.append((INTERVAL, k, intv_rng)) - except ValueError: - # If range creation fails - ignore that search parameter - pass + if v in ('-1', ['-1']): + v = [] + elif type(v) is not type([]): + v = [v] + l.append((MULTILINK, k, v)) + elif isinstance(propclass, hyperdb.String) and k != 'id': + if type(v) is not type([]): + v = [v] + for x in v: + if exact: + l.append((STRING, k, x)) + else: + # simple glob searching + x = re.sub(r'([\|\{\}\\\.\+\[\]\(\)])', r'\\\1', x) + x = x.replace('?', '.') + x = x.replace('*', '.*?') + l.append((STRING, k, re.compile(x, re.I))) + elif isinstance(propclass, hyperdb.Date): + try: + date_rng = propclass.range_from_raw(v, self.db) + l.append((DATE, k, date_rng)) + except ValueError: + # If range creation fails - ignore that search parameter + pass + elif isinstance(propclass, hyperdb.Interval): + try: + intv_rng = date.Range(v, date.Interval) + l.append((INTERVAL, k, intv_rng)) + except ValueError: + # If range creation fails - ignore that search parameter + pass - elif isinstance(propclass, hyperdb.Boolean): - if type(v) == type(""): - v = v.split(',') - if type(v) != type([]): - v = [v] - bv = [] - for val in v: - if type(val) is type(''): - bv.append(propclass.from_raw (val)) - else: - bv.append(val) - l.append((OTHER, k, bv)) + elif isinstance(propclass, hyperdb.Boolean): + if type(v) == type(""): + v = v.split(',') + if type(v) != type([]): + v = [v] + bv = [] + for val in v: + if type(val) is type(''): + bv.append(propclass.from_raw (val)) + else: + bv.append(val) + l.append((OTHER, k, bv)) - elif k == 'id': - if type(v) != type([]): - v = v.split(',') - l.append((OTHER, k, [str(int(val)) for val in v])) - - elif isinstance(propclass, hyperdb.Number): - if type(v) != type([]): - try : + elif k == 'id': + if type(v) != type([]): v = v.split(',') - except AttributeError : - v = [v] - l.append((OTHER, k, [float(val) for val in v])) + l.append((OTHER, k, [str(int(val)) for val in v])) - elif isinstance(propclass, hyperdb.Integer): - if type(v) != type([]): - try : - v = v.split(',') - except AttributeError : - v = [v] - l.append((OTHER, k, [int(val) for val in v])) + elif isinstance(propclass, hyperdb.Number): + if type(v) != type([]): + try : + v = v.split(',') + except AttributeError : + v = [v] + l.append((OTHER, k, [float(val) for val in v])) + + elif isinstance(propclass, hyperdb.Integer): + if type(v) != type([]): + try : + v = v.split(',') + except AttributeError : + v = [v] + l.append((OTHER, k, [int(val) for val in v])) filterspec = l @@ -1848,8 +1856,12 @@ elif t == STRING: if nv is None: nv = '' - # RE search - match = v.search(nv) + if is_us(v): + # Exact match + match = (nv == v) + else: + # RE search + match = v.search(nv) elif t == DATE or t == INTERVAL: if nv is None: match = v is None
--- a/roundup/backends/back_mysql.py Mon Aug 26 09:56:20 2019 +0200 +++ b/roundup/backends/back_mysql.py Mon Aug 26 18:18:02 2019 +0200 @@ -592,6 +592,7 @@ raise class MysqlClass: + case_sensitive_equal = 'COLLATE utf8_bin =' def supports_subselects(self): # TODO: AFAIK its version dependent for MySQL
--- a/roundup/backends/back_sqlite.py Mon Aug 26 09:56:20 2019 +0200 +++ b/roundup/backends/back_sqlite.py Mon Aug 26 18:18:02 2019 +0200 @@ -455,12 +455,13 @@ class sqliteClass: def filter(self, search_matches, filterspec, sort=(None,None), - group=(None,None), retired=False): + group=(None,None), retired=False, exact_match_spec={}): """ If there's NO matches to a fetch, sqlite returns NULL instead of nothing """ return [f for f in rdbms_common.Class.filter(self, search_matches, - filterspec, sort=sort, group=group, retired=retired) if f] + filterspec, sort=sort, group=group, retired=retired, + exact_match_spec=exact_match_spec) if f] class Class(sqliteClass, rdbms_common.Class): pass
--- a/roundup/backends/rdbms_common.py Mon Aug 26 09:56:20 2019 +0200 +++ b/roundup/backends/rdbms_common.py Mon Aug 26 18:18:02 2019 +0200 @@ -1515,6 +1515,10 @@ # We define the default here, can be changed in derivative class case_insensitive_like = 'LIKE' + # For some databases (mysql) the = operator for strings ignores case. + # We define the default here, can be changed in derivative class + case_sensitive_equal = '=' + def schema(self): """ A dumpable version of the schema that we can store in the database @@ -2399,7 +2403,7 @@ return where, v, True # True to indicate original def _filter_sql (self, search_matches, filterspec, srt=[], grp=[], retr=0, - retired=False): + retired=False, exact_match_spec={}): """ Compute the proptree and the SQL/ARGS for a filter. For argument description see filter below. We return a 3-tuple, the proptree, the sql and the sql-args @@ -2423,7 +2427,7 @@ # figure the WHERE clause from the filterspec mlfilt = 0 # are we joining with Multilink tables? sortattr = self._sortattr (group = grp, sort = srt) - proptree = self._proptree(filterspec, sortattr, retr) + proptree = self._proptree(exact_match_spec, filterspec, sortattr, retr) mlseen = 0 for pt in reversed(proptree.sortattr): p = pt @@ -2468,7 +2472,8 @@ gen_join = True if p.has_values and isinstance(v, type([])): - result = self._filter_multilink_expression(pln, tn, v) + result = self._filter_multilink_expression(pln, + tn, v) # XXX: We dont need an id join if we used the filter gen_join = len(result) == 3 @@ -2506,27 +2511,36 @@ rc = oc = ac = '_%s.id'%pln elif isinstance(propclass, String): if 'search' in p.need_for: + exact = [] if not isinstance(v, type([])): v = [v] - - # Quote special search characters '%' and '_' for - # correct matching with LIKE/ILIKE - # Note that we now pass the elements of v as query - # arguments and don't interpolate the quoted string - # into the sql statement. Should be safer. - v = [self.db.search_stringquote(s) for s in v] + new_v = [] + for x in v: + if isinstance(x, hyperdb.Exact_Match): + exact.append(True) + new_v.append(x.value) + else: + exact.append(False) + # Quote special search characters '%' and '_' for + # correct matching with LIKE/ILIKE + # Note that we now pass the elements of v as query + # arguments and don't interpolate the quoted string + # into the sql statement. Should be safer. + new_v.append(self.db.search_stringquote(x)) + v = new_v # now add to the where clause - where.append('(' - +' and '.join(["_%s._%s %s %s ESCAPE %s"%( - pln, - k, - self.case_insensitive_like, - a, - a) for s in v]) - +')') - for vv in v: - args.extend((vv, '\\')) + w = [] + for vv, ex in zip(v, exact): + if ex: + w.append("_%s._%s %s %s"%( + pln, k, self.case_sensitive_equal, a)) + args.append(vv) + else: + w.append("_%s._%s %s %s ESCAPE %s"%( + pln, k, self.case_insensitive_like, a, a)) + args.extend((vv, '\\')) + where.append ('(' + ' and '.join(w) + ')') if 'sort' in p.need_for: oc = ac = 'lower(_%s._%s)'%(pln, k) elif isinstance(propclass, Link): @@ -2711,7 +2725,7 @@ return proptree, sql, args def filter(self, search_matches, filterspec, sort=[], group=[], - retired=False): + retired=False, exact_match_spec={}): """Return a list of the ids of the active nodes in this class that match the 'filter' spec, sorted by the group spec and then the sort spec @@ -2735,7 +2749,8 @@ start_t = time.time() sq = self._filter_sql (search_matches, filterspec, sort, group, - retired=retired) + retired=retired, + exact_match_spec=exact_match_spec) # nothing to match? if sq is None: return [] @@ -2759,7 +2774,7 @@ return l def filter_iter(self, search_matches, filterspec, sort=[], group=[], - retired=False): + retired=False, exact_match_spec={}): """Iterator similar to filter above with same args. Limitation: We don't sort on multilinks. This uses an optimisation: We put all nodes that are in the @@ -2769,7 +2784,8 @@ cache. We're using our own temporary cursor. """ sq = self._filter_sql(search_matches, filterspec, sort, group, retr=1, - retired=retired) + retired=retired, + exact_match_spec=exact_match_spec) # nothing to match? if sq is None: return
--- a/roundup/hyperdb.py Mon Aug 26 09:56:20 2019 +0200 +++ b/roundup/hyperdb.py Mon Aug 26 18:18:02 2019 +0200 @@ -342,6 +342,12 @@ raise DesignatorError(_('"%s" not a node designator')%designator) return m.group(1), m.group(2) +class Exact_Match(object): + """ Used to encapsulate exact match semantics search values + """ + def __init__(self, value): + self.value = value + class Proptree(object): """ Simple tree data structure for property lookup. Each node in the tree is a roundup Class Property that has to be navigated to @@ -461,13 +467,29 @@ simple _filter call which does the real work """ filterspec = {} + exact_match_spec = {} for p in self.children: if 'search' in p.need_for: if p.children: p.search(sort = False) - filterspec[p.name] = p.val + if isinstance(p.val, type([])): + exact = [] + subst = [] + for v in p.val: + if isinstance(v, Exact_Match): + exact.append(v.value) + else: + subst.append(v) + if exact: + exact_match_spec[p.name] = exact + if subst: + filterspec[p.name] = subst + else: + assert not isinstance(p.val, Exact_Match) + filterspec[p.name] = p.val self.val = self.cls._filter(search_matches, filterspec, sort and self, - retired=retired) + retired=retired, + exact_match_spec=exact_match_spec) return self.val def sort (self, ids=None): @@ -555,15 +577,28 @@ return ids def _set_val(self, val): - """Check if self._val is already defined. If yes, we compute the - intersection of the old and the new value(s) + """ Check if self._val is already defined. If yes, we compute the + intersection of the old and the new value(s) + Note: If self is a Leaf node we need to compute a + union: Normally we intersect (logical and) different + subqueries into a Link or Multilink property. But for + leaves we might have a part of a query in a filterspec and + in an exact_match_spec. These have to be all there, the + generated search will ensure a logical and of all tests for + equality/substring search. """ if self.has_values: v = self._val if not isinstance(self._val, type([])): v = [self._val] vals = set(v) - vals.intersection_update(val) + if not isinstance(val, type([])): + val = [val] + # if cls is None we're a leaf + if self.cls: + vals.intersection_update(val) + else: + vals.update(val) self._val = [v for v in vals] else: self._val = val @@ -1261,32 +1296,43 @@ raise NotImplementedError def _filter(self, search_matches, filterspec, sort=(None,None), - group=(None,None), retired=False): + group=(None,None), retired=False, exact_match_spec={}): """For some backends this implements the non-transitive search, for more information see the filter method. """ raise NotImplementedError - def _proptree(self, filterspec, sortattr=[], retr=False): + def _proptree(self, exact_match_spec, filterspec, sortattr=[], retr=False): """Build a tree of all transitive properties in the given - filterspec. + exact_match_spec/filterspec. If we retrieve (retr is True) linked items we don't follow across multilinks. We also don't follow if the searched value can contain NULL values. """ proptree = Proptree(self.db, self, '', self.getprops(), retr=retr) - for key, v in filterspec.items(): - keys = key.split('.') - p = proptree - mlseen = False - for k in keys: - if isinstance (p.propclass, Multilink): - mlseen = True - isnull = v == '-1' or v is None - nullin = isinstance(v, type([])) and ('-1' in v or None in v) - r = retr and not mlseen and not isnull and not nullin - p = p.append(k, retr=r) - p.val = v + for exact, spec in enumerate((filterspec, exact_match_spec)): + for key, v in spec.items(): + keys = key.split('.') + p = proptree + mlseen = False + for k in keys: + if isinstance (p.propclass, Multilink): + mlseen = True + isnull = v == '-1' or v is None + islist = isinstance(v, type([])) + nullin = islist and ('-1' in v or None in v) + r = retr and not mlseen and not isnull and not nullin + p = p.append(k, retr=r) + if exact: + if isinstance(v, type([])): + vv = [] + for x in v: + vv.append(Exact_Match(x)) + p.val = vv + else: + p.val = [Exact_Match(v)] + else: + p.val = v multilinks = {} for s in sortattr: keys = s[1].split('.') @@ -1353,19 +1399,32 @@ return sortattr def filter(self, search_matches, filterspec, sort=[], group=[], - retired=False): + retired=False, exact_match_spec={}): """Return a list of the ids of the active nodes in this class that match the 'filter' spec, sorted by the group spec and then the sort spec. + "search_matches" is a container type which by default is None + and optionally contains IDs of items to match. If non-empty only + IDs of the initial set are returned. + "filterspec" is {propname: value(s)} + "exact_match_spec" is the same format as "filterspec" but + specifies exact match for the given propnames. This only makes a + difference for String properties, these specify case insensitive + substring search when in "filterspec" and exact match when in + exact_match_spec. "sort" and "group" are [(dir, prop), ...] where dir is '+', '-' or None and prop is a prop name or None. Note that for backward-compatibility reasons a single (dir, prop) tuple is also allowed. - "search_matches" is a container type + The parameter retired when set to False, returns only live + (un-retired) results. When setting it to True, only retired + items are returned. If None, both retired and unretired items + are returned. The default is False, i.e. only live items are + returned by default. The filter must match all properties specificed. If the property value to match is a list: @@ -1373,11 +1432,15 @@ 1. String properties must match all elements in the list, and 2. Other properties must match any of the elements in the list. - Note that now the propname in filterspec and prop in a - sort/group spec may be transitive, i.e., it may contain - properties of the form link.link.link.name, e.g. you can search - for all issues where a message was added by a certain user in - the last week with a filterspec of + This also means that for strings in exact_match_spec it doesn't + make sense to specify multiple values because those cannot all + be matched exactly. + + The propname in filterspec and prop in a sort/group spec may be + transitive, i.e., it may contain properties of the form + link.link.link.name, e.g. you can search for all issues where a + message was added by a certain user in the last week with a + filterspec of {'messages.author' : '42', 'messages.creation' : '.-1w;'} Implementation note: @@ -1388,7 +1451,7 @@ override the filter method instead of implementing _filter. """ sortattr = self._sortattr(sort = sort, group = group) - proptree = self._proptree(filterspec, sortattr) + proptree = self._proptree(exact_match_spec, filterspec, sortattr) proptree.search(search_matches, retired=retired) return proptree.sort()
--- a/test/db_test_base.py Mon Aug 26 09:56:20 2019 +0200 +++ b/test/db_test_base.py Mon Aug 26 18:18:02 2019 +0200 @@ -1749,6 +1749,39 @@ ae(filt(None, {'title': ['One', 'Two']}, ('+','id'), (None,None)), []) + def testFilteringStringExactMatch(self): + ae, filter, filter_iter = self.filteringSetup() + # Change title of issue2 to 'issue' so we can test substring + # search vs exact search + self.db.issue.set('2', title='issue') + #self.db.commit() + for filt in filter, filter_iter: + ae(filt(None, {}, exact_match_spec = + {'title': ['one']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['issue one']}), ['1']) + ae(filt(None, {}, exact_match_spec = + {'title': ['issue', 'one']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['issue']}), ['2']) + ae(filt(None, {}, exact_match_spec = + {'title': ['one', 'two']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['One']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['Issue One']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['ISSUE', 'ONE']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['iSSUE']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['One', 'Two']}), []) + ae(filt(None, {}, exact_match_spec = + {'title': ['non four']}), ['4']) + # Both, filterspec and exact_match_spec on same prop + ae(filt(None, {'title': 'iSSUE'}, exact_match_spec = + {'title': ['issue']}), ['2']) + def testFilteringSpecialChars(self): """ Special characters in SQL search are '%' and '_', some used to lead to a traceback.
