Mercurial > p > roundup > code
view roundup/cgi/TAL/markupbase.py @ 6593:e70e2789bc2c
issue2551189 - increase text search maxlength
This removes I think all the magic references to 25 and 30 (varchar
size) and replaces them with references to maxlength or maxlength+5.
I am not sure why the db column is 5 characters larger than the size
of what should be the max size of a word, but I'll keep the buffer
of 5 as making it 1/5 the size of maxlength makes less sense.
Also added tests for fts search in templating which were missing.
Added postgres, mysql and sqlite native indexing backends in which to
test fts. Added fts test to native-fts as well to make sure it's
working.
I want to commit this now for CI.
Todo:
add test cases for the use of FTS in the csv output in
actions.py. There is no test coverage of the match case there.
change maxlength to a higher value (50) as requested in the ticket.
Modify existing extremewords test cases to allow words > 25 and < 51
write code to migrate column sizes for mysql and postgresql to match
maxlength I will roll this into the version 7 schema update that
supports use of database fts support.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Tue, 25 Jan 2022 13:22:00 -0500 |
| parents | 12fe83f90f0d |
| children |
line wrap: on
line source
"""Shared support for scanning document type declarations in HTML and XHTML.""" import re, string _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match del re class ParserBase: """Parser base class which provides some common support methods used by the SGML/HTML and XHTML parsers.""" def reset(self): self.lineno = 1 self.offset = 0 def getpos(self): """Return current line number and offset.""" return self.lineno, self.offset def error(self, message): """Return an error, showing current line number and offset. Concrete subclasses *must* override this method. """ raise NotImplementedError # Internal -- update line number and offset. This should be # called for each piece of data exactly once, in order -- in other # words the concatenation of all the input strings to this # function should be exactly the entire input. def updatepos(self, i, j): if i >= j: return j rawdata = self.rawdata nlines = rawdata.count("\n", i, j) if nlines: self.lineno = self.lineno + nlines pos = rawdata.rindex("\n", i, j) # Should not fail self.offset = j-(pos+1) else: self.offset = self.offset + j-i return j _decl_otherchars = '' # Internal -- parse declaration (for use by subclasses). def parse_declaration(self, i): # This is some sort of declaration; in "HTML as # deployed," this should only be the document type # declaration ("<!DOCTYPE html...>"). rawdata = self.rawdata import sys j = i + 2 assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" if rawdata[j:j+1] in ("-", ""): # Start of comment followed by buffer boundary, # or just a buffer boundary. return -1 # in practice, this should look like: ((name|stringlit) S*)+ '>' n = len(rawdata) decltype, j = self._scan_name(j, i) if j < 0: return j if decltype == "doctype": self._decl_otherchars = '' while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] if decltype == "doctype": self.handle_decl(data) else: self.unknown_decl(data) return j + 1 if c in "\"'": m = _declstringlit_match(rawdata, j) if not m: return -1 # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": name, j = self._scan_name(j, i) elif c in self._decl_otherchars: j = j + 1 elif c == "[": if decltype == "doctype": j = self._parse_doctype_subset(j + 1, i) else: self.error("unexpected '[' char in declaration") else: self.error( "unexpected %s char in declaration" % repr(rawdata[j])) if j < 0: return j return -1 # incomplete # Internal -- scan past the internal subset in a <!DOCTYPE declaration, # returning the index just past any whitespace following the trailing ']'. def _parse_doctype_subset(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) j = i while j < n: c = rawdata[j] if c == "<": s = rawdata[j:j+2] if s == "<": # end of buffer; incomplete return -1 if s != "<!": self.updatepos(declstartpos, j + 1) self.error("unexpected char in internal subset (in %s)" % repr(s)) if (j + 2) == n: # end of buffer; incomplete return -1 if (j + 4) > n: # end of buffer; incomplete return -1 if rawdata[j:j+4] == "<!--": j = self.parse_comment(j, report=0) if j < 0: return j continue name, j = self._scan_name(j + 2, declstartpos) if j == -1: return -1 if name not in ("attlist", "element", "entity", "notation"): self.updatepos(declstartpos, j + 2) self.error( "unknown declaration %s in internal subset" % repr(name)) # handle the individual names meth = getattr(self, "_parse_doctype_" + name) j = meth(j, declstartpos) if j < 0: return j elif c == "%": # parameter entity reference if (j + 1) == n: # end of buffer; incomplete return -1 s, j = self._scan_name(j + 1, declstartpos) if j < 0: return j if rawdata[j] == ";": j = j + 1 elif c == "]": j = j + 1 while j < n and rawdata[j] in string.whitespace: j = j + 1 if j < n: if rawdata[j] == ">": return j self.updatepos(declstartpos, j) self.error("unexpected char after internal subset") else: return -1 elif c in string.whitespace: j = j + 1 else: self.updatepos(declstartpos, j) self.error("unexpected char %s in internal subset" % repr(c)) # end of buffer reached return -1 # Internal -- scan past <!ELEMENT declarations def _parse_doctype_element(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) name, j = self._scan_name(i, declstartpos) if j == -1: return -1 # style content model; just skip until '>' if '>' in rawdata[j:]: return rawdata.find(">", j) + 1 return -1 # Internal -- scan past <!ATTLIST declarations def _parse_doctype_attlist(self, i, declstartpos): rawdata = self.rawdata name, j = self._scan_name(i, declstartpos) c = rawdata[j:j+1] if c == "": return -1 if c == ">": return j + 1 while 1: # scan a series of attribute descriptions; simplified: # name type [value] [#constraint] name, j = self._scan_name(j, declstartpos) if j < 0: return j c = rawdata[j:j+1] if c == "": return -1 if c == "(": # an enumerated type; look for ')' if ")" in rawdata[j:]: j = rawdata.find(")", j) + 1 else: return -1 while rawdata[j:j+1].isspace(): j = j + 1 if not rawdata[j:]: # end of buffer, incomplete return -1 else: name, j = self._scan_name(j, declstartpos) c = rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 c = rawdata[j:j+1] if not c: return -1 if c == "#": if rawdata[j:] == "#": # end of buffer return -1 name, j = self._scan_name(j + 1, declstartpos) if j < 0: return j c = rawdata[j:j+1] if not c: return -1 if c == '>': # all done return j + 1 # Internal -- scan past <!NOTATION declarations def _parse_doctype_notation(self, i, declstartpos): name, j = self._scan_name(i, declstartpos) if j < 0: return j rawdata = self.rawdata while 1: c = rawdata[j:j+1] if not c: # end of buffer; incomplete return -1 if c == '>': return j + 1 if c in "'\"": m = _declstringlit_match(rawdata, j) if not m: return -1 j = m.end() else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan past <!ENTITY declarations def _parse_doctype_entity(self, i, declstartpos): rawdata = self.rawdata if rawdata[i:i+1] == "%": j = i + 1 while 1: c = rawdata[j:j+1] if not c: return -1 if c in string.whitespace: j = j + 1 else: break else: j = i name, j = self._scan_name(j, declstartpos) if j < 0: return j while 1: c = self.rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 # incomplete elif c == ">": return j + 1 else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan a name token and the new position and the token, or # return -1 if we've reached the end of the buffer. def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return None, -1 m = _declname_match(rawdata, i) if m: s = m.group() name = s.strip() if (i + len(s)) == n: return None, -1 # end of buffer return name.lower(), m.end() else: self.updatepos(declstartpos, i) self.error("expected name token")
