Mercurial > p > roundup > code
view roundup/cgi/TAL/markupbase.py @ 5525:bb7865241f8a
Make CSV import/export compatible across Python versions (also RDBMS journals) (issue 2550976, issue 2550975).
The roundup-admin export and import commands are used for migrating
between different database backends. It is desirable that they should
be usable also for migrations between Python 2 and Python 3, and in
some cases (e.g. with the anydbm backend) this may be required.
To be usable for such migrations, the format of the generated CSV
files needs to be stable, meaning the same as currently used with
Python 2. The export process uses repr() to produce the fields in the
CSV files and eval() to convert them back to Python data structures.
repr() of strings with non-ASCII characters produces different results
for Python 2 and Python 3.
This patch adds repr_export and eval_import functions to
roundup/anypy/strings.py which provide the required operations that
are just repr() and eval() in Python 2, but are more complicated in
Python 3 to use data representations compatible with Python 2. These
functions are then used in the required places for export and import.
repr() and eval() are also used in storing the dict of changed values
in the journal for the RDBMS backends. It is similarly desirable that
the database be compatible between Python 2 and Python 3, so that
export and import do not need to be used for a migration between
Python versions for non-anydbm back ends. Thus, this patch changes
rdbms_common.py in the places involved in storing journals in the
database, not just in those involved in import/export.
Given this patch, import/export with non-ASCII characters appear based
on some limited testing to work across Python versions, and an
instance using the sqlite backend appears to be compatible between
Python versions without needing import/export, *if* the sessions/otks
databases (which use anydbm) are deleted when changing Python version.
| author | Joseph Myers <jsm@polyomino.org.uk> |
|---|---|
| date | Sun, 02 Sep 2018 23:48:04 +0000 |
| parents | 12fe83f90f0d |
| children |
line wrap: on
line source
"""Shared support for scanning document type declarations in HTML and XHTML.""" import re, string _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match del re class ParserBase: """Parser base class which provides some common support methods used by the SGML/HTML and XHTML parsers.""" def reset(self): self.lineno = 1 self.offset = 0 def getpos(self): """Return current line number and offset.""" return self.lineno, self.offset def error(self, message): """Return an error, showing current line number and offset. Concrete subclasses *must* override this method. """ raise NotImplementedError # Internal -- update line number and offset. This should be # called for each piece of data exactly once, in order -- in other # words the concatenation of all the input strings to this # function should be exactly the entire input. def updatepos(self, i, j): if i >= j: return j rawdata = self.rawdata nlines = rawdata.count("\n", i, j) if nlines: self.lineno = self.lineno + nlines pos = rawdata.rindex("\n", i, j) # Should not fail self.offset = j-(pos+1) else: self.offset = self.offset + j-i return j _decl_otherchars = '' # Internal -- parse declaration (for use by subclasses). def parse_declaration(self, i): # This is some sort of declaration; in "HTML as # deployed," this should only be the document type # declaration ("<!DOCTYPE html...>"). rawdata = self.rawdata import sys j = i + 2 assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" if rawdata[j:j+1] in ("-", ""): # Start of comment followed by buffer boundary, # or just a buffer boundary. return -1 # in practice, this should look like: ((name|stringlit) S*)+ '>' n = len(rawdata) decltype, j = self._scan_name(j, i) if j < 0: return j if decltype == "doctype": self._decl_otherchars = '' while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] if decltype == "doctype": self.handle_decl(data) else: self.unknown_decl(data) return j + 1 if c in "\"'": m = _declstringlit_match(rawdata, j) if not m: return -1 # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": name, j = self._scan_name(j, i) elif c in self._decl_otherchars: j = j + 1 elif c == "[": if decltype == "doctype": j = self._parse_doctype_subset(j + 1, i) else: self.error("unexpected '[' char in declaration") else: self.error( "unexpected %s char in declaration" % repr(rawdata[j])) if j < 0: return j return -1 # incomplete # Internal -- scan past the internal subset in a <!DOCTYPE declaration, # returning the index just past any whitespace following the trailing ']'. def _parse_doctype_subset(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) j = i while j < n: c = rawdata[j] if c == "<": s = rawdata[j:j+2] if s == "<": # end of buffer; incomplete return -1 if s != "<!": self.updatepos(declstartpos, j + 1) self.error("unexpected char in internal subset (in %s)" % repr(s)) if (j + 2) == n: # end of buffer; incomplete return -1 if (j + 4) > n: # end of buffer; incomplete return -1 if rawdata[j:j+4] == "<!--": j = self.parse_comment(j, report=0) if j < 0: return j continue name, j = self._scan_name(j + 2, declstartpos) if j == -1: return -1 if name not in ("attlist", "element", "entity", "notation"): self.updatepos(declstartpos, j + 2) self.error( "unknown declaration %s in internal subset" % repr(name)) # handle the individual names meth = getattr(self, "_parse_doctype_" + name) j = meth(j, declstartpos) if j < 0: return j elif c == "%": # parameter entity reference if (j + 1) == n: # end of buffer; incomplete return -1 s, j = self._scan_name(j + 1, declstartpos) if j < 0: return j if rawdata[j] == ";": j = j + 1 elif c == "]": j = j + 1 while j < n and rawdata[j] in string.whitespace: j = j + 1 if j < n: if rawdata[j] == ">": return j self.updatepos(declstartpos, j) self.error("unexpected char after internal subset") else: return -1 elif c in string.whitespace: j = j + 1 else: self.updatepos(declstartpos, j) self.error("unexpected char %s in internal subset" % repr(c)) # end of buffer reached return -1 # Internal -- scan past <!ELEMENT declarations def _parse_doctype_element(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) name, j = self._scan_name(i, declstartpos) if j == -1: return -1 # style content model; just skip until '>' if '>' in rawdata[j:]: return rawdata.find(">", j) + 1 return -1 # Internal -- scan past <!ATTLIST declarations def _parse_doctype_attlist(self, i, declstartpos): rawdata = self.rawdata name, j = self._scan_name(i, declstartpos) c = rawdata[j:j+1] if c == "": return -1 if c == ">": return j + 1 while 1: # scan a series of attribute descriptions; simplified: # name type [value] [#constraint] name, j = self._scan_name(j, declstartpos) if j < 0: return j c = rawdata[j:j+1] if c == "": return -1 if c == "(": # an enumerated type; look for ')' if ")" in rawdata[j:]: j = rawdata.find(")", j) + 1 else: return -1 while rawdata[j:j+1].isspace(): j = j + 1 if not rawdata[j:]: # end of buffer, incomplete return -1 else: name, j = self._scan_name(j, declstartpos) c = rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 c = rawdata[j:j+1] if not c: return -1 if c == "#": if rawdata[j:] == "#": # end of buffer return -1 name, j = self._scan_name(j + 1, declstartpos) if j < 0: return j c = rawdata[j:j+1] if not c: return -1 if c == '>': # all done return j + 1 # Internal -- scan past <!NOTATION declarations def _parse_doctype_notation(self, i, declstartpos): name, j = self._scan_name(i, declstartpos) if j < 0: return j rawdata = self.rawdata while 1: c = rawdata[j:j+1] if not c: # end of buffer; incomplete return -1 if c == '>': return j + 1 if c in "'\"": m = _declstringlit_match(rawdata, j) if not m: return -1 j = m.end() else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan past <!ENTITY declarations def _parse_doctype_entity(self, i, declstartpos): rawdata = self.rawdata if rawdata[i:i+1] == "%": j = i + 1 while 1: c = rawdata[j:j+1] if not c: return -1 if c in string.whitespace: j = j + 1 else: break else: j = i name, j = self._scan_name(j, declstartpos) if j < 0: return j while 1: c = self.rawdata[j:j+1] if not c: return -1 if c in "'\"": m = _declstringlit_match(rawdata, j) if m: j = m.end() else: return -1 # incomplete elif c == ">": return j + 1 else: name, j = self._scan_name(j, declstartpos) if j < 0: return j # Internal -- scan a name token and the new position and the token, or # return -1 if we've reached the end of the buffer. def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return None, -1 m = _declname_match(rawdata, i) if m: s = m.group() name = s.strip() if (i + len(s)) == n: return None, -1 # end of buffer return name.lower(), m.end() else: self.updatepos(declstartpos, i) self.error("expected name token")
