Mercurial > p > roundup > code
view roundup/cgi/TAL/HTMLParser.py @ 8180:d02ce1d14acd
feat: issue2551068 - Provide way to retrieve file/msg data via rest endpoint.
Use Allow header to change format of /binary_content endpoint. If
Allow header for endpoint is not application/json, it will be matched
against the mime type for the file. */*, text/* are supported and will
return the native mime type if present.
Changes:
move */* mime type from static dict of supported types. It was
hardcoded to return json only. Now it can return a matching
non-json mime type for the /binary_content endpoint.
Edited some errors to explicitly add */* mime type.
Cleanups to use ', ' separation in lists of valid mime types rather
than just space separated.
Remove ETag header when sending raw content. See issue 2551375 for
background.
Doc added to rest.txt.
Small format fix up (add dash) in CHANGES.txt.
Make passing an unset/None/False accept_mime_type to
format_dispatch_output a 500 error. This used to be the fallback
to produce a 406 error after all processing had happened. It
should no longer be possible to take that code path as all 406
errors (with valid accept_mime_types) are generated before
processing takes place.
Make format_dispatch_output handle output other than json/xml so it
can send back binary_content data.
Removed a spurious client.response_code = 400 that seems to not be
used.
Tests added for all code paths.
Database setup for tests msg and file entry. This required a file
upload test to change so it doesn't look for file1 as the link
returned by the upload. Download the link and verify the data
rather than verifying the link.
Multiple formatting changes to error messages to make all lists of
valid mime types ', ' an not just space separated.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 08 Dec 2024 17:22:33 -0500 |
| parents | 936275dfe1fa |
| children |
line wrap: on
line source
"""A parser for HTML and XHTML.""" # This file is based on sgmllib.py, but the API is slightly different. # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) # and CDATA (character data -- only end tags are special). from . import markupbase import re # Regular expressions used for parsing interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\Z)') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') endtagopen = re.compile('</') commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name (?:\s*=\s* # value indicator (?:'[^']*' # LITA-enclosed value |\"[^\"]*\" # LIT-enclosed value |[^'\">\s]+ # bare value ) )? ) )* \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile(r'>') endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') class HTMLParseError(BaseException): """Exception raised for all parse errors.""" def __init__(self, msg, position=(None, None)): assert msg self.msg = msg self.lineno = position[0] self.offset = position[1] def __str__(self): result = self.msg if self.lineno is not None: result = result + ", at line %d" % self.lineno if self.offset is not None: result = result + ", column %d" % (self.offset + 1) return result def _contains_at(s, sub, pos): return s[pos:pos+len(sub)] == sub class HTMLParser(markupbase.ParserBase): """Find tags and other markup and call handler functions. Usage: p = HTMLParser() p.feed(data) ... p.close() Start tags are handled by calling self.handle_starttag() or self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data may be split up in arbitrary chunks). Entity references are passed by calling self.handle_entityref() with the entity reference as the argument. Numeric character references are passed to self.handle_charref() with the string containing the reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") def __init__(self): """Initialize and reset this instance.""" self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.stack = [] self.lasttag = '???' self.interesting = interesting_normal markupbase.ParserBase.reset(self) def feed(self, data): """Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). """ self.rawdata = self.rawdata + data self.goahead(0) def close(self): """Handle any buffered data.""" self.goahead(1) def error(self, message): raise HTMLParseError(message, self.getpos()) __starttag_text = None def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text cdata_endtag = None def set_cdata_mode(self, endtag=None): self.cdata_endtag = endtag self.interesting = interesting_cdata def clear_cdata_mode(self): self.cdata_endtag = None self.interesting = interesting_normal # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: j = n if i < j: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break if rawdata[i] == '<': if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif endtagopen.match(rawdata, i): # </ k = self.parse_endtag(i) elif _contains_at(rawdata, "<!--", i): # <!-- k = self.parse_comment(i) elif _contains_at(rawdata, "<!", i): # <! k = self.parse_declaration(i) elif _contains_at(rawdata, "<?", i): # <? k = self.parse_pi(i) elif _contains_at(rawdata, "<?", i): # <! k = self.parse_declaration(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 else: break if k < 0: if end: self.error("EOF in middle of construct") break i = self.updatepos(i, k) elif rawdata[i:i+2] == "&#": match = charref.match(rawdata, i) if match: name = match.group()[2:-1] self.handle_charref(name) k = match.end() if rawdata[k-1] != ';': k = k - 1 i = self.updatepos(i, k) continue else: break elif rawdata[i] == '&': match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if rawdata[k-1] != ';': k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: # match.group() will contain at least 2 chars rest = rawdata[i:] if end and match.group() == rest: self.error("EOF in middle of entity or char ref") # incomplete break elif (i + 1) < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") i = self.updatepos(i, i + 1) else: break else: assert 0, "interesting.search() lied" # end while if end and i < n: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] # Internal -- parse comment, return end or -1 if not terminated def parse_comment(self, i, report=1): rawdata = self.rawdata assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()' match = commentclose.search(rawdata, i+4) if not match: return -1 if report: j = match.start() self.handle_comment(rawdata[i+4: j]) j = match.end() return j # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' match = piclose.search(rawdata, i+2) # > if not match: return -1 j = match.start() self.handle_pi(rawdata[i+2: j]) j = match.end() return j # Internal -- handle starttag, return end or -1 if not terminated def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] # Now parse the data between i+1 and j into a tag and attrs attrs = [] match = tagfind.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = rawdata[i+1:k].lower() while k < endpos: m = attrfind.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) if not rest: attrvalue = None elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() end = rawdata[k:endpos].strip() if end not in (">", "/>"): lineno, offset = self.getpos() if "\n" in self.__starttag_text: lineno = lineno + self.__starttag_text.count("\n") offset = len(self.__starttag_text) \ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) self.error("junk characters in start tag: %s" % repr(rawdata[k:endpos][:20])) if end[-2:] == '/>': # XHTML-style empty tag: <span attr="value" /> self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos # Internal -- check to see if we have a complete starttag; return end # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata m = locatestarttagend.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] if next == ">": return j + 1 if next == "/": s = rawdata[j:j+2] if s == "/>": return j + 2 if s == "/": # buffer boundary return -1 # else bogus input self.updatepos(i, j + 1) self.error("malformed empty start tag") if next == "": # end of input return -1 if next in ("abcdefghijklmnopqrstuvwxyz=/" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 self.updatepos(i, j) self.error("malformed start tag") raise AssertionError("we should not get here!") # Internal -- parse endtag, return end or -1 if incomplete def parse_endtag(self, i): rawdata = self.rawdata assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" match = endendtag.search(rawdata, i+1) # > if not match: return -1 j = match.end() match = endtagfind.match(rawdata, i) # </ + tag + > if not match: self.error("bad end tag: %s" % repr(rawdata[i:j])) tag = match.group(1).lower() if ( self.cdata_endtag is not None and tag != self.cdata_endtag): # Should be a mismatched end tag, but we'll treat it # as text anyway, since most HTML authors aren't # interested in the finer points of syntax. self.handle_data(match.group(0)) else: self.handle_endtag(tag) self.clear_cdata_mode() return j # Overridable -- finish processing of start+end tag: <tag.../> def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) self.handle_endtag(tag) # Overridable -- handle start tag def handle_starttag(self, tag, attrs): pass # Overridable -- handle end tag def handle_endtag(self, tag): pass # Overridable -- handle character reference def handle_charref(self, name): pass # Overridable -- handle entity reference def handle_entityref(self, name): pass # Overridable -- handle data def handle_data(self, data): pass # Overridable -- handle comment def handle_comment(self, data): pass # Overridable -- handle declaration def handle_decl(self, decl): pass # Overridable -- handle processing instruction def handle_pi(self, data): pass def unknown_decl(self, data): self.error("unknown declaration: " + repr(data)) # Internal -- helper to remove special character quoting def unescape(self, s): if '&' not in s: return s s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace("'", "'") s = s.replace(""", '"') s = s.replace("&", "&") # Must be last return s
