Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Lib/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def escape(s, quote=True):
return s


# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state

_invalid_charrefs = {
0x00: '\ufffd', # REPLACEMENT CHARACTER
Expand Down
9 changes: 6 additions & 3 deletions Lib/html/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']


# maps the HTML entity name to the Unicode code point
# from https://html.spec.whatwg.org/multipage/named-characters.html
# maps HTML4 entity name to the Unicode code point
name2codepoint = {
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
Expand Down Expand Up @@ -261,7 +260,11 @@
}


# maps the HTML5 named character references to the equivalent Unicode character(s)
# HTML5 named character references
# Generated by Tools/build/parse_html5_entities.py
# from https://html.spec.whatwg.org/entities.json and
# https://html.spec.whatwg.org/multipage/named-characters.html.
# Map HTML5 named character references to the equivalent Unicode character(s).
html5 = {
'Aacute': '\xc1',
'aacute': '\xe1',
Expand Down
29 changes: 24 additions & 5 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import _markupbase

from html import unescape
from html.entities import html5 as html5_entities


__all__ = ['HTMLParser']
Expand All @@ -23,6 +24,7 @@

# '&', then a name (a letter followed by letters, digits, '-' or '.'),
# then exactly one character that is not an ASCII letter or digit.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#', then decimal digits or 'x'/'X' plus hex digits, then exactly one
# character that is not a hex digit.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# '&', then a decimal reference ('#' + digits), a hex reference ('#x' + hex
# digits) or a purely alphanumeric name, optionally followed by ';' or '='.
# The optional trailing character is part of the overall match (group 0),
# not of group 1.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')

# '<' immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
# A single '>' character.
piclose = re.compile('>')
Expand Down Expand Up @@ -57,6 +59,22 @@
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Character reference processing logic specific to attribute values
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
def _replace_attr_charref(match):
    """Return the replacement text for one character reference match.

    *match* is a regex match whose full text (group 0) starts with '&'
    and may end with ';' or '='.

    * Numeric and hexadecimal references ('&#...') are always unescaped.
    * Named references are unescaped only when the text after '&' is an
      exact key of ``html5_entities`` and the match does not end with '='.
    * Anything else is returned unchanged.
    """
    text = match.group(0)
    is_numeric = text.startswith('&#')
    # The lookup key keeps the trailing ';' when present, so a terminated
    # name and an unterminated name are checked as distinct keys.
    is_exact_named = not text.endswith('=') and text[1:] in html5_entities
    return unescape(text) if (is_numeric or is_exact_named) else text

def _unescape_attrvalue(s):
    """Return *s* with its character references processed.

    Every match of ``attr_charref`` in *s* is handed to
    ``_replace_attr_charref``, which decides whether that reference is
    converted or left as written.
    """
    converted = attr_charref.sub(_replace_attr_charref, s)
    return converted


class HTMLParser(_markupbase.ParserBase):
Expand Down Expand Up @@ -89,6 +107,7 @@ def __init__(self, *, convert_charrefs=True):
If convert_charrefs is True (the default), all character references
are automatically converted to the corresponding Unicode characters.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
self.reset()

Expand All @@ -98,7 +117,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
_markupbase.ParserBase.reset(self)
super().reset()

def feed(self, data):
r"""Feed data to the parser.
Expand Down Expand Up @@ -241,7 +260,7 @@ def goahead(self, end):
else:
assert 0, "interesting.search() lied"
# end while
if end and i < n and not self.cdata_elem:
if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n]))
else:
Expand All @@ -259,7 +278,7 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+3] == '<![':
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
Expand All @@ -276,7 +295,7 @@ def parse_html_declaration(self, i):
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
'parse_comment()')
'parse_bogus_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
Expand Down Expand Up @@ -322,7 +341,7 @@ def parse_starttag(self, i):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = unescape(attrvalue)
attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()

Expand Down
148 changes: 123 additions & 25 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import pprint
import unittest

from unittest.mock import patch


class EventCollector(html.parser.HTMLParser):

Expand Down Expand Up @@ -315,6 +317,16 @@ def get_events(self):
("endtag", element_lower)],
collector=Collector(convert_charrefs=False))

def test_EOF_in_cdata(self):
    """Input that ends inside an unclosed <script> yields one data event."""
    content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""
    # The <script> element is never closed, so everything after the start
    # tag is expected back verbatim as a single 'data' event.
    expected_events = [
        ("starttag", 'script', []),
        ("data", content),
    ]
    self._run_check(f'<script>{content}', expected_events)

def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'
Expand Down Expand Up @@ -346,18 +358,16 @@ def test_convert_charrefs(self):
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs)
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
# check charrefs in the middle of the text/attributes
expected = [('starttag', 'a', [('href', 'foo"zar')]),
('data', 'a"z'), ('endtag', 'a')]
# check charrefs in the middle of the text
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
for charref in charrefs:
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
self._run_check('<a>a{0}z</a>'.format(charref),
expected, collector=collector())
# check charrefs at the beginning/end of the text/attributes
expected = [('data', '"'),
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
# check charrefs at the beginning/end of the text
expected = [('data', '"'), ('starttag', 'a', []),
('data', '"'), ('endtag', 'a'), ('data', '"')]
for charref in charrefs:
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
self._run_check('{0}<a>'
'{0}</a>{0}'.format(charref),
expected, collector=collector())
# check charrefs in <script>/<style> elements
Expand All @@ -380,6 +390,35 @@ def test_convert_charrefs(self):
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())

def test_convert_charrefs_in_attribute_values(self):
    """Check which character references are converted in attribute values."""
    # convert_charrefs is expected to default to True
    make_collector = lambda: EventCollectorCharrefs()
    self.assertTrue(make_collector().convert_charrefs)

    # One start tag carrying the reference in six positions: alone, at the
    # end, at the start, in the middle, before a space, before an '='.
    template = ('<a x="{0}" x="z{0}" x="{0}z" '
                ' x="z{0}z" x="{0} z" x="{0}=z"></a>')

    # Terminated entity refs and numeric/hex char refs are always
    # unescaped, wherever they appear and whatever character follows.
    always_converted = [
        ('starttag', 'a',
         [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
          ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
        ('endtag', 'a'),
    ]
    for reference in ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']:
        self._run_check(template.format(reference), always_converted,
                        collector=make_collector())

    # An unterminated entity ref is left untouched when it is followed by
    # an alphanumeric character or an equals sign.
    unterminated_result = [
        ('starttag', 'a',
         [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
          ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
        ('endtag', 'a'),
    ]
    self._run_check(template.format('&cent'), unterminated_result,
                    collector=make_collector())

# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kinds of broken markup
def test_tolerant_parsing(self):
Expand Down Expand Up @@ -537,52 +576,99 @@ def test_EOF_in_charref(self):
for html, expected in data:
self._run_check(html, expected)

def test_broken_comments(self):
def test_EOF_in_comments_or_decls(self):
data = [
('<!', [('data', '<!')]),
('<!-', [('data', '<!-')]),
('<!--', [('data', '<!--')]),
('<![', [('data', '<![')]),
('<![CDATA[', [('data', '<![CDATA[')]),
('<![CDATA[x', [('data', '<![CDATA[x')]),
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
]
for html, expected in data:
self._run_check(html, expected)
def test_bogus_comments(self):
html = ('<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!><!<-- this was an empty comment>'
'<!!! another bogus comment !!!>')
'<!!! another bogus comment !!!>'
# see #32876
'<![with square brackets]!>'
'<![\nmultiline\nbogusness\n]!>'
'<![more brackets]-[and a hyphen]!>'
'<![cdata[should be uppercase]]>'
'<![CDATA [whitespaces are not ignored]]>'
'<![CDATA]]>' # required '[' after CDATA
)
expected = [
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
('comment', ' -- close enough --'),
('comment', ''),
('comment', '<-- this was an empty comment'),
('comment', '!! another bogus comment !!!'),
('comment', '[with square brackets]!'),
('comment', '[\nmultiline\nbogusness\n]!'),
('comment', '[more brackets]-[and a hyphen]!'),
('comment', '[cdata[should be uppercase]]'),
('comment', '[CDATA [whitespaces are not ignored]]'),
('comment', '[CDATA]]'),
]
self._run_check(html, expected)

def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>'
# and they are considered bogus comments according to
# "8.2.4.42. Markup declaration open state"
html = ('<![if !(IE)]>broken condcom<![endif]>'
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
'<![if !ie 6]><b>foo</b><![endif]>'
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
# and "8.2.4.45 Markup declaration open state", comment tokens should
# be emitted instead of 'unknown decl', but calling unknown_decl
# provides more flexibility.
# See also Lib/_markupbase.py:parse_declaration
expected = [
('unknown decl', 'if !(IE)'),
('comment', '[if !(IE)]'),
('data', 'broken condcom'),
('unknown decl', 'endif'),
('unknown decl', 'if ! IE'),
('comment', '[endif]'),
('comment', '[if ! IE]'),
('startendtag', 'link', [('href', 'favicon.tiff')]),
('unknown decl', 'endif'),
('unknown decl', 'if !IE 6'),
('comment', '[endif]'),
('comment', '[if !IE 6]'),
('startendtag', 'img', [('src', 'firefox.png')]),
('unknown decl', 'endif'),
('unknown decl', 'if !ie 6'),
('comment', '[endif]'),
('comment', '[if !ie 6]'),
('starttag', 'b', []),
('data', 'foo'),
('endtag', 'b'),
('unknown decl', 'endif'),
('unknown decl', 'if (!IE)|(lt IE 9)'),
('comment', '[endif]'),
('comment', '[if (!IE)|(lt IE 9)]'),
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('unknown decl', 'endif')
('comment', '[endif]')
]
self._run_check(html, expected)

def test_cdata_declarations(self):
    """A CDATA section is expected to arrive as one 'unknown decl' event."""
    # More tests should be added. See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876
    markup = '<![CDATA[just some plain text]]>'
    self._run_check(
        markup,
        [('unknown decl', 'CDATA[just some plain text')],
    )

def test_cdata_declarations_multiline(self):
    """A CDATA section with tag-like text inside stays one 'unknown decl'."""
    markup = ('<code><![CDATA['
              ' if (a < b && a > b) {'
              ' printf("[<marquee>How?</marquee>]");'
              ' }'
              ']]></code>')
    # The '<', '>' and <marquee> text inside the section must not produce
    # tag events; only the surrounding <code> element does.
    cdata_event = ('unknown decl',
                   'CDATA[ if (a < b && a > b) { '
                   'printf("[<marquee>How?</marquee>]"); }')
    events = [('starttag', 'code', []), cdata_event, ('endtag', 'code')]
    self._run_check(markup, events)

Expand Down Expand Up @@ -787,5 +873,17 @@ def test_weird_chars_in_unquoted_attribute_values(self):
('starttag', 'form',
[('action', 'bogus|&#()value')])])


class TestInheritance(unittest.TestCase):
    """Checks that building a parser calls the ParserBase base-class methods."""

    @patch("_markupbase.ParserBase.__init__")
    @patch("_markupbase.ParserBase.reset")
    def test_base_class_methods_called(self, super_reset_method, super_init_method):
        """Creating an EventCollector calls ParserBase.__init__ and reset once each."""
        # patch decorators are applied bottom-up: the mock for ``reset`` is
        # passed first and the mock for ``__init__`` second.
        # The object returned by this inner patch is never inspected, so it
        # is not bound to a name.
        with patch('_markupbase.ParserBase'):
            EventCollector()
            super_init_method.assert_called_once()
            super_reset_method.assert_called_once()


if __name__ == "__main__":
unittest.main()
Loading