Mercurial > p > roundup > code
diff test/html_norm.py @ 7560:5cadcaa13bed
prevent <newline tag mangling
remove charref and entityref handlers; change where newline inserted
after start tag.
A test under python2 ended up with a newline between the opening '<'
and the tag. This was caused by a string that had escaped &gt; and
&lt;.
embedded code block &lt;pre&gt;\n\n<pre> python\nline 1\nline 2\n</pre>
The code was mapping &lt; etc back to < and > and confusing the parser
as to where the tag really started. It interpreted the real pre tag as
data and inserted a newline.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 23 Jul 2023 16:11:23 -0400 |
| parents | 5bc36b65d06b |
| children |
line wrap: on
line diff
--- a/test/html_norm.py Sat Jul 22 22:05:44 2023 -0400 +++ b/test/html_norm.py Sun Jul 23 16:11:23 2023 -0400 @@ -40,7 +40,7 @@ Note that using this rewrites all attributes parsed by HTMLParser into attr="value" form even though HTMLParser accepts other - attribute specifiction forms. + attribute specification forms. """ debug = False # set to true to enable more verbose output @@ -63,7 +63,7 @@ if self.debug: print(" attr:", attr) self.current_normalized_string += ' %s="%s"' % attr - self.current_normalized_string += ">" + self.current_normalized_string += ">\n" if tag == 'pre': self.preserve_data = True @@ -83,26 +83,11 @@ data = " ".join(data.strip().split()) if data: - self.current_normalized_string += "\n%s" % data + self.current_normalized_string += "%s" % data def handle_comment(self, data): print("Comment :", data) - def handle_entityref(self, name): - c = chr(name2codepoint[name]) - if self.debug: print("Named ent:", c) - - self.current_normalized_string += "%s" % c - - def handle_charref(self, name): - if name.startswith('x'): - c = chr(int(name[1:], 16)) - else: - c = chr(int(name)) - if self.debug: print("Num ent :", c) - - self.current_normalized_string += "%s" % c - def handle_decl(self, data): print("Decl :", data)
