Mercurial > p > roundup > code
diff test/html_norm.py @ 6995:dc83ebff4c90
change test to use html normalizer when comparing html output.
Update to Markdown2 parser changed text output keeping same html
semantics. Broke test_string_markdown_code_block_attribute test. I
hand patched it to get tests working but it needed a better solution.
Write a simple html normalizer using HTMLParser so I don't need third
party (lxml, beautifulsoup) library to clean up the test.
Use the normalizer to parse the expected result and the result
returned by the various markdown libraries. Hopefully this will make
the test less fragile.
This can have multiple uses in template testing where html is
compared. I expect to have to change html_norm.py to make test
writing easier in the future.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 02 Oct 2022 23:18:43 -0400 |
| parents | |
| children | 3546f23ea493 |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test/html_norm.py Sun Oct 02 23:18:43 2022 -0400
"""Minimal html parser/normalizer for use in test_templating.

When testing markdown -> html conversion libraries, there are
gratuitous whitespace changes in generated output that break the
tests. Use this to try to normalize the generated HTML into something
that tries to preserve the semantic meaning allowing tests to stop
breaking.

This is not a complete parsing engine. It supports the Roundup issue
tracker unit tests so that no third party libraries are needed to run
the tests. If you find it useful enjoy.

Ideally this would be done by hijacking in some way
lxml.html.usedoctest to get a liberal parser that will ignore
whitespace. But that means the user has to install lxml to run the
tests. Similarly BeautifulSoup could be used to pretty print the html
but again then BeautifulSoup would need to be installed to run the
tests.

"""
from html.parser import HTMLParser

try:
    # Python 3 location of the entity table.
    from html.entities import name2codepoint
except ImportError:
    # Python 2 fallback.
    from htmlentitydefs import name2codepoint


class NormalizingHtmlParser(HTMLParser):
    """Handle start/end tags and normalize whitespace in data.

    Doctype declarations and comments are stripped from the output.

    Implements a normalize method that takes input html and returns a
    normalized string, leaving the instance ready for another call to
    normalize for another string.

    Note that using this rewrites all attributes parsed by HTMLParser
    into attr="value" form even though HTMLParser accepts other
    attribute specification forms. Attributes given without a value
    (e.g. ``<input disabled>``) are emitted as the bare attribute name.
    """

    debug = False  # set to true to enable more verbose output

    current_normalized_string = ""  # accumulate result string
    preserve_data = False  # if inside pre preserve whitespace

    def _trace(self, label, value):
        """Print a diagnostic line, but only when self.debug is true."""
        if self.debug:
            print(label, value)

    def handle_starttag(self, tag, attrs):
        """Put tag on a new line together with its attributes.

        Note valid attributes according to HTMLParser:
          attrs='single_quote'
          attrs=noquote
          attrs="double_quote"

        attrs is the list of (name, value) pairs supplied by
        HTMLParser; value is None for an attribute with no value.
        """
        self._trace("Start tag:", tag)

        pieces = ["\n<%s" % tag]
        for name, value in attrs:
            self._trace("     attr:", (name, value))
            if value is None:
                # valueless attribute: don't render it as name="None"
                pieces.append(" %s" % name)
            else:
                pieces.append(' %s="%s"' % (name, value))
        pieces.append(">")
        self.current_normalized_string += "".join(pieces)

        if tag == 'pre':
            self.preserve_data = True

    def handle_endtag(self, tag):
        """Put the closing tag on a new line; leaving pre ends
        whitespace preservation.
        """
        self._trace("End tag  :", tag)

        self.current_normalized_string += "\n</%s>" % tag

        if tag == 'pre':
            self.preserve_data = False

    def handle_data(self, data):
        """Append text on a new line.

        Outside of pre, runs of whitespace are collapsed to a single
        space and leading/trailing whitespace is removed. Text that is
        empty after normalization is dropped.
        """
        self._trace("Data     :", data)
        if not self.preserve_data:
            # normalize whitespace remove leading/trailing
            data = " ".join(data.strip().split())

        if data:
            self.current_normalized_string += "\n%s" % data

    def handle_comment(self, data):
        """Drop comments from the output (reported only when debugging)."""
        self._trace("Comment  :", data)

    def handle_entityref(self, name):
        """Append the character for a named entity like &lt;.

        Only invoked by HTMLParser when convert_charrefs is false.
        An unknown entity name is passed through literally as &name;.
        """
        try:
            c = chr(name2codepoint[name])
        except KeyError:
            c = "&%s;" % name
        self._trace("Named ent:", c)

        self.current_normalized_string += "%s" % c

    def handle_charref(self, name):
        """Append the character for a numeric reference.

        name is the text between '&#' and ';', either decimal digits
        or 'x'/'X' followed by hex digits.
        """
        if name[:1] in ('x', 'X'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        self._trace("Num ent  :", c)

        self.current_normalized_string += "%s" % c

    def handle_decl(self, data):
        """Drop declarations (doctype) from the output (reported only
        when debugging).
        """
        self._trace("Decl     :", data)

    def reset(self):
        """Wrapper around HTMLParser.reset that also clears
        self.current_normalized_string and resets self.preserve_data.
        """
        HTMLParser.reset(self)
        self.current_normalized_string = ""
        self.preserve_data = False

    def normalize(self, html):
        """Return the normalized form of the html string.

        The parser is always reset afterwards, even if parsing raises,
        so the instance can be reused for another string.
        """
        try:
            self.feed(html)
            # flush text HTMLParser may still be buffering (e.g. a
            # trailing "AT&T" that looks like an unfinished entity)
            self.close()
            return self.current_normalized_string
        finally:
            self.reset()


if __name__ == "__main__":
    parser = NormalizingHtmlParser()

    parser.feed('<div class="markup"><p> paragraph text with whitespace\n and more space  <pre><span class="f" data-attr="f">text more text</span></pre></div>')
    print("\n\ntest1", parser.current_normalized_string)

    parser.reset()

    parser.feed('''<div class="markup">
    <p> paragraph text with whitespace\n and more space
         <pre><span class="f" data-attr="f">text \n more text</span></pre>
    </div>''')
    print("\n\ntest2", parser.current_normalized_string)
    parser.reset()
    print("\n\nnormalize", parser.normalize('''<div class="markup">
    <p> paragraph text with whitespace\n and more space
         <pre><span class="f" data-attr="f">text \n more text <</span></pre>
    </div>'''))
