Mercurial > p > roundup > code
comparison roundup/dehtml.py @ 8491:520075b29474
feat: support justhtml parsing library to convert email to plain text
justhtml is a pure Python, fast, HTML5-compliant parser. It is now an
option for converting html only emails to plain text. Its output
format differs slightly from dehtml or beautifulsoup. Mostly by
removing extra blank lines.
dehtml.py:
Using the stream parser of justhtml. Unable to get the full
document parser to successfully strip script and style blocks.
If I can fix this and use the standard parser, I can in theory
generate markdown from the DOM tree generated by justhtml.
Updated test case to include inline elements that should not cause a
line break when they are encountered. Running dehtml as: `python
roundup/dehtml.py foo.html` will load foo.html and parse it using
all available parsers.
configuration.py: justhtml is available as an option.
docs: updated CHANGES.txt and doc/tracker_config.txt; added beautifulsoup
and justhtml to the optional software section of doc/installation.txt.
test_mailgw.py, .github/workflows/ci-test: updated tests and install
justhtml as part of CI.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 14 Dec 2025 22:40:46 -0500 |
| parents | b68a1d8fd5d9 |
| children | 9c3ec0a5c7fc |
comparison
equal
deleted
inserted
replaced
| 8490:918792e35e0c | 8491:520075b29474 |
|---|---|
| 2 from __future__ import print_function | 2 from __future__ import print_function |
| 3 | 3 |
| 4 import sys | 4 import sys |
| 5 | 5 |
| 6 from roundup.anypy.strings import u2s, uchr | 6 from roundup.anypy.strings import u2s, uchr |
| 7 | |
| 8 # ruff PLC0415 ignore imports not at top of file | |
| 9 # ruff RET505 ignore else after return | |
| 10 # ruff: noqa: PLC0415 RET505 | |
| 7 | 11 |
| 8 _pyver = sys.version_info[0] | 12 _pyver = sys.version_info[0] |
| 9 | 13 |
| 10 | 14 |
| 11 class dehtml: | 15 class dehtml: |
| 25 # kill all script and style elements | 29 # kill all script and style elements |
| 26 for script in soup(["script", "style"]): | 30 for script in soup(["script", "style"]): |
| 27 script.extract() | 31 script.extract() |
| 28 | 32 |
| 29 return u2s(soup.get_text("\n", strip=True)) | 33 return u2s(soup.get_text("\n", strip=True)) |
| 34 | |
| 35 self.html2text = html2text | |
| 36 elif converter == "justhtml": | |
| 37 from justhtml import stream | |
| 38 | |
| 39 def html2text(html): | |
| 40 # The below does not work. | |
| 41 # Using stream parser since I couldn't seem to strip | |
| 42 # 'script' and 'style' blocks. But stream doesn't | |
| 43 # have error reporting or stripping of text nodes | |
| 44 # and dropping empty nodes. Also I would like to try | |
| 45 # its GFM markdown output too even though it keeps | |
| 46 # tables as html and doesn't completely convert as | |
| 47 # this would work well for those supporting markdown. | |
| 48 # | |
| 49 # ctx used for testing since I have a truncated | |
| 50 # test doc. It eliminates error from missing DOCTYPE | |
| 51 # and head. | |
| 52 # | |
| 53 #from justhtml import JustHTML | |
| 54 # from justhtml.context import FragmentContext | |
| 55 # | |
| 56 #ctx = FragmentContext('html') | |
| 57 #justhtml = JustHTML(html,collect_errors=True, | |
| 58 # fragment_context=ctx) | |
| 59 # I still have the text output inside style/script tags. | |
| 60 # with :not(style, script). I do get text contents | |
| 61 # with query("style, script"). | |
| 62 # | |
| 63 #return u2s("\n".join( | |
| 64 # [elem.to_text(separator="\n", strip=True) | |
| 65 # for elem in justhtml.query(":not(style, script)")]) | |
| 66 # ) | |
| 67 | |
| 68 # define inline elements so I can accumulate all unbroken | |
| 69 # text in a single line with embedded inline elements. | |
| 70 # 'br' is inline but should be treated as a line break | |
| 71 # and element before/after should not be accumulated | |
| 72 # together. | |
| 73 inline_elements = ( | |
| 74 "a", | |
| 75 "address", | |
| 76 "b", | |
| 77 "cite", | |
| 78 "code", | |
| 79 "em", | |
| 80 "i", | |
| 81 "img", | |
| 82 "mark", | |
| 83 "q", | |
| 84 "s", | |
| 85 "small", | |
| 86 "span", | |
| 87 "strong", | |
| 88 "sub", | |
| 89 "sup", | |
| 90 "time") | |
| 91 | |
| 92 # each line is appended and joined at the end | |
| 93 text = [] | |
| 94 # the accumulator for all text in inline elements | |
| 95 text_accumulator = "" | |
| 96 # if set skip all lines till matching end tag found | |
| 97 # used to skip script/style blocks | |
| 98 skip_till_endtag = None | |
| 99 # used to force text_accumulator into text with added | |
| 100 # newline so we have a blank line between paragraphs. | |
| 101 _need_parabreak = False | |
| 102 | |
| 103 for event, data in stream(html): | |
| 104 if event == "end" and skip_till_endtag == data: | |
| 105 skip_till_endtag = None | |
| 106 continue | |
| 107 if skip_till_endtag: | |
| 108 continue | |
| 109 if (event == "start" and | |
| 110 data[0] in ('script', 'style')): | |
| 111 skip_till_endtag = data[0] | |
| 112 continue | |
| 113 if (event == "start" and | |
| 114 text_accumulator and | |
| 115 data[0] not in inline_elements): | |
| 116 # add accumulator to "text" | |
| 117 text.append(text_accumulator) | |
| 118 text_accumulator = "" | |
| 119 _need_parabreak = False | |
| 120 elif event == "text": | |
| 121 if not data.isspace(): | |
| 122 text_accumulator = text_accumulator + data | |
| 123 _need_parabreak = True | |
| 124 elif (_need_parabreak and | |
| 125 event == "start" and | |
| 126 data[0] == "p"): | |
| 127 text.append(text_accumulator + "\n") | |
| 128 text_accumulator = "" | |
| 129 _need_parabreak = False | |
| 130 | |
| 131 # save anything left in the accumulator at end of document | |
| 132 if text_accumulator: | |
| 133 # add newline to match dehtml and beautifulsoup | |
| 134 text.append(text_accumulator + "\n") | |
| 135 return u2s("\n".join(text)) | |
| 30 | 136 |
| 31 self.html2text = html2text | 137 self.html2text = html2text |
| 32 else: | 138 else: |
| 33 raise ImportError | 139 raise ImportError |
| 34 except ImportError: | 140 except ImportError: |
| 94 | 200 |
| 95 self.html2text = html2text | 201 self.html2text = html2text |
| 96 | 202 |
| 97 | 203 |
| 98 if __name__ == "__main__": | 204 if __name__ == "__main__": |
| 205 # ruff: noqa: B011 S101 | |
| 206 | |
| 207 try: | |
| 208 assert False | |
| 209 except AssertionError: | |
| 210 pass | |
| 211 else: | |
| 212 print("Error, assertions turned off. Test fails") | |
| 213 sys.exit(1) | |
| 214 | |
| 99 html = """ | 215 html = """ |
| 100 <body> | 216 <body> |
| 101 <script> | 217 <script> |
| 102 this must not be in output | 218 this must not be in output |
| 103 </script> | 219 </script> |
| 126 <li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li> | 242 <li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li> |
| 127 <li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li> | 243 <li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li> |
| 128 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li> | 244 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li> |
| 129 </ul> | 245 </ul> |
| 130 <div class="section" id="prerequisites"> | 246 <div class="section" id="prerequisites"> |
| 131 <h2><a class="toc-backref" href="#id5">Prerequisites</a></h2> | 247 <H2><a class="toc-backref" href="#id5">Prerequisites</a></H2> |
| 132 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning | 248 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning |
| 133 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>. | 249 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>. |
| 134 It is highly recommended that users install the latest patch version | 250 It is highly recommended that users install the <span>latest patch version</span> |
| 135 of python as these contain many fixes to serious bugs.</p> | 251 of python as these contain many fixes to serious bugs.</p> |
| 136 <p>Some variants of Linux will need an additional “python dev” package | 252 <p>Some variants of Linux will need an additional “python dev” package |
| 137 installed for Roundup installation to work. Debian and derivatives, are | 253 installed for Roundup installation to work. Debian and derivatives, are |
| 138 known to require this.</p> | 254 known to require this.</p> |
| 139 <p>If you’re on windows, you will either need to be using the ActiveState python | 255 <p>If you’re on windows, you will either need to be using the ActiveState python |
| 145 </script> | 261 </script> |
| 146 </div> | 262 </div> |
| 147 </body> | 263 </body> |
| 148 """ | 264 """ |
| 149 | 265 |
| 150 html2text = dehtml("dehtml").html2text | 266 if len(sys.argv) > 1: |
| 151 if html2text: | 267 with open(sys.argv[1]) as h: |
| 152 print(html2text(html)) | 268 html = h.read() |
| 153 | 269 |
| 270 print("==== beautifulsoup") | |
| 154 try: | 271 try: |
| 155 # trap error seen if N_TOKENS not defined when run. | 272 # trap error seen if N_TOKENS not defined when run. |
| 156 html2text = dehtml("beautifulsoup").html2text | 273 html2text = dehtml("beautifulsoup").html2text |
| 157 if html2text: | 274 if html2text: |
| 158 print(html2text(html)) | 275 text = html2text(html) |
| 276 assert ('HELP' not in text) | |
| 277 assert ('display:block' not in text) | |
| 278 print(text) | |
| 159 except NameError as e: | 279 except NameError as e: |
| 160 print("captured error %s" % e) | 280 print("captured error %s" % e) |
| 161 | 281 |
| 282 print("==== justhtml") | |
| 283 try: | |
| 284 html2text = dehtml("justhtml").html2text | |
| 285 if html2text: | |
| 286 text = html2text(html) | |
| 287 assert ('HELP' not in text) | |
| 288 assert ('display:block' not in text) | |
| 289 print(text) | |
| 290 except NameError as e: | |
| 291 print("captured error %s" % e) | |
| 292 | |
| 293 print("==== dehtml") | |
| 294 html2text = dehtml("dehtml").html2text | |
| 295 if html2text: | |
| 296 text = html2text(html) | |
| 297 assert ('HELP' not in text) | |
| 298 assert ('display:block' not in text) | |
| 299 print(text) | |
| 300 | |
| 301 print("==== disabled html -> text conversion") | |
| 162 html2text = dehtml("none").html2text | 302 html2text = dehtml("none").html2text |
| 163 if html2text: | 303 if html2text: |
| 164 print("FAIL: Error, dehtml(none) is returning a function") | 304 print("FAIL: Error, dehtml(none) is returning a function") |
| 165 else: | 305 else: |
| 166 print("PASS: dehtml(none) is returning None") | 306 print("PASS: dehtml(none) is returning None") |
