comparison roundup/dehtml.py @ 8491:520075b29474

feat: support justhtml parsing library to convert email to plain text. justhtml is a pure Python, fast, HTML5 compliant parser. It is now an option for converting HTML-only emails to plain text. Its output format differs slightly from dehtml or beautifulsoup, mostly by removing extra blank lines. dehtml.py: Using the stream parser of justhtml. Unable to get the full document parser to successfully strip script and style blocks. If I can fix this and use the standard parser, I can in theory generate markdown from the DOM tree generated by justhtml. Updated test case to include inline elements that should not cause a line break when they are encountered. Running dehtml as: `python roundup/dehtml.py foo.html` will load foo.html and parse it using all available parsers. configuration.py: justhtml is available as an option. docs: updated CHANGES.txt, doc/tracker_config.txt; added beautifulsoup and justhtml to the optional software section of doc/installation.txt. test_mailgw.py, .github/workflows/ci-test: Updated tests and install justhtml as part of CI.
author John Rouillard <rouilj@ieee.org>
date Sun, 14 Dec 2025 22:40:46 -0500
parents b68a1d8fd5d9
children 9c3ec0a5c7fc
comparison
equal deleted inserted replaced
8490:918792e35e0c 8491:520075b29474
2 from __future__ import print_function 2 from __future__ import print_function
3 3
4 import sys 4 import sys
5 5
6 from roundup.anypy.strings import u2s, uchr 6 from roundup.anypy.strings import u2s, uchr
7
8 # ruff PLC0415 ignore imports not at top of file
9 # ruff RET505 ignore else after return
10 # ruff: noqa: PLC0415 RET505
7 11
8 _pyver = sys.version_info[0] 12 _pyver = sys.version_info[0]
9 13
10 14
11 class dehtml: 15 class dehtml:
25 # kill all script and style elements 29 # kill all script and style elements
26 for script in soup(["script", "style"]): 30 for script in soup(["script", "style"]):
27 script.extract() 31 script.extract()
28 32
29 return u2s(soup.get_text("\n", strip=True)) 33 return u2s(soup.get_text("\n", strip=True))
34
35 self.html2text = html2text
36 elif converter == "justhtml":
37 from justhtml import stream
38
39 def html2text(html):
40 # The below does not work.
41 # Using stream parser since I couldn't seem to strip
42 # 'script' and 'style' blocks. But stream doesn't
43 # have error reporting or stripping of text nodes
44 # and dropping empty nodes. Also I would like to try
45 # its GFM markdown output too even though it keeps
46 # tables as html and doesn't completely convert as
47 # this would work well for those supporting markdown.
48 #
49 # ctx used for testing since I have a truncated
50 # test doc. It eliminates error from missing DOCTYPE
51 # and head.
52 #
53 #from justhtml import JustHTML
54 # from justhtml.context import FragmentContext
55 #
56 #ctx = FragmentContext('html')
57 #justhtml = JustHTML(html,collect_errors=True,
58 # fragment_context=ctx)
59 # I still have the text output inside style/script tags
60 # with :not(style, script). I do get text contents
61 # with query("style, script").
62 #
63 #return u2s("\n".join(
64 # [elem.to_text(separator="\n", strip=True)
65 # for elem in justhtml.query(":not(style, script)")])
66 # )
67
68 # define inline elements so I can accumulate all unbroken
69 # text in a single line with embedded inline elements.
70 # 'br' is inline but should be treated as a line break
71 # and the elements before/after it should not be accumulated
72 # together.
73 inline_elements = (
74 "a",
75 "address",
76 "b",
77 "cite",
78 "code",
79 "em",
80 "i",
81 "img",
82 "mark",
83 "q",
84 "s",
85 "small",
86 "span",
87 "strong",
88 "sub",
89 "sup",
90 "time")
91
92 # each line is appended and joined at the end
93 text = []
94 # the accumulator for all text in inline elements
95 text_accumulator = ""
96 # if set skip all lines till matching end tag found
97 # used to skip script/style blocks
98 skip_till_endtag = None
99 # used to force text_accumulator into text with added
100 # newline so we have a blank line between paragraphs.
101 _need_parabreak = False
102
103 for event, data in stream(html):
104 if event == "end" and skip_till_endtag == data:
105 skip_till_endtag = None
106 continue
107 if skip_till_endtag:
108 continue
109 if (event == "start" and
110 data[0] in ('script', 'style')):
111 skip_till_endtag = data[0]
112 continue
113 if (event == "start" and
114 text_accumulator and
115 data[0] not in inline_elements):
116 # add accumulator to "text"
117 text.append(text_accumulator)
118 text_accumulator = ""
119 _need_parabreak = False
120 elif event == "text":
121 if not data.isspace():
122 text_accumulator = text_accumulator + data
123 _need_parabreak = True
124 elif (_need_parabreak and
125 event == "start" and
126 data[0] == "p"):
127 text.append(text_accumulator + "\n")
128 text_accumulator = ""
129 _need_parabreak = False
130
131 # save anything left in the accumulator at end of document
132 if text_accumulator:
133 # add newline to match dehtml and beautifulsoup
134 text.append(text_accumulator + "\n")
135 return u2s("\n".join(text))
30 136
31 self.html2text = html2text 137 self.html2text = html2text
32 else: 138 else:
33 raise ImportError 139 raise ImportError
34 except ImportError: 140 except ImportError:
94 200
95 self.html2text = html2text 201 self.html2text = html2text
96 202
97 203
98 if __name__ == "__main__": 204 if __name__ == "__main__":
205 # ruff: noqa: B011 S101
206
207 try:
208 assert False
209 except AssertionError:
210 pass
211 else:
212 print("Error, assertions turned off. Test fails")
213 sys.exit(1)
214
99 html = """ 215 html = """
100 <body> 216 <body>
101 <script> 217 <script>
102 this must not be in output 218 this must not be in output
103 </script> 219 </script>
126 <li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li> 242 <li class="toctree-l2"><a class="reference internal" href="user_guide.html">User Guide</a></li>
127 <li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li> 243 <li class="toctree-l2"><a class="reference internal" href="customizing.html">Customising Roundup</a></li>
128 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li> 244 <li class="toctree-l2"><a class="reference internal" href="admin_guide.html">Administration Guide</a></li>
129 </ul> 245 </ul>
130 <div class="section" id="prerequisites"> 246 <div class="section" id="prerequisites">
131 <h2><a class="toc-backref" href="#id5">Prerequisites</a></h2> 247 <H2><a class="toc-backref" href="#id5">Prerequisites</a></H2>
132 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning 248 <p>Roundup requires Python 2.5 or newer (but not Python 3) with a functioning
133 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>. 249 anydbm module. Download the latest version from <a class="reference external" href="http://www.python.org/">http://www.python.org/</a>.
134 It is highly recommended that users install the latest patch version 250 It is highly recommended that users install the <span>latest patch version</span>
135 of python as these contain many fixes to serious bugs.</p> 251 of python as these contain many fixes to serious bugs.</p>
136 <p>Some variants of Linux will need an additional &#8220;python dev&#8221; package 252 <p>Some variants of Linux will need an additional &#8220;python dev&#8221; package
137 installed for Roundup installation to work. Debian and derivatives, are 253 installed for Roundup installation to work. Debian and derivatives, are
138 known to require this.</p> 254 known to require this.</p>
139 <p>If you&#8217;re on windows, you will either need to be using the ActiveState python 255 <p>If you&#8217;re on windows, you will either need to be using the ActiveState python
145 </script> 261 </script>
146 </div> 262 </div>
147 </body> 263 </body>
148 """ 264 """
149 265
150 html2text = dehtml("dehtml").html2text 266 if len(sys.argv) > 1:
151 if html2text: 267 with open(sys.argv[1]) as h:
152 print(html2text(html)) 268 html = h.read()
153 269
270 print("==== beautifulsoup")
154 try: 271 try:
155 # trap error seen if N_TOKENS not defined when run. 272 # trap error seen if N_TOKENS not defined when run.
156 html2text = dehtml("beautifulsoup").html2text 273 html2text = dehtml("beautifulsoup").html2text
157 if html2text: 274 if html2text:
158 print(html2text(html)) 275 text = html2text(html)
276 assert ('HELP' not in text)
277 assert ('display:block' not in text)
278 print(text)
159 except NameError as e: 279 except NameError as e:
160 print("captured error %s" % e) 280 print("captured error %s" % e)
161 281
282 print("==== justhtml")
283 try:
284 html2text = dehtml("justhtml").html2text
285 if html2text:
286 text = html2text(html)
287 assert ('HELP' not in text)
288 assert ('display:block' not in text)
289 print(text)
290 except NameError as e:
291 print("captured error %s" % e)
292
293 print("==== dehtml")
294 html2text = dehtml("dehtml").html2text
295 if html2text:
296 text = html2text(html)
297 assert ('HELP' not in text)
298 assert ('display:block' not in text)
299 print(text)
300
301 print("==== disabled html -> text conversion")
162 html2text = dehtml("none").html2text 302 html2text = dehtml("none").html2text
163 if html2text: 303 if html2text:
164 print("FAIL: Error, dehtml(none) is returning a function") 304 print("FAIL: Error, dehtml(none) is returning a function")
165 else: 305 else:
166 print("PASS: dehtml(none) is returning None") 306 print("PASS: dehtml(none) is returning None")

Roundup Issue Tracker: http://roundup-tracker.org/