comparison test/test_mailgw.py @ 8491:520075b29474

feat: support justhtml parsing library to convert email to plain text. justhtml is a pure Python, fast, HTML5-compliant parser. It is now an option for converting HTML-only emails to plain text. Its output format differs slightly from dehtml or beautifulsoup, mostly by removing extra blank lines. dehtml.py: Using the stream parser of justhtml. I was unable to get the full document parser to successfully strip script and style blocks. If I can fix this and use the standard parser, I can in theory generate markdown from the DOM tree generated by justhtml. Updated test case to include inline elements that should not cause a line break when they are encountered. Running dehtml as `python roundup/dehtml.py foo.html` will load foo.html and parse it using all available parsers. configuration.py: justhtml is available as an option. docs: updated CHANGES.txt and doc/tracker_config.txt; added beautifulsoup and justhtml to the optional software section of doc/installation.txt. test_mailgw.py, .github/workflows/ci-test: Updated tests and install justhtml as part of CI.
author John Rouillard <rouilj@ieee.org>
date Sun, 14 Dec 2025 22:40:46 -0500
parents 28c5030757d3
children 67ed90055e47
comparison
equal deleted inserted replaced
8490:918792e35e0c 8491:520075b29474
33 except ImportError: 33 except ImportError:
34 from .pytest_patcher import mark_class 34 from .pytest_patcher import mark_class
35 skip_beautifulsoup = mark_class(pytest.mark.skip( 35 skip_beautifulsoup = mark_class(pytest.mark.skip(
36 reason="Skipping beautifulsoup tests: 'bs4' not installed")) 36 reason="Skipping beautifulsoup tests: 'bs4' not installed"))
37 37
38 try:
39 import justhtml
40 skip_justhtml = lambda func, *args, **kwargs: func
41 except ImportError:
42 from .pytest_patcher import mark_class
43 skip_justhtml = mark_class(pytest.mark.skip(
44 reason="Skipping justhtml tests: 'justhtml' not installed"))
38 45
39 from roundup.anypy.email_ import message_from_bytes 46 from roundup.anypy.email_ import message_from_bytes
40 from roundup.anypy.strings import b2s, u2s, s2b 47 from roundup.anypy.strings import b2s, u2s, s2b
41 from roundup.scripts.roundup_mailgw import parse_arguments 48 from roundup.scripts.roundup_mailgw import parse_arguments
42 49
313 320
314 @skip_beautifulsoup 321 @skip_beautifulsoup
315 def testTextHtmlMessageBeautifulSoup(self): 322 def testTextHtmlMessageBeautifulSoup(self):
316 self.testTextHtmlMessage(converter='beautifulsoup') 323 self.testTextHtmlMessage(converter='beautifulsoup')
317 324
325 @skip_justhtml
326 def testTextHtmlMessageJusthtml(self):
327 self.testTextHtmlMessage(converter='justhtml')
328
318 def testTextHtmlMessage(self, converter='dehtml'): 329 def testTextHtmlMessage(self, converter='dehtml'):
319 html_message='''Content-Type: text/html; 330 html_message='''Content-Type: text/html;
320 charset="iso-8859-1" 331 charset="iso-8859-1"
321 From: Chef <chef@bork.bork.bork> 332 From: Chef <chef@bork.bork.bork>
322 To: issue_tracker@your.tracker.email.domain.example 333 To: issue_tracker@your.tracker.email.domain.example
373 ''' 384 '''
374 text_fragments = {} 385 text_fragments = {}
375 text_fragments['dehtml'] = ['Roundup\n Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).'] 386 text_fragments['dehtml'] = ['Roundup\n Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. 
Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
376 text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).'] 387 text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. 
Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).']
377 388
389 text_fragments['justhtml'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
390 self.maxDiff = 100000
378 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter 391 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter
379 nodeid = self._handle_mail(html_message) 392 nodeid = self._handle_mail(html_message)
380 assert not os.path.exists(SENDMAILDEBUG) 393 assert not os.path.exists(SENDMAILDEBUG)
381 msgid = self.db.issue.get(nodeid, 'messages')[0] 394 msgid = self.db.issue.get(nodeid, 'messages')[0]
395 print(self.db.msg.get(msgid, 'content'))
396 print("\n==== fragment\n")
397 print(text_fragments[converter])
382 self.compareStringFragments(self.db.msg.get(msgid, 'content'), 398 self.compareStringFragments(self.db.msg.get(msgid, 'content'),
383 text_fragments[converter]) 399 text_fragments[converter])
384 400
385 if converter == 'dehtml': 401 if converter == 'dehtml':
386 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = "none" 402 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = "none"

Roundup Issue Tracker: http://roundup-tracker.org/