diff test/test_mailgw.py @ 8491:520075b29474

feat: support justhtml parsing library to convert email to plain text. justhtml is a pure-Python, fast, HTML5-compliant parser. It is now an option for converting HTML-only emails to plain text. Its output format differs slightly from that of dehtml or beautifulsoup, mostly by removing extra blank lines. dehtml.py: Uses the stream parser of justhtml. I was unable to get the full document parser to successfully strip script and style blocks. If I can fix this and use the standard parser, I can in theory generate markdown from the DOM tree generated by justhtml. Updated the test case to include inline elements that should not cause a line break when they are encountered. Running dehtml as `python roundup/dehtml.py foo.html` will load foo.html and parse it using all available parsers. configuration.py: justhtml is available as an option. docs: updated CHANGES.txt and doc/tracker_config.txt; added beautifulsoup and justhtml to the optional software section of doc/installation.txt. test_mailgw.py, .github/workflows/ci-test: Updated tests and install justhtml as part of CI.
author John Rouillard <rouilj@ieee.org>
date Sun, 14 Dec 2025 22:40:46 -0500
parents 28c5030757d3
children 67ed90055e47
line wrap: on
line diff
--- a/test/test_mailgw.py	Sat Dec 13 23:02:53 2025 -0500
+++ b/test/test_mailgw.py	Sun Dec 14 22:40:46 2025 -0500
@@ -35,6 +35,13 @@
     skip_beautifulsoup = mark_class(pytest.mark.skip(
         reason="Skipping beautifulsoup tests: 'bs4' not installed"))
 
+try:
+    import justhtml  # probe for the optional justhtml package
+    skip_justhtml = lambda func, *args, **kwargs: func  # package present: no-op decorator, returns func unchanged
+except ImportError:
+    from .pytest_patcher import mark_class  # only needed when the import above failed
+    skip_justhtml = mark_class(pytest.mark.skip(  # package missing: decorator built from a pytest skip marker
+        reason="Skipping justhtml tests: 'justhtml' not installed"))
 
 from roundup.anypy.email_ import message_from_bytes
 from roundup.anypy.strings import b2s, u2s, s2b
@@ -315,6 +322,10 @@
     def testTextHtmlMessageBeautifulSoup(self):
         self.testTextHtmlMessage(converter='beautifulsoup')
 
+    @skip_justhtml  # marked as skipped when the justhtml import failed at module load
+    def testTextHtmlMessageJusthtml(self):
+        self.testTextHtmlMessage(converter='justhtml')  # reuse the shared HTML-mail test with the justhtml converter
+        
     def testTextHtmlMessage(self, converter='dehtml'):
         html_message='''Content-Type: text/html;
   charset="iso-8859-1"
@@ -375,10 +386,15 @@
         text_fragments['dehtml'] = ['Roundup\n        Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
         text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).']
 
+        text_fragments['justhtml'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).']
+        self.maxDiff = 100000
         self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter
         nodeid = self._handle_mail(html_message)
         assert not os.path.exists(SENDMAILDEBUG)
         msgid = self.db.issue.get(nodeid, 'messages')[0]
+        print(self.db.msg.get(msgid, 'content'))
+        print("\n==== fragment\n")
+        print(text_fragments[converter])
         self.compareStringFragments(self.db.msg.get(msgid, 'content'),
                                     text_fragments[converter])
 

Roundup Issue Tracker: http://roundup-tracker.org/