Mercurial > p > roundup > code
comparison test/test_mailgw.py @ 8491:520075b29474
feat: support justhtml parsing library to convert email to plain text
justhtml is a pure-Python, fast, HTML5-compliant parser. It is now an
option for converting HTML-only emails to plain text. Its output
format differs slightly from that of dehtml or beautifulsoup, mostly by
removing extra blank lines.
dehtml.py:
Uses the stream parser of justhtml, because I was unable to get the full
document parser to successfully strip script and style blocks.
If I can fix this and use the standard parser, I can in theory
generate markdown from the DOM tree produced by justhtml.
Updated test case to include inline elements that should not cause a
line break when they are encountered. Running dehtml as: `python
roundup/dehtml.py foo.html` will load foo.html and parse it using
all available parsers.
configuration.py: justhtml is available as an option.
docs: updated CHANGES.txt and doc/tracker_config.txt; added beautifulsoup
and justhtml to the optional software section of doc/installation.txt.
test_mailgw.py, .github/workflows/ci-test: updated tests and install
justhtml as part of CI.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 14 Dec 2025 22:40:46 -0500 |
| parents | 28c5030757d3 |
| children | 67ed90055e47 |
comparison
equal
deleted
inserted
replaced
| 8490:918792e35e0c | 8491:520075b29474 |
|---|---|
| 33 except ImportError: | 33 except ImportError: |
| 34 from .pytest_patcher import mark_class | 34 from .pytest_patcher import mark_class |
| 35 skip_beautifulsoup = mark_class(pytest.mark.skip( | 35 skip_beautifulsoup = mark_class(pytest.mark.skip( |
| 36 reason="Skipping beautifulsoup tests: 'bs4' not installed")) | 36 reason="Skipping beautifulsoup tests: 'bs4' not installed")) |
| 37 | 37 |
| 38 try: | |
| 39 import justhtml | |
| 40 skip_justhtml = lambda func, *args, **kwargs: func | |
| 41 except ImportError: | |
| 42 from .pytest_patcher import mark_class | |
| 43 skip_justhtml = mark_class(pytest.mark.skip( | |
| 44 reason="Skipping justhtml tests: 'justhtml' not installed")) | |
| 38 | 45 |
| 39 from roundup.anypy.email_ import message_from_bytes | 46 from roundup.anypy.email_ import message_from_bytes |
| 40 from roundup.anypy.strings import b2s, u2s, s2b | 47 from roundup.anypy.strings import b2s, u2s, s2b |
| 41 from roundup.scripts.roundup_mailgw import parse_arguments | 48 from roundup.scripts.roundup_mailgw import parse_arguments |
| 42 | 49 |
| 313 | 320 |
| 314 @skip_beautifulsoup | 321 @skip_beautifulsoup |
| 315 def testTextHtmlMessageBeautifulSoup(self): | 322 def testTextHtmlMessageBeautifulSoup(self): |
| 316 self.testTextHtmlMessage(converter='beautifulsoup') | 323 self.testTextHtmlMessage(converter='beautifulsoup') |
| 317 | 324 |
| 325 @skip_justhtml | |
| 326 def testTextHtmlMessageJusthtml(self): | |
| 327 self.testTextHtmlMessage(converter='justhtml') | |
| 328 | |
| 318 def testTextHtmlMessage(self, converter='dehtml'): | 329 def testTextHtmlMessage(self, converter='dehtml'): |
| 319 html_message='''Content-Type: text/html; | 330 html_message='''Content-Type: text/html; |
| 320 charset="iso-8859-1" | 331 charset="iso-8859-1" |
| 321 From: Chef <chef@bork.bork.bork> | 332 From: Chef <chef@bork.bork.bork> |
| 322 To: issue_tracker@your.tracker.email.domain.example | 333 To: issue_tracker@your.tracker.email.domain.example |
| 373 ''' | 384 ''' |
| 374 text_fragments = {} | 385 text_fragments = {} |
| 375 text_fragments['dehtml'] = ['Roundup\n Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).'] | 386 text_fragments['dehtml'] = ['Roundup\n Home\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\n\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\n\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. 
Debian and derivatives, are\nknown to require this.\n\nIf you', (u2s(u'\u2019'), ''), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you', (u2s(u'\u2019'), ''), 'll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).'] |
| 376 text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).'] | 387 text_fragments['beautifulsoup'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from\nhttp://www.python.org/\n.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. 
Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at\nhttp://www.activestate.com/Products/ActivePython/\n), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/\n).'] |
| 377 | 388 |
| 389 text_fragments['justhtml'] = ['Roundup\nHome\nDownload\nDocs\nRoundup Features\nInstalling Roundup\nUpgrading to newer versions of Roundup\nRoundup FAQ\nUser Guide\nCustomising Roundup\nAdministration Guide\nPrerequisites\nRoundup requires Python 2.6 or newer (but not Python 3) with a functioning\nanydbm module. Download the latest version from http://www.python.org/.\nIt is highly recommended that users install the latest patch version\nof python as these contain many fixes to serious bugs.\nSome variants of Linux will need an additional ', ('python dev', u2s(u'\u201cpython dev\u201d')), ' package\ninstalled for Roundup installation to work. Debian and derivatives, are\nknown to require this.\nIf you', (u2s(u'\u2019'), "'"), 're on windows, you will either need to be using the ActiveState python\ndistribution (at http://www.activestate.com/Products/ActivePython/), or you’ll\nhave to install the win32all package separately (get it from\nhttp://starship.python.net/crew/mhammond/win32/).'] | |
| 390 self.maxDiff = 100000 | |
| 378 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter | 391 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = converter |
| 379 nodeid = self._handle_mail(html_message) | 392 nodeid = self._handle_mail(html_message) |
| 380 assert not os.path.exists(SENDMAILDEBUG) | 393 assert not os.path.exists(SENDMAILDEBUG) |
| 381 msgid = self.db.issue.get(nodeid, 'messages')[0] | 394 msgid = self.db.issue.get(nodeid, 'messages')[0] |
| 395 print(self.db.msg.get(msgid, 'content')) | |
| 396 print("\n==== fragment\n") | |
| 397 print(text_fragments[converter]) | |
| 382 self.compareStringFragments(self.db.msg.get(msgid, 'content'), | 398 self.compareStringFragments(self.db.msg.get(msgid, 'content'), |
| 383 text_fragments[converter]) | 399 text_fragments[converter]) |
| 384 | 400 |
| 385 if converter == 'dehtml': | 401 if converter == 'dehtml': |
| 386 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = "none" | 402 self.db.config.MAILGW_CONVERT_HTMLTOTEXT = "none" |
