Mercurial > p > roundup > code
diff roundup/mailgw.py @ 5305:e20f472fde7d
issue2550799: provide basic support for handling HTML-only emails
Initial implementation and testing with the dehtml HTML converter
are done.
The use of Beautiful Soup 4 is not tested. My test system breaks when
running dehtml.py using Beautiful Soup. I don't get the failures when
running under the test harness, but the text output is significantly
different (different line breaks, number of newlines, etc.).
The tests for dehtml need to be generated for Beautiful Soup and the
expected output changed. Since I have a wonky install of Beautiful
Soup, I don't trust my output as the standard to test against. Also,
since Beautiful Soup is optional, the test harness needs to skip the
Beautiful Soup tests if `import bs4` fails. Again, that is something
outside of my expertise. I deleted the work I had done to implement
that: I could not get it working, and I wanted to get this feature in,
in some form.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Fri, 13 Oct 2017 21:46:59 -0400 |
| parents | 32f95ec6bd8e |
| children | 91354bf0b683 |
line wrap: on
line diff
--- a/roundup/mailgw.py Thu Oct 12 22:31:59 2017 -0400 +++ b/roundup/mailgw.py Fri Oct 13 21:46:59 2017 -0400 @@ -382,7 +382,7 @@ # Only if configured in [mailgw] unpack_rfc822 def extract_content(self, parent_type=None, ignore_alternatives=False, - unpack_rfc822=False): + unpack_rfc822=False, html2text=None): """Extract the body and the attachments recursively. If the content is hidden inside a multipart/alternative part, @@ -392,24 +392,43 @@ content_type = self.gettype() content = None attachments = [] + html_part = False if content_type == 'text/plain': content = self.getbody() + elif content_type == 'text/html' and html2text: + # if user allows html conversion run this. + content = html2text(self.getbody()) + attachments.append(self.as_attachment()) + html_part = True elif content_type[:10] == 'multipart/': - content_found = bool (content) - ig = ignore_alternatives and not content_found + content_found = False + ig = ignore_alternatives + html_part_found = False for part in self.getparts(): - new_content, new_attach = part.extract_content(content_type, - not content and ig, unpack_rfc822) + new_content, new_attach, html_part = part.extract_content( + content_type, not content and ig, unpack_rfc822, + html2text) # If we haven't found a text/plain part yet, take this one, # otherwise make it an attachment. if not content: content = new_content cpart = part + if html_part: + html_part_found = True elif new_content: - if content_found or content_type != 'multipart/alternative': + if html_part: + # attachment should be added elsewhere. 
+ pass + elif content_found or content_type != 'multipart/alternative': attachments.append(part.text_as_attachment()) + elif html_part_found: + # text/plain part found after html + # save html as attachment + attachments.append(cpart.as_attachment()) + content = new_content + cpart = part else: # if we have found a text/plain in the current # multipart/alternative and find another one, we @@ -425,12 +444,13 @@ attachments.extend(new_attach) if ig and content_type == 'multipart/alternative' and content: attachments = [] + html_part = False elif unpack_rfc822 and content_type == 'message/rfc822': s = cStringIO.StringIO(self.getbody()) m = Message(s) ig = ignore_alternatives and not content - new_content, attachments = m.extract_content(m.gettype(), ig, - unpack_rfc822) + new_content, attachments, html_part = m.extract_content(m.gettype(), ig, + unpack_rfc822, html2text) attachments.insert(0, m.text_as_attachment()) elif (parent_type == 'multipart/signed' and content_type == 'application/pgp-signature'): @@ -438,7 +458,7 @@ pass else: attachments.append(self.as_attachment()) - return content, attachments + return content, attachments, html_part def text_as_attachment(self): """Return first text/plain part as Message""" @@ -1072,10 +1092,15 @@ def get_content_and_attachments(self): ''' get the attachments and first text part from the message ''' + from roundup.dehtml import dehtml + html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text + ig = self.config.MAILGW_IGNORE_ALTERNATIVES - self.content, self.attachments = self.message.extract_content( + self.message.instance = self.mailgw.instance + self.content, self.attachments, html_part = self.message.extract_content( ignore_alternatives=ig, - unpack_rfc822=self.config.MAILGW_UNPACK_RFC822) + unpack_rfc822=self.config.MAILGW_UNPACK_RFC822, + html2text=html2text ) def create_files(self):
