Mercurial Repository: p/roundup/code: roundup/mailgw.py comparison

comparison roundup/mailgw.py @ 5305:e20f472fde7d

issue2550799: provide basic support for handling html only emails Initial implementation and testing with the dehtml html converter done. The use of beautifulsoup 4 is not tested. My test system breaks when running dehtml.py using beautiful soup. I don't get the failures when running under the test harness, but the text output is significantly different (different line breaks, number of newlines etc.) The tests for dehtml need to be generated for beautiful soup and the expected output changed. Since I have a wonky install of beautiful soup, I don't trust my output as the standard to test against. Also since beautiful soup is optional, the test harness needs to skip the beautifulsoup tests if import bs4 fails. Again something outside of my expertise. I deleted the work I had done to implement that. I could not get it working and wanted to get this feature in in some form.

author	John Rouillard <rouilj@ieee.org>
date	Fri, 13 Oct 2017 21:46:59 -0400
parents	32f95ec6bd8e
children	91354bf0b683

comparison

equal deleted inserted replaced

-:ae32f082e623
+:e20f472fde7d
 #   For web forms only.
 # message/rfc822:
 #   Only if configured in [mailgw] unpack_rfc822
 def extract_content(self, parent_type=None, ignore_alternatives=False,
-unpack_rfc822=False):
+unpack_rfc822=False, html2text=None):
 """Extract the body and the attachments recursively.
 If the content is hidden inside a multipart/alternative part,
 we use the *last* text/plain part of the *first*
 multipart/alternative in the whole message.
 """
 content_type = self.gettype()
 content = None
 attachments = []
+html_part = False
 if content_type == 'text/plain':
 content = self.getbody()
+elif content_type == 'text/html' and html2text:
+# if user allows html conversion run this.
+content = html2text(self.getbody())
+attachments.append(self.as_attachment())
+html_part = True
 elif content_type[:10] == 'multipart/':
-content_found = bool (content)
+content_found = False
-ig = ignore_alternatives and not content_found
+ig = ignore_alternatives
+html_part_found = False
 for part in self.getparts():
-new_content, new_attach = part.extract_content(content_type,
+new_content, new_attach, html_part = part.extract_content(
-not content and ig, unpack_rfc822)
+content_type, not content and ig, unpack_rfc822,
+html2text)
 # If we haven't found a text/plain part yet, take this one,
 # otherwise make it an attachment.
 if not content:
 content = new_content
 cpart   = part
+if html_part:
+html_part_found = True
 elif new_content:
-if content_found or content_type != 'multipart/alternative':
+if html_part:
+# attachment should be added elsewhere.
+pass
+elif content_found or content_type != 'multipart/alternative':
 attachments.append(part.text_as_attachment())
+elif html_part_found:
+# text/plain part found after html
+# save html as attachment
+attachments.append(cpart.as_attachment())
+content = new_content
+cpart   = part
 else:
 # if we have found a text/plain in the current
 # multipart/alternative and find another one, we
 # use the first as an attachment (if configured)
 # and use the second one because rfc 2046, sec.
 cpart   = part
 attachments.extend(new_attach)
 if ig and content_type == 'multipart/alternative' and content:
 attachments = []
+html_part = False
 elif unpack_rfc822 and content_type == 'message/rfc822':
 s = cStringIO.StringIO(self.getbody())
 m = Message(s)
 ig = ignore_alternatives and not content
-new_content, attachments = m.extract_content(m.gettype(), ig,
+new_content, attachments, html_part = m.extract_content(m.gettype(), ig,
-unpack_rfc822)
+unpack_rfc822, html2text)
 attachments.insert(0, m.text_as_attachment())
 elif (parent_type == 'multipart/signed' and
 content_type == 'application/pgp-signature'):
 # ignore it so it won't be saved as an attachment
 pass
 else:
 attachments.append(self.as_attachment())
-return content, attachments
+return content, attachments, html_part
 def text_as_attachment(self):
 """Return first text/plain part as Message"""
 if not self.gettype().startswith ('multipart/'):
 return self.as_attachment()
 encrypted.""")
 def get_content_and_attachments(self):
 ''' get the attachments and first text part from the message
 '''
+from roundup.dehtml import dehtml
+html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text
 ig = self.config.MAILGW_IGNORE_ALTERNATIVES
-self.content, self.attachments = self.message.extract_content(
+self.message.instance = self.mailgw.instance
+self.content, self.attachments, html_part = self.message.extract_content(
 ignore_alternatives=ig,
-unpack_rfc822=self.config.MAILGW_UNPACK_RFC822)
+unpack_rfc822=self.config.MAILGW_UNPACK_RFC822,
+html2text=html2text )
 def create_files(self):
 ''' Create a file for each attachment in the message
 '''

Mercurial > p > roundup > code

comparison roundup/mailgw.py @ 5305:e20f472fde7d