comparison roundup/mailgw.py @ 5305:e20f472fde7d

issue2550799: provide basic support for handling HTML-only emails. Initial implementation and testing with the dehtml HTML converter are done. The use of Beautiful Soup 4 is not tested: my test system breaks when running dehtml.py using Beautiful Soup. I don't get the failures when running under the test harness, but the text output is significantly different (different line breaks, number of newlines, etc.). The tests for dehtml need to be generated for Beautiful Soup and the expected output changed. Since I have a wonky install of Beautiful Soup, I don't trust my output as the standard to test against. Also, since Beautiful Soup is optional, the test harness needs to skip the Beautiful Soup tests if "import bs4" fails; again, that is something outside of my expertise. I deleted the work I had done to implement that: I could not get it working and wanted to get this feature in, in some form.
author John Rouillard <rouilj@ieee.org>
date Fri, 13 Oct 2017 21:46:59 -0400
parents 32f95ec6bd8e
children 91354bf0b683
comparison
equal deleted inserted replaced
5304:ae32f082e623 5305:e20f472fde7d
380 # For web forms only. 380 # For web forms only.
381 # message/rfc822: 381 # message/rfc822:
382 # Only if configured in [mailgw] unpack_rfc822 382 # Only if configured in [mailgw] unpack_rfc822
383 383
384 def extract_content(self, parent_type=None, ignore_alternatives=False, 384 def extract_content(self, parent_type=None, ignore_alternatives=False,
385 unpack_rfc822=False): 385 unpack_rfc822=False, html2text=None):
386 """Extract the body and the attachments recursively. 386 """Extract the body and the attachments recursively.
387 387
388 If the content is hidden inside a multipart/alternative part, 388 If the content is hidden inside a multipart/alternative part,
389 we use the *last* text/plain part of the *first* 389 we use the *last* text/plain part of the *first*
390 multipart/alternative in the whole message. 390 multipart/alternative in the whole message.
391 """ 391 """
392 content_type = self.gettype() 392 content_type = self.gettype()
393 content = None 393 content = None
394 attachments = [] 394 attachments = []
395 html_part = False
395 396
396 if content_type == 'text/plain': 397 if content_type == 'text/plain':
397 content = self.getbody() 398 content = self.getbody()
399 elif content_type == 'text/html' and html2text:
400 # if user allows html conversion run this.
401 content = html2text(self.getbody())
402 attachments.append(self.as_attachment())
403 html_part = True
398 elif content_type[:10] == 'multipart/': 404 elif content_type[:10] == 'multipart/':
399 content_found = bool (content) 405 content_found = False
400 ig = ignore_alternatives and not content_found 406 ig = ignore_alternatives
407 html_part_found = False
401 for part in self.getparts(): 408 for part in self.getparts():
402 new_content, new_attach = part.extract_content(content_type, 409 new_content, new_attach, html_part = part.extract_content(
403 not content and ig, unpack_rfc822) 410 content_type, not content and ig, unpack_rfc822,
411 html2text)
404 412
405 # If we haven't found a text/plain part yet, take this one, 413 # If we haven't found a text/plain part yet, take this one,
406 # otherwise make it an attachment. 414 # otherwise make it an attachment.
407 if not content: 415 if not content:
408 content = new_content 416 content = new_content
409 cpart = part 417 cpart = part
418 if html_part:
419 html_part_found = True
410 elif new_content: 420 elif new_content:
411 if content_found or content_type != 'multipart/alternative': 421 if html_part:
422 # attachment should be added elsewhere.
423 pass
424 elif content_found or content_type != 'multipart/alternative':
412 attachments.append(part.text_as_attachment()) 425 attachments.append(part.text_as_attachment())
426 elif html_part_found:
427 # text/plain part found after html
428 # save html as attachment
429 attachments.append(cpart.as_attachment())
430 content = new_content
431 cpart = part
413 else: 432 else:
414 # if we have found a text/plain in the current 433 # if we have found a text/plain in the current
415 # multipart/alternative and find another one, we 434 # multipart/alternative and find another one, we
416 # use the first as an attachment (if configured) 435 # use the first as an attachment (if configured)
417 # and use the second one because rfc 2046, sec. 436 # and use the second one because rfc 2046, sec.
423 cpart = part 442 cpart = part
424 443
425 attachments.extend(new_attach) 444 attachments.extend(new_attach)
426 if ig and content_type == 'multipart/alternative' and content: 445 if ig and content_type == 'multipart/alternative' and content:
427 attachments = [] 446 attachments = []
447 html_part = False
428 elif unpack_rfc822 and content_type == 'message/rfc822': 448 elif unpack_rfc822 and content_type == 'message/rfc822':
429 s = cStringIO.StringIO(self.getbody()) 449 s = cStringIO.StringIO(self.getbody())
430 m = Message(s) 450 m = Message(s)
431 ig = ignore_alternatives and not content 451 ig = ignore_alternatives and not content
432 new_content, attachments = m.extract_content(m.gettype(), ig, 452 new_content, attachments, html_part = m.extract_content(m.gettype(), ig,
433 unpack_rfc822) 453 unpack_rfc822, html2text)
434 attachments.insert(0, m.text_as_attachment()) 454 attachments.insert(0, m.text_as_attachment())
435 elif (parent_type == 'multipart/signed' and 455 elif (parent_type == 'multipart/signed' and
436 content_type == 'application/pgp-signature'): 456 content_type == 'application/pgp-signature'):
437 # ignore it so it won't be saved as an attachment 457 # ignore it so it won't be saved as an attachment
438 pass 458 pass
439 else: 459 else:
440 attachments.append(self.as_attachment()) 460 attachments.append(self.as_attachment())
441 return content, attachments 461 return content, attachments, html_part
442 462
443 def text_as_attachment(self): 463 def text_as_attachment(self):
444 """Return first text/plain part as Message""" 464 """Return first text/plain part as Message"""
445 if not self.gettype().startswith ('multipart/'): 465 if not self.gettype().startswith ('multipart/'):
446 return self.as_attachment() 466 return self.as_attachment()
1070 encrypted.""") 1090 encrypted.""")
1071 1091
1072 def get_content_and_attachments(self): 1092 def get_content_and_attachments(self):
1073 ''' get the attachments and first text part from the message 1093 ''' get the attachments and first text part from the message
1074 ''' 1094 '''
1095 from roundup.dehtml import dehtml
1096 html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text
1097
1075 ig = self.config.MAILGW_IGNORE_ALTERNATIVES 1098 ig = self.config.MAILGW_IGNORE_ALTERNATIVES
1076 self.content, self.attachments = self.message.extract_content( 1099 self.message.instance = self.mailgw.instance
1100 self.content, self.attachments, html_part = self.message.extract_content(
1077 ignore_alternatives=ig, 1101 ignore_alternatives=ig,
1078 unpack_rfc822=self.config.MAILGW_UNPACK_RFC822) 1102 unpack_rfc822=self.config.MAILGW_UNPACK_RFC822,
1103 html2text=html2text )
1079 1104
1080 1105
1081 def create_files(self): 1106 def create_files(self):
1082 ''' Create a file for each attachment in the message 1107 ''' Create a file for each attachment in the message
1083 ''' 1108 '''

Roundup Issue Tracker: http://roundup-tracker.org/