Mercurial > p > roundup > code
comparison roundup/mailgw.py @ 5305:e20f472fde7d
issue2550799: provide basic support for handling html only emails
Initial implementation and testing with the dehtml HTML converter are
done.
The use of Beautiful Soup 4 is not tested. My test system breaks when
running dehtml.py using Beautiful Soup. I don't get the failures when
running under the test harness, but the text output is significantly
different (different line breaks, number of newlines, etc.).
The tests for dehtml need to be generated for Beautiful Soup and the
expected output changed accordingly. Since I have a wonky install of
Beautiful Soup, I don't trust my output as the standard to test against.
Also, since Beautiful Soup is optional, the test harness needs to skip the
Beautiful Soup tests if `import bs4` fails. Again, this is something
outside of my expertise. I deleted the work I had done to implement that:
I could not get it working and wanted to get this feature in, in some form.
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Fri, 13 Oct 2017 21:46:59 -0400 |
| parents | 32f95ec6bd8e |
| children | 91354bf0b683 |
comparison
equal
deleted
inserted
replaced
| 5304:ae32f082e623 | 5305:e20f472fde7d |
|---|---|
| 380 # For web forms only. | 380 # For web forms only. |
| 381 # message/rfc822: | 381 # message/rfc822: |
| 382 # Only if configured in [mailgw] unpack_rfc822 | 382 # Only if configured in [mailgw] unpack_rfc822 |
| 383 | 383 |
| 384 def extract_content(self, parent_type=None, ignore_alternatives=False, | 384 def extract_content(self, parent_type=None, ignore_alternatives=False, |
| 385 unpack_rfc822=False): | 385 unpack_rfc822=False, html2text=None): |
| 386 """Extract the body and the attachments recursively. | 386 """Extract the body and the attachments recursively. |
| 387 | 387 |
| 388 If the content is hidden inside a multipart/alternative part, | 388 If the content is hidden inside a multipart/alternative part, |
| 389 we use the *last* text/plain part of the *first* | 389 we use the *last* text/plain part of the *first* |
| 390 multipart/alternative in the whole message. | 390 multipart/alternative in the whole message. |
| 391 """ | 391 """ |
| 392 content_type = self.gettype() | 392 content_type = self.gettype() |
| 393 content = None | 393 content = None |
| 394 attachments = [] | 394 attachments = [] |
| 395 html_part = False | |
| 395 | 396 |
| 396 if content_type == 'text/plain': | 397 if content_type == 'text/plain': |
| 397 content = self.getbody() | 398 content = self.getbody() |
| 399 elif content_type == 'text/html' and html2text: | |
| 400 # if user allows html conversion run this. | |
| 401 content = html2text(self.getbody()) | |
| 402 attachments.append(self.as_attachment()) | |
| 403 html_part = True | |
| 398 elif content_type[:10] == 'multipart/': | 404 elif content_type[:10] == 'multipart/': |
| 399 content_found = bool (content) | 405 content_found = False |
| 400 ig = ignore_alternatives and not content_found | 406 ig = ignore_alternatives |
| 407 html_part_found = False | |
| 401 for part in self.getparts(): | 408 for part in self.getparts(): |
| 402 new_content, new_attach = part.extract_content(content_type, | 409 new_content, new_attach, html_part = part.extract_content( |
| 403 not content and ig, unpack_rfc822) | 410 content_type, not content and ig, unpack_rfc822, |
| 411 html2text) | |
| 404 | 412 |
| 405 # If we haven't found a text/plain part yet, take this one, | 413 # If we haven't found a text/plain part yet, take this one, |
| 406 # otherwise make it an attachment. | 414 # otherwise make it an attachment. |
| 407 if not content: | 415 if not content: |
| 408 content = new_content | 416 content = new_content |
| 409 cpart = part | 417 cpart = part |
| 418 if html_part: | |
| 419 html_part_found = True | |
| 410 elif new_content: | 420 elif new_content: |
| 411 if content_found or content_type != 'multipart/alternative': | 421 if html_part: |
| 422 # attachment should be added elsewhere. | |
| 423 pass | |
| 424 elif content_found or content_type != 'multipart/alternative': | |
| 412 attachments.append(part.text_as_attachment()) | 425 attachments.append(part.text_as_attachment()) |
| 426 elif html_part_found: | |
| 427 # text/plain part found after html | |
| 428 # save html as attachment | |
| 429 attachments.append(cpart.as_attachment()) | |
| 430 content = new_content | |
| 431 cpart = part | |
| 413 else: | 432 else: |
| 414 # if we have found a text/plain in the current | 433 # if we have found a text/plain in the current |
| 415 # multipart/alternative and find another one, we | 434 # multipart/alternative and find another one, we |
| 416 # use the first as an attachment (if configured) | 435 # use the first as an attachment (if configured) |
| 417 # and use the second one because rfc 2046, sec. | 436 # and use the second one because rfc 2046, sec. |
| 423 cpart = part | 442 cpart = part |
| 424 | 443 |
| 425 attachments.extend(new_attach) | 444 attachments.extend(new_attach) |
| 426 if ig and content_type == 'multipart/alternative' and content: | 445 if ig and content_type == 'multipart/alternative' and content: |
| 427 attachments = [] | 446 attachments = [] |
| 447 html_part = False | |
| 428 elif unpack_rfc822 and content_type == 'message/rfc822': | 448 elif unpack_rfc822 and content_type == 'message/rfc822': |
| 429 s = cStringIO.StringIO(self.getbody()) | 449 s = cStringIO.StringIO(self.getbody()) |
| 430 m = Message(s) | 450 m = Message(s) |
| 431 ig = ignore_alternatives and not content | 451 ig = ignore_alternatives and not content |
| 432 new_content, attachments = m.extract_content(m.gettype(), ig, | 452 new_content, attachments, html_part = m.extract_content(m.gettype(), ig, |
| 433 unpack_rfc822) | 453 unpack_rfc822, html2text) |
| 434 attachments.insert(0, m.text_as_attachment()) | 454 attachments.insert(0, m.text_as_attachment()) |
| 435 elif (parent_type == 'multipart/signed' and | 455 elif (parent_type == 'multipart/signed' and |
| 436 content_type == 'application/pgp-signature'): | 456 content_type == 'application/pgp-signature'): |
| 437 # ignore it so it won't be saved as an attachment | 457 # ignore it so it won't be saved as an attachment |
| 438 pass | 458 pass |
| 439 else: | 459 else: |
| 440 attachments.append(self.as_attachment()) | 460 attachments.append(self.as_attachment()) |
| 441 return content, attachments | 461 return content, attachments, html_part |
| 442 | 462 |
| 443 def text_as_attachment(self): | 463 def text_as_attachment(self): |
| 444 """Return first text/plain part as Message""" | 464 """Return first text/plain part as Message""" |
| 445 if not self.gettype().startswith ('multipart/'): | 465 if not self.gettype().startswith ('multipart/'): |
| 446 return self.as_attachment() | 466 return self.as_attachment() |
| 1070 encrypted.""") | 1090 encrypted.""") |
| 1071 | 1091 |
| 1072 def get_content_and_attachments(self): | 1092 def get_content_and_attachments(self): |
| 1073 ''' get the attachments and first text part from the message | 1093 ''' get the attachments and first text part from the message |
| 1074 ''' | 1094 ''' |
| 1095 from roundup.dehtml import dehtml | |
| 1096 html2text=dehtml(self.config['MAILGW_CONVERT_HTMLTOTEXT']).html2text | |
| 1097 | |
| 1075 ig = self.config.MAILGW_IGNORE_ALTERNATIVES | 1098 ig = self.config.MAILGW_IGNORE_ALTERNATIVES |
| 1076 self.content, self.attachments = self.message.extract_content( | 1099 self.message.instance = self.mailgw.instance |
| 1100 self.content, self.attachments, html_part = self.message.extract_content( | |
| 1077 ignore_alternatives=ig, | 1101 ignore_alternatives=ig, |
| 1078 unpack_rfc822=self.config.MAILGW_UNPACK_RFC822) | 1102 unpack_rfc822=self.config.MAILGW_UNPACK_RFC822, |
| 1103 html2text=html2text ) | |
| 1079 | 1104 |
| 1080 | 1105 |
| 1081 def create_files(self): | 1106 def create_files(self): |
| 1082 ''' Create a file for each attachment in the message | 1107 ''' Create a file for each attachment in the message |
| 1083 ''' | 1108 ''' |
