Mercurial > p > roundup > code
view roundup/anypy/email_.py @ 8538:f676eda10c20
build(deps): bump codecov/codecov-action from 5.5.2 to 5.5.3 #85
| author | John Rouillard <rouilj@ieee.org> |
|---|---|
| date | Sun, 22 Mar 2026 22:39:40 -0400 |
| parents | 609c5fd638e8 |
| children |
line wrap: on
line source
import binascii
import email
import email.errors  # explicit: do not rely on email.charset importing it
import re
from email import base64mime, quoprimime
from email import charset as _charset

# On Python 2 (where str is bytes) the email package has no *_bytes /
# *_binary_file parsers, so alias the string/file based ones instead.
if str is bytes:
    message_from_bytes = email.message_from_string
    message_from_binary_file = email.message_from_file
else:
    message_from_bytes = email.message_from_bytes
    message_from_binary_file = email.message_from_binary_file

## please import this file if you are using the email module

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)


# Fixed header parser, see my proposed patch and discussions:
# http://bugs.python.org/issue1079 "decode_header does not follow RFC 2047"
# http://bugs.python.org/issue1467619 "Header.decode_header eats up spaces"
# This implements the decode_header specific parts of my proposed patch
# backported to python2.X
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts
    of the header, otherwise a lower-case string containing the name of
    the character set specified in the encoded string.

    header may be a string that may or may not contain RFC2047 encoded
    words, or it may be a Header object.

    If the header contains no encoded words at all it is returned
    unchanged as ``[(header, None)]``.  Otherwise, on Python 3, every
    returned string (including the unencoded parts) is a bytes object.

    An email.errors.HeaderParseError may be raised when certain decoding
    error occurs (e.g. a base64 decoding exception).
    """
    # If it is a Header object, we can just return the encoded chunks.
    if hasattr(header, '_chunks'):
        return [(_charset._encode(string, str(charset)), str(charset))
                for string, charset in header._chunks]

    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]

    # First step is to parse all the encoded parts into triplets of the
    # form (encoded_string, encoding, charset).  For unencoded strings,
    # the last two parts will be None.
    words = []
    for line in header.splitlines():
        # ecre has three groups, so split() yields
        #   [unencoded, charset, encoding, encoded, unencoded, ...]
        # i.e. 1 + 4*k items.  Walk it in strides of four instead of
        # repeatedly popping from the front of the list.
        parts = ecre.split(line)
        # Leading whitespace of a (continuation) line is not content.
        parts[0] = parts[0].lstrip()
        for start in range(0, len(parts), 4):
            unencoded = parts[start]
            if unencoded:
                words.append((unencoded, None, None))
            encoded_word = parts[start + 1:start + 4]
            if encoded_word:  # empty only after the final unencoded part
                charset, encoding, encoded = encoded_word
                words.append((encoded, encoding.lower(), charset.lower()))

    # Now remove unencoded words that consist only of whitespace and sit
    # between two encoded words (RFC 2047 section 6.2).  The middle word
    # must itself be unencoded: an encoded word whose raw payload happens
    # to be whitespace is real content and has to survive.
    droplist = [
        n - 1
        for n, w in enumerate(words)
        if n > 1 and w[1] and words[n - 2][1]
        and words[n - 1][1] is None and words[n - 1][0].isspace()
    ]
    for d in reversed(droplist):
        del words[d]

    # The next step is to decode each encoded word by applying the
    # reverse base64 or quopri transformation.  decoded_words is now a
    # list of the form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            decoded_words.append(
                (quoprimime.header_decode(encoded_string), charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            padded = encoded_string
            paderr = len(padded) % 4
            if paderr:
                padded += '==='[:4 - paderr]
            try:
                decoded = base64mime.decode(padded)
            except binascii.Error:
                # No "raise ... from": this module must still compile
                # on Python 2.
                raise email.errors.HeaderParseError('Base64 decoding error')
            decoded_words.append((decoded, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)

    # Now convert all words to bytes and collapse consecutive runs of
    # similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if bytes is not str and isinstance(word, str):
            # Python 3 only: on Python 2 str already is bytes.
            chunk = bytes(word, 'raw-unicode-escape')
        else:
            chunk = word
        if last_word is None:
            last_word = chunk
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = chunk
            last_charset = charset
        elif last_charset is None:
            # Adjacent unencoded words come from separate header lines;
            # rejoin them with a single space.
            last_word += b' ' + chunk
        else:
            last_word += chunk
    collapsed.append((last_word, last_charset))
    return collapsed
