Mercurial > p > roundup > code
comparison roundup/rfc2822.py @ 1383:f19dde90e473
applied unicode patch
| author | Andrey Lebedev <kedder@users.sourceforge.net> |
|---|---|
| date | Wed, 15 Jan 2003 22:17:20 +0000 |
| parents | |
| children | 0634f815b90c |
comparison
equal
deleted
inserted
replaced
| 1382:87143c3d7156 | 1383:f19dde90e473 |
|---|---|
| 1 import re | |
| 2 from binascii import b2a_base64, a2b_base64 | |
| 3 | |
# Recognises one encoded word of the form =?charset?encoding?text?= and
# exposes its three fields as the named groups charset, encoding and encoded.
ecre = re.compile(
    r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''',
    re.IGNORECASE | re.VERBOSE)

# Recognises text built only from the characters encode_header() passes
# through verbatim: ASCII letters, digits, space and - ! * + / [ ] . ,
hqre = re.compile(r'^[-a-zA-Z0-9!*+/\[\]., ]+$')
| 15 | |
def base64_decode(s, convert_eols=None):
    """Decode a raw base64 string.

    If convert_eols is set to a string value, all canonical email linefeeds,
    e.g. "\\r\\n", in the decoded text will be converted to the value of
    convert_eols.  os.linesep is a good choice for convert_eols if you are
    decoding a text attachment.

    An empty (falsy) s is returned unchanged.

    This function does not parse a full MIME header value encoded with
    base64 (like =?iso-8859-1?b?bmloISBuaWgh?=) -- please use the high
    level email.Header class for that functionality.

    Taken from 'email' module
    """
    if not s:
        return s

    dec = a2b_base64(s)
    if not convert_eols:
        return dec

    # The canonical line ending is spelled out here because this module
    # defines no CRLF constant.
    crlf = '\r\n'
    if not isinstance(dec, str):
        # a2b_base64 gave back a bytes object distinct from str, so the
        # search and replacement values have to be bytes as well.
        crlf = crlf.encode('ascii')
        if isinstance(convert_eols, str):
            convert_eols = convert_eols.encode('ascii')
    return dec.replace(crlf, convert_eols)
| 37 | |
def unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab

    The two characters after the "=" must be hexadecimal digits.

    Taken from 'email' module
    """
    s = match.group(0)
    return chr(int(s[1:3], 16))


def qp_decode(s):
    """Decode a string encoded with the MIME header `Q' encoding.

    Underscores become spaces and every =XX escape, where XX is a pair of
    hexadecimal digits, becomes the character with that value.  An "="
    that is not followed by two hexadecimal digits is left untouched.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.

    Taken from 'email' module
    """
    s = s.replace('_', ' ')
    # Only hexadecimal pairs are matched: \w would also accept pairs such
    # as "=zz", which unquote_match cannot convert.
    return re.sub(r'=[0-9a-fA-F]{2}', unquote_match, s)
| 57 | |
def _decode_header(header):
    """Split a header value into decoded chunks, leaving charsets untouched.

    The result is a list of (decoded_string, charset) tuples, one per decoded
    part of the header.  Non-encoded parts carry a charset of None; encoded
    parts carry the lower-cased charset name given in the encoded word.

    Taken from 'email' module
    """
    header = str(header)
    # Nothing encoded anywhere: hand the whole value back as one plain chunk
    if ecre.search(header) is None:
        return [(header, None)]

    chunks = []
    for line in header.splitlines():
        # A line without encoded words is kept verbatim
        if ecre.search(line) is None:
            chunks.append((line, None))
            continue

        # ecre has three groups, so split() yields
        # [plain, charset, encoding, payload, plain, charset, ...]
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start]
            # Empty or whitespace-only text around encoded words is dropped
            if plain.strip():
                chunks.append((plain, None))

            word = fields[start + 1:start + 4]
            if not word:
                # Trailing plain text: no encoded word follows it
                continue

            charset = word[0].lower()
            encoding = word[1].lower()
            payload = word[2]
            if encoding == 'q':
                text = qp_decode(payload)
            elif encoding == 'b':
                text = base64_decode(payload)
            else:
                text = payload

            # A chunk with the same charset as the previous one extends it
            if chunks and chunks[-1][1] == charset:
                previous = chunks[-1][0]
                chunks[-1] = (previous + text, charset)
            else:
                chunks.append((text, charset))
    return chunks
| 104 | |
def decode_header(hdr):
    """Decode an RFC 2047 encoded header and return a UTF-8 encoded string.

    Each part returned by _decode_header() is converted to unicode using
    its own charset (after unaliasCharset()); parts without a charset are
    read as iso-8859-1.  Undecodable bytes are substituted rather than
    raising.  A charset name that Python does not know is treated as
    iso-8859-1 too, so a bogus name in a message cannot abort decoding.

    Returns None when hdr is empty or None.
    """
    if not hdr:
        return None

    fallback = 'iso-8859-1'
    pieces = []
    for text, charset in _decode_header(hdr):
        charset = unaliasCharset(charset) or fallback
        try:
            piece = unicode(text, charset, 'replace')
        except LookupError:
            # The charset name is taken verbatim from the header and may
            # not name any codec; iso-8859-1 can decode every byte value.
            piece = unicode(text, fallback, 'replace')
        pieces.append(piece)
    return u''.join(pieces).encode('utf-8')
| 115 | |
def encode_header(header):
    """Encode a header value as a utf-8 `Q' encoded word when necessary.

    Empty headers, and headers made up only of the characters accepted by
    hqre, are returned unchanged.  Anything else comes back as a single
    =?utf-8?q?...?= word in which spaces are written as "_", characters
    accepted by hqre are copied verbatim and every other byte is written
    as =XX.

    A byte string is taken to be UTF-8 already and is escaped byte by
    byte.  A unicode string is encoded to UTF-8 first, so that the escaped
    bytes agree with the charset named in the result.
    """
    # Return empty headers unchanged
    if not header:
        return header

    # Return the header unchanged if nothing in it needs encoding
    if hqre.match(header):
        return header

    charset = 'utf-8'
    # type(u'') is the text type; text has to become bytes before escaping,
    # otherwise code points would be emitted instead of UTF-8 bytes.
    if isinstance(header, type(u'')):
        header = header.encode(charset)

    quoted = []
    # bytearray yields each byte as an integer
    for byte in bytearray(header):
        char = chr(byte)
        if char == ' ':
            # Space may be represented as _ instead of =20 for readability
            quoted.append('_')
        elif hqre.match(char):
            # These characters can be included verbatim
            quoted.append(char)
        else:
            # Otherwise, replace with hex value like =E2
            quoted.append('=%02X' % byte)

    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
| 146 | |
def unaliasCharset(charset):
    """Normalise a charset name for use as a codec name.

    The name is lower-cased and every "windows-" in it is replaced by
    "cp".  Returns None when charset is None or empty.
    """
    if not charset:
        return None
    lowered = charset.lower()
    return lowered.replace("windows-", 'cp')
| 152 | |
def test():
    """Print the encoded form of a sample header as a manual check."""
    sample = "Contrary, Mary"
    print(encode_header(sample))


# Run the manual check when the module is executed as a script
if __name__ == '__main__':
    test()
| 159 | |
| 160 # vim: et |
