diff roundup/rfc2822.py @ 1383:f19dde90e473

applied unicode patch
author Andrey Lebedev <kedder@users.sourceforge.net>
date Wed, 15 Jan 2003 22:17:20 +0000
parents
children 0634f815b90c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roundup/rfc2822.py	Wed Jan 15 22:17:20 2003 +0000
@@ -0,0 +1,160 @@
+import re
+from binascii import b2a_base64, a2b_base64
+
+# Matches a single MIME encoded-word of the form =?charset?encoding?text?=
+# and captures its three fields as the named groups below.
+# _decode_header() uses it both to detect encoded words (search) and to
+# cut a header line into plain text and encoded-word fields (split).
+ecre = re.compile(r'''
+  =\?                   # literal =?
+  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
+  \?                    # literal ?
+  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
+  \?                    # literal ?
+  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
+  \?=                   # literal ?=
+  ''', re.VERBOSE | re.IGNORECASE)
+
+# Matches strings consisting only of characters that encode_header() emits
+# verbatim: ASCII letters, digits, space and the punctuation - ! * + / [ ] . ,
+hqre = re.compile(r'^[-a-zA-Z0-9!*+/\[\]., ]+$')
+
+def base64_decode(s, convert_eols=None):
+    """Decode a raw base64 string.
+
+    If convert_eols is set to a string value, all canonical email linefeeds,
+    e.g. "\\r\\n", in the decoded text will be converted to the value of
+    convert_eols.  os.linesep is a good choice for convert_eols if you are
+    decoding a text attachment.
+
+    An empty (or otherwise false) s is returned unchanged.
+
+    This function does not parse a full MIME header value encoded with
+    base64 (like =?iso-8859-1?b?bmloISBuaWgh?=) -- please use the high
+    level email.Header class for that functionality.
+
+    Taken from 'email' module
+    """
+    if not s:
+        return s
+
+    dec = a2b_base64(s)
+    if convert_eols:
+        # No CRLF constant is defined in this module, so the canonical
+        # email line ending is spelled out here.
+        return dec.replace('\r\n', convert_eols)
+    return dec
+
+def unquote_match(match):
+    """Convert a matched =XX escape into the character it stands for.
+
+    The two characters that follow the '=' in the matched text are read as
+    a hexadecimal number and the character with that ordinal is returned.
+
+    Taken from 'email' module
+    """
+    hex_digits = match.group(0)[1:3]
+    code = int(hex_digits, 16)
+    return chr(code)
+
+def qp_decode(s):
+    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.
+
+    Every underscore becomes a space and every =XX escape, where XX are two
+    hexadecimal digits, becomes the character with that value.  A '=' that
+    is not followed by two hexadecimal digits is left in the result as is.
+
+    This function does not parse a full MIME header value encoded with
+    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
+    the high level email.Header class for that functionality.
+
+    Taken from 'email' module
+    """
+    s = s.replace('_', ' ')
+    # Substitute genuine hex escapes only: a \w based pattern also accepts
+    # pairs such as "=zz", for which unquote_match() raises ValueError.
+    return re.sub(r'=[0-9a-fA-F]{2}', unquote_match, s)
+
+def _decode_header(header):
+    """Split a header value into decoded chunks, leaving charsets alone.
+
+    The result is a list of (decoded_string, charset) tuples, one for each
+    decoded part of the header.  Plain, non-encoded parts carry None as
+    their charset; encoded parts carry the lower-cased charset name found
+    in the encoded word.  An encoded part whose charset equals that of the
+    chunk collected just before it is appended to that chunk instead of
+    starting a new one.
+
+    Taken from 'email' module
+    """
+    header = str(header)
+    # Nothing encoded anywhere: hand the value back as one plain chunk
+    if ecre.search(header) is None:
+        return [(header, None)]
+
+    chunks = []
+    for line in header.splitlines():
+        if ecre.search(line) is None:
+            # A line without any encoded word is kept verbatim
+            chunks.append((line, None))
+            continue
+
+        # split() puts plain text at every fourth position; each is followed
+        # by the charset, encoding and payload of the next encoded word
+        fields = ecre.split(line)
+        for pos in range(0, len(fields), 4):
+            plain = fields[pos]
+            # Empty or whitespace-only text around encoded words is dropped
+            if plain.strip():
+                chunks.append((plain, None))
+
+            word = fields[pos + 1:pos + 4]
+            if not word:
+                # Trailing plain text: no encoded word follows it
+                continue
+
+            charset = word[0].lower()
+            encoding = word[1].lower()
+            payload = word[2]
+            if encoding == 'q':
+                text = qp_decode(payload)
+            elif encoding == 'b':
+                text = base64_decode(payload)
+            else:
+                text = payload
+
+            if chunks and chunks[-1][1] == charset:
+                previous_text, previous_charset = chunks[-1]
+                chunks[-1] = (previous_text + text, previous_charset)
+            else:
+                chunks.append((text, charset))
+    return chunks
+
+def decode_header(hdr):
+    """Decode a MIME encoded (RFC 2047) header into a utf-8 encoded string.
+
+    Each part returned by _decode_header() is converted to unicode using
+    its own charset, after the name has been passed through
+    unaliasCharset(); parts without a charset are read as iso-8859-1.
+    Bytes that cannot be decoded are replaced instead of raising an error,
+    and a charset name for which Python has no codec also falls back to
+    iso-8859-1.
+
+    Returns None when hdr is empty or None.
+    """
+    if not hdr:
+        return None
+    pieces = []
+    for text, charset in _decode_header(hdr):
+        charset = unaliasCharset(charset) or 'iso-8859-1'
+        try:
+            piece = unicode(text, charset, 'replace')
+        except LookupError:
+            # Incoming mail may name a charset with no codec; that must not
+            # abort decoding of the whole header.
+            piece = unicode(text, 'iso-8859-1', 'replace')
+        pieces.append(piece)
+    return u"".join(pieces).encode('utf-8')
+
+def encode_header(header):
+    """Encode a header value as a utf-8 `Q' encoded word when needed.
+
+    Empty headers, and headers made up only of the characters accepted by
+    hqre (ASCII letters, digits, space and a little punctuation), are
+    returned unchanged.  Anything else is returned as one
+    =?utf-8?q?...?= word in which spaces become '_' and every other
+    character outside the hqre set becomes an =XX hex escape.
+
+    header is presumably a utf-8 encoded byte string (the charset label is
+    fixed to utf-8); a unicode string is encoded to utf-8 first so that
+    every escape describes exactly one byte.
+
+    NOTE: the result is a single encoded word; it is not split or limited
+    in length.
+    """
+    # Return empty headers unchanged
+    if not header:
+        return header
+
+    # Return the header as is when every character is safe to send verbatim
+    if hqre.match(header):
+        return header
+
+    if isinstance(header, unicode):
+        # ord() of a unicode character above 0xFF is not a byte value and
+        # would give escapes that contradict the declared utf-8 charset
+        header = header.encode('utf-8')
+
+    charset = 'utf-8'
+    quoted = []
+    for c in header:
+        # Space may be represented as _ instead of =20 for readability
+        if c == ' ':
+            quoted.append('_')
+        # These characters can be included verbatim
+        elif hqre.match(c):
+            quoted.append(c)
+        # Otherwise, replace with hex value like =E2
+        else:
+            quoted.append("=%02X" % ord(c))
+
+    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
+
+def unaliasCharset(charset):
+    """Return charset lower-cased, with "windows-" rewritten to "cp".
+
+    None is returned for an empty or missing charset.
+    """
+    if not charset:
+        return None
+    lowered = charset.lower()
+    return lowered.replace("windows-", 'cp')
+
+def test():
+    """Print what encode_header() returns for a sample header.
+
+    The sample contains only letters, a comma and a space, all of which
+    hqre accepts, so encode_header() takes its return-unchanged path.
+    """
+    print encode_header("Contrary, Mary")
+    #print unaliasCharset('Windows-1251')
+
+# Run the manual check only when this module is executed as a script
+if __name__ == '__main__':
+    test()
+
+# vim: et

Roundup Issue Tracker: http://roundup-tracker.org/