changeset 5416:56c9bcdea47f

Python 3 preparation: unicode. This patch introduces roundup/anypy/strings.py, which has a comment explaining the string representations generally used and common functions to handle the required conversions. Places in the code that explicitly reference the "unicode" type / built-in function are generally changed to use the new functions (or, in a few places where those new functions don't seem to fit well, other approaches such as references to type(u'') or use of the codecs module). This patch does not generally attempt to address text conversions in any places not currently referencing the "unicode" type (although scripts/import_sf.py is made to use binary I/O in places as fixing the "unicode" reference didn't seem coherent otherwise).
author Joseph Myers <jsm@polyomino.org.uk>
date Wed, 25 Jul 2018 09:05:58 +0000
parents 2d6a92c3e212
children c749d6795bc2
files roundup/anypy/strings.py roundup/backends/back_sqlite.py roundup/backends/indexer_rdbms.py roundup/backends/indexer_whoosh.py roundup/backends/rdbms_common.py roundup/cgi/PageTemplates/TALES.py roundup/cgi/TranslationService.py roundup/cgi/engine_chameleon.py roundup/cgi/engine_jinja2.py roundup/cgi/templating.py roundup/configuration.py roundup/dehtml.py roundup/i18n.py roundup/mailer.py roundup/mailgw.py roundup/password.py roundup/roundupdb.py roundup/xmlrpc.py scripts/import_sf.py
diffstat 19 files changed, 134 insertions(+), 59 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/roundup/anypy/strings.py	Wed Jul 25 09:05:58 2018 +0000
@@ -0,0 +1,75 @@
+# Roundup represents text internally using the native Python str type.
+# In Python 3, these are Unicode strings.  In Python 2, these are
+# encoded using UTF-8, and the Python 2 unicode type is only used in a
+# few places, generally for interacting with external modules
+# requiring that type to be used.
+
+import sys
+_py3 = sys.version_info[0] > 2  # True when the major version is 3 or later
+
+def b2s(b):
+    """Convert a UTF-8 encoded bytes object to the internal string format."""
+    if _py3:
+        return b.decode('utf-8')  # Python 3: internal strings are Unicode
+    else:
+        return b  # Python 2: internal strings are already UTF-8 encoded
+
+def s2b(s):
+    """Convert a string object to UTF-8 encoded bytes."""
+    if _py3:
+        return s.encode('utf-8')  # Python 3: encode the Unicode str
+    else:
+        return s  # Python 2: returned unchanged (str is UTF-8 encoded)
+
+def s2u(s, errors='strict'):
+    """Convert a string object to a Unicode string."""
+    if _py3:
+        return s  # Python 3: returned unchanged; errors is not used
+    else:
+        return unicode(s, 'utf-8', errors)  # errors: decode error handling
+
+def u2s(u):
+    """Convert a Unicode string to the internal string format."""
+    if _py3:
+        return u  # Python 3: returned unchanged
+    else:
+        return u.encode('utf-8')  # Python 2: internal format is UTF-8 str
+
+def us2u(s, errors='strict'):
+    """Convert a string or Unicode string to a Unicode string."""
+    if _py3:
+        return s  # Python 3: returned unchanged; errors is not used
+    else:
+        if isinstance(s, unicode):
+            return s  # already a Unicode string
+        else:
+            return unicode(s, 'utf-8', errors)  # decode as UTF-8
+
+def us2s(u):
+    """Convert a string or Unicode string to the internal string format."""
+    if _py3:
+        return u  # Python 3: returned unchanged
+    else:
+        if isinstance(u, unicode):
+            return u.encode('utf-8')  # Python 2 unicode -> UTF-8 str
+        else:
+            return u  # not unicode: returned unchanged
+
+def uany2s(u):
+    """Convert a Unicode string or other object to the internal string format.
+
+    Objects that are not Unicode strings are passed to str()."""
+    if _py3:
+        return str(u)  # Python 3: str() is applied to every input
+    else:
+        if isinstance(u, unicode):
+            return u.encode('utf-8')  # Python 2 unicode -> UTF-8 str
+        else:
+            return str(u)  # any other object goes through str()
+
+def is_us(s):
+    """Return whether an object is a string or Unicode string."""
+    if _py3:
+        return isinstance(s, str)  # Python 3: only str is checked
+    else:
+        return isinstance(s, str) or isinstance(s, unicode)  # either type
--- a/roundup/backends/back_sqlite.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/backends/back_sqlite.py	Wed Jul 25 09:05:58 2018 +0000
@@ -13,6 +13,7 @@
 from roundup import hyperdb, date, password
 from roundup.backends import rdbms_common
 from roundup.backends.sessions_dbm import Sessions, OneTimeKeys
+from roundup.anypy.strings import uany2s
 
 sqlite_version = None
 try:
@@ -85,7 +86,7 @@
         hyperdb.Multilink : lambda x: x,    # used in journal marshalling
     }
     sql_to_hyperdb_value = {
-        hyperdb.String : lambda x: isinstance(x, unicode) and x.encode('utf8') or str(x),
+        hyperdb.String : uany2s,
         hyperdb.Date   : lambda x: date.Date(str(x)),
         hyperdb.Link   : str, # XXX numeric ids
         hyperdb.Interval  : date.Interval,
--- a/roundup/backends/indexer_rdbms.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/backends/indexer_rdbms.py	Wed Jul 25 09:05:58 2018 +0000
@@ -5,6 +5,7 @@
 import re
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import us2u, u2s
 
 class Indexer(IndexerBase):
     def __init__(self, db):
@@ -61,10 +62,9 @@
             self.db.cursor.execute(sql, (id, ))
 
         # ok, find all the unique words in the text
-        if not isinstance(text, unicode):
-            text = unicode(text, "utf-8", "replace")
+        text = us2u(text, "replace")
         text = text.upper()
-        wordlist = [w.encode("utf-8")
+        wordlist = [u2s(w)
                     for w in re.findall(r'(?u)\b\w{%d,%d}\b'
                                         % (self.minlength, self.maxlength), text)]
         words = set()
--- a/roundup/backends/indexer_whoosh.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/backends/indexer_whoosh.py	Wed Jul 25 09:05:58 2018 +0000
@@ -5,6 +5,7 @@
 from whoosh import fields, qparser, index, query, analysis
 
 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import us2u
 
 class Indexer(IndexerBase):
     def __init__(self, db):
@@ -78,8 +79,7 @@
         if not text:
             text = u''
 
-        if not isinstance(text, unicode):
-            text = unicode(text, "utf-8", "replace")
+        text = us2u(text, "replace")
 
         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
--- a/roundup/backends/rdbms_common.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/backends/rdbms_common.py	Wed Jul 25 09:05:58 2018 +0000
@@ -69,6 +69,7 @@
 from roundup.date import Range
 
 from roundup.backends.back_anydbm import compile_expression
+from roundup.anypy.strings import us2s
 
 
 # dummy value meaning "argument not passed"
@@ -2944,8 +2945,7 @@
             elif isinstance(prop, hyperdb.Password):
                 value = password.Password(encrypted=value)
             elif isinstance(prop, String):
-                if isinstance(value, unicode):
-                    value = value.encode('utf8')
+                value = us2s(value)
                 if not isinstance(value, str):
                     raise TypeError('new property "%(propname)s" not a '
                         'string: %(value)r'%locals())
--- a/roundup/cgi/PageTemplates/TALES.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/cgi/PageTemplates/TALES.py	Wed Jul 25 09:05:58 2018 +0000
@@ -231,7 +231,7 @@
         text = self.evaluate(expr)
         if text is Default or text is None:
             return text
-        if isinstance(text, unicode):
+        if isinstance(text, type(u'')):
             return text
         else:
             return ustr(text)
--- a/roundup/cgi/TranslationService.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/cgi/TranslationService.py	Wed Jul 25 09:05:58 2018 +0000
@@ -16,13 +16,12 @@
 from roundup import i18n
 from roundup.cgi.PageTemplates import Expressions, PathIterator, TALES
 from roundup.cgi.TAL import TALInterpreter
+from roundup.anypy.strings import us2u, u2s
 
 ### Translation classes
 
 class TranslationServiceMixin:
 
-    OUTPUT_ENCODING = "utf-8"
-
     def translate(self, domain, msgid, mapping=None,
         context=None, target_language=None, default=None
     ):
@@ -32,18 +31,15 @@
         return _msg
 
     def gettext(self, msgid):
-        if not isinstance(msgid, unicode):
-            msgid = unicode(msgid, 'utf8')
+        msgid = us2u(msgid)
         msgtrans=self.ugettext(msgid)
-        return msgtrans.encode(self.OUTPUT_ENCODING)
+        return u2s(msgtrans)
 
     def ngettext(self, singular, plural, number):
-        if not isinstance(singular, unicode):
-            singular = unicode(singular, 'utf8')
-        if not isinstance(plural, unicode):
-            plural = unicode(plural, 'utf8')
+        singular = us2u(singular)
+        plural = us2u(plural)
         msgtrans=self.ungettext(singular, plural, number)
-        return msgtrans.encode(self.OUTPUT_ENCODING)
+        return u2s(msgtrans)
 
 class TranslationService(TranslationServiceMixin, i18n.RoundupTranslations):
     pass
@@ -55,8 +51,7 @@
             return self._fallback.ugettext(message)
         # Sometimes the untranslatable message is a UTF-8 encoded string
         # (thanks to PageTemplate's internals).
-        if not isinstance(message, unicode):
-            return unicode(message, 'utf8')
+        message = us2u(message)
         return message
 
 ### TAL patching
--- a/roundup/cgi/engine_chameleon.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/cgi/engine_chameleon.py	Wed Jul 25 09:05:58 2018 +0000
@@ -6,6 +6,7 @@
 import chameleon
 
 from roundup.cgi.templating import StringIO, context, TALLoaderBase
+from roundup.anypy.strings import s2u
 
 class Loader(TALLoaderBase):
     def __init__(self, dir):
@@ -27,7 +28,7 @@
         def translate(msgid, domain=None, mapping=None, default=None):
             result = client.translator.translate(domain, msgid,
                          mapping=mapping, default=default)
-            return unicode(result, client.translator.OUTPUT_ENCODING)
+            return s2u(result)
 
         output = self._pt.render(None, translate, **c)
         return output.encode(client.charset)
--- a/roundup/cgi/engine_jinja2.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/cgi/engine_jinja2.py	Wed Jul 25 09:05:58 2018 +0000
@@ -40,6 +40,7 @@
 # http://jinja.pocoo.org/docs/api/#loaders
 
 from roundup.cgi.templating import context, LoaderBase, TemplateBase
+from roundup.anypy.strings import s2u
 
 class Jinja2Loader(LoaderBase):
     def __init__(self, dir):
@@ -59,8 +60,7 @@
         # The automatic conversion will assume 'ascii' and fail sometime.
         # Analysed with roundup 1.5.0 and jinja 2.7.1. See issue2550811.
         self._env.filters["u"] = lambda s: \
-            unicode(s(), "utf-8") if type(s) == MethodType \
-                                  else unicode(s, "utf-8")
+            s2u(s()) if type(s) == MethodType else s2u(s)
 
     def check(self, tplname):
         #print tplname
--- a/roundup/cgi/templating.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/cgi/templating.py	Wed Jul 25 09:05:58 2018 +0000
@@ -29,6 +29,7 @@
 from roundup import hyperdb, date, support
 from roundup import i18n
 from roundup.i18n import _
+from roundup.anypy.strings import is_us, us2s, s2u, u2s
 
 from .KeywordsExpr import render_keywords_expression_editor
 
@@ -1774,7 +1775,7 @@
             return self.plain(escape=1)
 
         value = self._value
-        if isinstance(value, str) or isinstance(value, unicode):
+        if is_us(value):
             value = value.strip().lower() in ('checked', 'yes', 'true',
                 'on', '1')
 
@@ -1827,8 +1828,7 @@
             anonymous=0, offset=None):
         HTMLProperty.__init__(self, client, classname, nodeid, prop, name,
                 value, anonymous=anonymous)
-        if self._value and not (isinstance(self._value, str) or
-                isinstance(self._value, unicode)):
+        if self._value and not is_us(self._value):
             self._value.setTranslator(self._client.translator)
         self._offset = offset
         if self._offset is None :
@@ -1910,9 +1910,9 @@
                     raise ValueError(self._('default value for '
                         'DateHTMLProperty must be either DateHTMLProperty '
                         'or string date representation.'))
-        elif isinstance(value, str) or isinstance(value, unicode):
+        elif is_us(value):
             # most likely erroneous input to be passed back to user
-            if isinstance(value, unicode): value = value.encode('utf8')
+            value = us2s(value)
             s = self.input(name=self._formname, value=value, size=size,
                               **kwargs)
             if popcal:
@@ -1923,7 +1923,7 @@
 
         if raw_value is None:
             value = ''
-        elif isinstance(raw_value, str) or isinstance(raw_value, unicode):
+        elif is_us(raw_value):
             if format is self._marker:
                 value = raw_value
             else:
@@ -2012,7 +2012,7 @@
             anonymous=0):
         HTMLProperty.__init__(self, client, classname, nodeid, prop,
             name, value, anonymous)
-        if self._value and not isinstance(self._value, (str, unicode)):
+        if self._value and not is_us(self._value):
             self._value.setTranslator(self._client.translator)
 
     def plain(self, escape=0):
@@ -2967,9 +2967,9 @@
         klass = self.client.db.getclass(self.classname)
         if self.search_text:
             matches = self.client.db.indexer.search(
-                [w.upper().encode("utf-8", "replace") for w in re.findall(
+                [u2s(w.upper()) for w in re.findall(
                     r'(?u)\b\w{2,25}\b',
-                    unicode(self.search_text, "utf-8", "replace")
+                    s2u(self.search_text, "replace")
                 )], klass)
         else:
             matches = None
--- a/roundup/configuration.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/configuration.py	Wed Jul 25 09:05:58 2018 +0000
@@ -540,8 +540,9 @@
         return value.pattern
 
     def str2value(self, value):
-        if not isinstance(value, unicode):
+        if not isinstance(value, type(u'')):
             value = str(value)
+        if not isinstance(value, type(u'')):
             # if it is 7-bit ascii, use it as string,
             # otherwise convert to unicode.
             try:
--- a/roundup/dehtml.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/dehtml.py	Wed Jul 25 09:05:58 2018 +0000
@@ -1,5 +1,6 @@
 
 from __future__ import print_function
+from roundup.anypy.strings import u2s
 class dehtml:
     def __init__(self, converter):
         if converter == "none":
@@ -17,7 +18,7 @@
                     for script in soup(["script", "style"]):
                         script.extract()
 
-                    return soup.get_text('\n', strip=True).encode('utf-8')
+                    return u2s(soup.get_text('\n', strip=True))
 
                 self.html2text = html2text
             else:
--- a/roundup/i18n.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/i18n.py	Wed Jul 25 09:05:58 2018 +0000
@@ -40,6 +40,7 @@
 import os
 
 from roundup import msgfmt
+from roundup.anypy.strings import is_us
 
 # List of directories for mo file search (see SF bug 1219689)
 LOCALE_DIRS = [
@@ -79,7 +80,7 @@
             if val:
                 languages = val.split(':')
                 break
-    elif isinstance(language, str) or  isinstance(language, unicode):
+    elif is_us(language):
         languages = [language]
     else:
         # 'language' must be iterable
--- a/roundup/mailer.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/mailer.py	Wed Jul 25 09:05:58 2018 +0000
@@ -17,6 +17,7 @@
 from email.mime.multipart import MIMEMultipart
 
 from roundup.anypy import email_
+from roundup.anypy.strings import s2u
 
 try:
     import pyme, pyme.core
@@ -85,12 +86,12 @@
         '''
         # encode header values if they need to be
         charset = getattr(self.config, 'EMAIL_CHARSET', 'utf-8')
-        tracker_name = unicode(self.config.TRACKER_NAME, 'utf-8')
+        tracker_name = s2u(self.config.TRACKER_NAME)
         if not author:
             author = (tracker_name, self.config.ADMIN_EMAIL)
             name = author[0]
         else:
-            name = unicode(author[0], 'utf-8')
+            name = s2u(author[0])
         author = nice_sender_header(name, author[1], charset)
         try:
             message['Subject'] = subject.encode('ascii')
--- a/roundup/mailgw.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/mailgw.py	Wed Jul 25 09:05:58 2018 +0000
@@ -97,6 +97,7 @@
 
 import string, re, os, mimetools, cStringIO, smtplib, socket, binascii, quopri
 import time, random, sys, logging
+import codecs
 import traceback
 import email.utils
 
@@ -343,7 +344,7 @@
             charset = charset.lower().replace("windows-", 'cp')
             # Do conversion only if charset specified - handle
             # badly-specified charsets
-            edata = unicode(data, charset, 'replace').encode('utf-8')
+            edata = codecs.decode(data, charset, 'replace').encode('utf-8')
             # Convert from dos eol to unix
             edata = edata.replace('\r\n', '\n')
         else:
--- a/roundup/password.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/password.py	Wed Jul 25 09:05:58 2018 +0000
@@ -24,6 +24,8 @@
 from base64 import b64encode, b64decode
 from hashlib import md5, sha1
 
+from roundup.anypy.strings import us2s, s2b
+
 try:
     import crypt
 except ImportError:
@@ -105,10 +107,8 @@
     :returns:
         raw bytes of generated key
     """
-    if isinstance(password, unicode):
-        password = password.encode("utf-8")
-    if isinstance(salt, unicode):
-        salt = salt.encode("utf-8")
+    password = s2b(us2s(password))
+    salt = s2b(us2s(salt))
     if keylen > 40:
         #NOTE: pbkdf2 allows up to (2**31-1)*20 bytes,
         # but m2crypto has issues on some platforms above 40,
@@ -126,8 +126,7 @@
     """ unpack pbkdf2 encrypted password into parts,
         assume it has format "{rounds}${salt}${digest}
     """
-    if isinstance(pbkdf2, unicode):
-        pbkdf2 = pbkdf2.encode("ascii")
+    pbkdf2 = us2s(pbkdf2)
     try:
         rounds, salt, digest = pbkdf2.split("$")
     except ValueError:
--- a/roundup/roundupdb.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/roundupdb.py	Wed Jul 25 09:05:58 2018 +0000
@@ -39,6 +39,8 @@
 from roundup.mailer import Mailer, MessageSendError, encode_quopri, \
     nice_sender_header
 
+from roundup.anypy.strings import s2u
+
 try:
     import pyme, pyme.core
     # gpgme_check_version() must have been called once in a programm
@@ -494,7 +496,7 @@
         charset = getattr(self.db.config, 'EMAIL_CHARSET', 'utf-8')
 
         # construct the content and convert to unicode object
-        body = unicode('\n'.join(m), 'utf-8').encode(charset)
+        body = s2u('\n'.join(m)).encode(charset)
 
         # make sure the To line is always the same (for testing mostly)
         sendto.sort()
@@ -520,7 +522,7 @@
             sendto = [sendto]
 
         # tracker sender info
-        tracker_name = unicode(self.db.config.TRACKER_NAME, 'utf-8')
+        tracker_name = s2u(self.db.config.TRACKER_NAME)
         tracker_name = nice_sender_header(tracker_name, from_address,
             charset)
 
--- a/roundup/xmlrpc.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/roundup/xmlrpc.py	Wed Jul 25 09:05:58 2018 +0000
@@ -12,6 +12,7 @@
 from roundup.anypy import xmlrpc_
 SimpleXMLRPCDispatcher = xmlrpc_.server.SimpleXMLRPCDispatcher
 Binary = xmlrpc_.client.Binary
+from roundup.anypy.strings import us2s
 from traceback import format_exc
 
 def translate(value):
@@ -41,13 +42,8 @@
             key, value = arg.split('=', 1)
         except ValueError :
             raise UsageError('argument "%s" not propname=value'%arg)
-        if isinstance(key, unicode):
-            try:
-                key = key.encode ('ascii')
-            except UnicodeEncodeError:
-                raise UsageError('argument %r is no valid ascii keyword'%key)
-        if isinstance(value, unicode):
-            value = value.encode('utf-8')
+        key = us2s(key)
+        value = us2s(value)
         if value:
             try:
                 props[key] = hyperdb.rawToHyperdb(db, cl, itemid,
--- a/scripts/import_sf.py	Wed Jul 25 00:40:26 2018 +0000
+++ b/scripts/import_sf.py	Wed Jul 25 09:05:58 2018 +0000
@@ -30,6 +30,7 @@
 
 from roundup import instance, hyperdb, date, support, password
 from roundup.anypy import http_, urllib_
+from roundup.anypy.strings import s2b, us2s
 
 today = date.Date('.')
 
@@ -295,7 +296,7 @@
                     files.append(fid)
                     name = name.strip()
                     try:
-                        f = open(os.path.join(file_dir, fid))
+                        f = open(os.path.join(file_dir, fid), 'rb')
                         content = f.read()
                         f.close()
                     except:
@@ -384,11 +385,11 @@
         if isinstance(klass, hyperdb.FileClass) and entry.get('content'):
             fname = klass.exportFilename('/tmp/imported/', entry['id'])
             support.ensureParentsExist(fname)
-            c = open(fname, 'w')
-            if isinstance(entry['content'], unicode):
-                c.write(entry['content'].encode('utf8'))
+            c = open(fname, 'wb')
+            if isinstance(entry['content'], bytes):
+                c.write(entry['content'])
             else:
-                c.write(entry['content'])
+                c.write(s2b(us2s(entry['content'])))
             c.close()
 
     f.close()

Roundup Issue Tracker: http://roundup-tracker.org/