changeset 5544:1a0498c1ed90

Avoid errors indexing binary uploads with Python 3. If you upload a binary file for a FileClass whose content property is set to be indexed (the default), an error of the form "'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte" can occur when the code attempts to index the content of that file. (This error occurs after the creation of the file, and of any issue etc. created at the same time, has been committed; the page returned gives the impression that the creation failed, but that's not the case.) The indexing itself only happens for text/plain files, but that check is in the indexers themselves, after this error occurs (and it's entirely possible that a text/plain upload could actually have some binary or non-UTF-8 content). The bytes objects for the binary contents get converted to str, with resulting errors when they are not in fact UTF-8 text. This patch makes the places that might try indexing binary content do the conversion to strings, for Python 3, with errors='ignore', so that at least no such exception occurs (and if the file is not text/plain, the results of the conversion will then get discarded in the indexers).
author Joseph Myers <jsm@polyomino.org.uk>
date Sun, 16 Sep 2018 20:04:03 +0000
parents bc3e00a3d24b
children 4523fe3cf04c
files roundup/backends/back_anydbm.py roundup/backends/rdbms_common.py
diffstat 2 files changed, 22 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/roundup/backends/back_anydbm.py	Sun Sep 16 16:19:20 2018 +0000
+++ b/roundup/backends/back_anydbm.py	Sun Sep 16 20:04:03 2018 +0000
@@ -2227,9 +2227,12 @@
             # store and possibly index
             self.db.storefile(self.classname, itemid, None, bs2b(content))
             if self.properties['content'].indexme:
+                index_content = content
+                if bytes != str and isinstance(content, bytes):
+                    index_content = content.decode('utf-8', errors='ignore')
                 mime_type = self.get(itemid, 'type', self.default_mime_type)
                 self.db.indexer.add_text((self.classname, itemid, 'content'),
-                    content, mime_type)
+                    index_content, mime_type)
             propvalues['content'] = content
 
         # fire reactors
@@ -2245,8 +2248,12 @@
         for prop, propclass in self.getprops().items():
             if prop == 'content' and propclass.indexme:
                 mime_type = self.get(nodeid, 'type', self.default_mime_type)
+                index_content = self.get(nodeid, 'binary_content')
+                if bytes != str and isinstance(index_content, bytes):
+                    index_content = index_content.decode('utf-8',
+                                                         errors='ignore')
                 self.db.indexer.add_text((self.classname, nodeid, 'content'),
-                    str(self.get(nodeid, 'content')), mime_type)
+                    index_content, mime_type)
             elif isinstance(propclass, hyperdb.String) and propclass.indexme:
                 # index them under (classname, nodeid, property)
                 try:
--- a/roundup/backends/rdbms_common.py	Sun Sep 16 16:19:20 2018 +0000
+++ b/roundup/backends/rdbms_common.py	Sun Sep 16 20:04:03 2018 +0000
@@ -3052,8 +3052,11 @@
 
         # and index!
         if self.properties['content'].indexme:
+            index_content = content
+            if bytes != str and isinstance(content, bytes):
+                index_content = content.decode('utf-8', errors='ignore')
             self.db.indexer.add_text((self.classname, newid, 'content'),
-                content, mime_type)
+                index_content, mime_type)
 
         # store off the content as a file
         self.db.storefile(self.classname, newid, None, bs2b(content))
@@ -3105,8 +3108,11 @@
             self.db.storefile(self.classname, itemid, None, bs2b(content))
             if self.properties['content'].indexme:
                 mime_type = self.get(itemid, 'type', self.default_mime_type)
+                index_content = content
+                if bytes != str and isinstance(content, bytes):
+                    index_content = content.decode('utf-8', errors='ignore')
                 self.db.indexer.add_text((self.classname, itemid, 'content'),
-                    content, mime_type)
+                    index_content, mime_type)
             propvalues['content'] = content
 
         # fire reactors
@@ -3122,8 +3128,12 @@
         for prop, propclass in self.getprops().items():
             if prop == 'content' and propclass.indexme:
                 mime_type = self.get(nodeid, 'type', self.default_mime_type)
+                index_content = self.get(nodeid, 'binary_content')
+                if bytes != str and isinstance(index_content, bytes):
+                    index_content = index_content.decode('utf-8',
+                                                         errors='ignore')
                 self.db.indexer.add_text((self.classname, nodeid, 'content'),
-                    str(self.get(nodeid, 'content')), mime_type)
+                    index_content, mime_type)
             elif isinstance(propclass, hyperdb.String) and propclass.indexme:
                 # index them under (classname, nodeid, property)
                 try:

Roundup Issue Tracker: http://roundup-tracker.org/