Skip to content

Commit fe7c5b5

Browse files
author
Victor Stinner
committed
Issue python#9319: Include the filename in "Non-UTF8 code ..." syntax error.
1 parent 7f2fee3 commit fe7c5b5

File tree

6 files changed

+43
-23
lines changed

6 files changed

+43
-23
lines changed

Lib/test/test_imp.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ def test_find_module_encoding(self):
5858
with imp.find_module('module_' + mod, self.test_path)[0] as fd:
5959
self.assertEqual(fd.encoding, encoding)
6060

61+
path = [os.path.dirname(__file__)]
62+
self.assertRaisesRegex(SyntaxError,
63+
r"Non-UTF-8 code starting with '\\xf6'"
64+
r" in file .*badsyntax_pep3120.py",
65+
imp.find_module, 'badsyntax_pep3120', path)
66+
6167
def test_issue1267(self):
6268
for mod, encoding, _ in self.test_strings:
6369
fp, filename, info = imp.find_module('module_' + mod,

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
14+
1315
- Issue #10785: Store the filename as Unicode in the Python parser.
1416

1517
- Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes

Parser/tokenizer.c

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
16901690
return result;
16911691
}
16921692

1693-
/* Get -*- encoding -*- from a Python file.
1693+
/* Get the encoding of a Python file. Check for the coding cookie and check if
1694+
the file starts with a BOM.
16941695
1695-
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1696-
the first or second line of the file (in which case the encoding
1697-
should be assumed to be PyUnicode_GetDefaultEncoding()).
1696+
PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1697+
encoding in the first or second line of the file (in which case the encoding
1698+
should be assumed to be UTF-8).
1699+
1700+
The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1701+
by the caller. */
16981702

1699-
The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1700-
by the caller.
1701-
*/
17021703
char *
1703-
PyTokenizer_FindEncoding(int fd)
1704+
PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
17041705
{
17051706
struct tok_state *tok;
17061707
FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
17201721
return NULL;
17211722
}
17221723
#ifndef PGEN
1723-
tok->filename = PyUnicode_FromString("<string>");
1724-
if (tok->filename == NULL)
1725-
goto error;
1724+
if (filename != NULL) {
1725+
Py_INCREF(filename);
1726+
tok->filename = filename;
1727+
}
1728+
else {
1729+
tok->filename = PyUnicode_FromString("<string>");
1730+
if (tok->filename == NULL) {
1731+
fclose(fp);
1732+
PyTokenizer_Free(tok);
1733+
return encoding;
1734+
}
1735+
}
17261736
#endif
17271737
while (tok->lineno < 2 && tok->done == E_OK) {
17281738
PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
17331743
if (encoding)
17341744
strcpy(encoding, tok->encoding);
17351745
}
1736-
#ifndef PGEN
1737-
error:
1738-
#endif
17391746
PyTokenizer_Free(tok);
17401747
return encoding;
17411748
}
17421749

1750+
char *
1751+
PyTokenizer_FindEncoding(int fd)
1752+
{
1753+
return PyTokenizer_FindEncodingFilename(fd, NULL);
1754+
}
1755+
17431756
#ifdef Py_DEBUG
17441757

17451758
void

Parser/tokenizer.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
7575
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
7676
extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
7777
int len, int *offset);
78-
extern char * PyTokenizer_FindEncoding(int);
7978

8079
#ifdef __cplusplus
8180
}

Python/import.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
124124
/* See _PyImport_FixupExtensionObject() below */
125125
static PyObject *extensions = NULL;
126126

127+
/* Function from Parser/tokenizer.c */
128+
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
129+
127130
/* This table is defined in config.c: */
128131
extern struct _inittab _PyImport_Inittab[];
129132

130-
/* Method from Parser/tokenizer.c */
131-
extern char * PyTokenizer_FindEncoding(int);
132-
133133
struct _inittab *PyImport_Inittab = _PyImport_Inittab;
134134

135135
/* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
35403540
}
35413541
if (fd != -1) {
35423542
if (strchr(fdp->mode, 'b') == NULL) {
3543-
/* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
3543+
/* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
35443544
memory. */
3545-
found_encoding = PyTokenizer_FindEncoding(fd);
3545+
found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
35463546
lseek(fd, 0, 0); /* Reset position */
35473547
if (found_encoding == NULL && PyErr_Occurred()) {
35483548
Py_XDECREF(pathobj);

Python/traceback.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
#define MAX_FRAME_DEPTH 100
1919
#define MAX_NTHREADS 100
2020

21-
/* Method from Parser/tokenizer.c */
22-
extern char * PyTokenizer_FindEncoding(int);
21+
/* Function from Parser/tokenizer.c */
22+
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
2323

2424
static PyObject *
2525
tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
251251

252252
/* use the right encoding to decode the file as unicode */
253253
fd = PyObject_AsFileDescriptor(binary);
254-
found_encoding = PyTokenizer_FindEncoding(fd);
254+
found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
255255
encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
256256
lseek(fd, 0, 0); /* Reset position */
257257
fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);

0 commit comments

Comments (0)