Skip to content

Commit fe7c5b5

Browse files
author
Victor Stinner
committed
Issue python#9319: Include the filename in "Non-UTF8 code ..." syntax error.
1 parent 7f2fee3 commit fe7c5b5

File tree

6 files changed

+43
-23
lines changed

6 files changed

+43
-23
lines changed

Lib/test/test_imp.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ def test_find_module_encoding(self):
5858
with imp.find_module('module_' + mod, self.test_path)[0] as fd:
5959
self.assertEqual(fd.encoding, encoding)
6060

61+
path = [os.path.dirname(__file__)]
62+
self.assertRaisesRegex(SyntaxError,
63+
r"Non-UTF-8 code starting with '\\xf6'"
64+
r" in file .*badsyntax_pep3120.py",
65+
imp.find_module, 'badsyntax_pep3120', path)
66+
6167
def test_issue1267(self):
6268
for mod, encoding, _ in self.test_strings:
6369
fp, filename, info = imp.find_module('module_' + mod,

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
14+
1315
- Issue #10785: Store the filename as Unicode in the Python parser.
1416

1517
- Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes

Parser/tokenizer.c

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
16901690
return result;
16911691
}
16921692

1693-
/* Get -*- encoding -*- from a Python file.
1693+
/* Get the encoding of a Python file. Check for the coding cookie and check if
1694+
the file starts with a BOM.
16941695
1695-
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1696-
the first or second line of the file (in which case the encoding
1697-
should be assumed to be PyUnicode_GetDefaultEncoding()).
1696+
PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1697+
encoding in the first or second line of the file (in which case the encoding
1698+
should be assumed to be UTF-8).
1699+
1700+
The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1701+
by the caller. */
16981702

1699-
The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1700-
by the caller.
1701-
*/
17021703
char *
1703-
PyTokenizer_FindEncoding(int fd)
1704+
PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
17041705
{
17051706
struct tok_state *tok;
17061707
FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
17201721
return NULL;
17211722
}
17221723
#ifndef PGEN
1723-
tok->filename = PyUnicode_FromString("<string>");
1724-
if (tok->filename == NULL)
1725-
goto error;
1724+
if (filename != NULL) {
1725+
Py_INCREF(filename);
1726+
tok->filename = filename;
1727+
}
1728+
else {
1729+
tok->filename = PyUnicode_FromString("<string>");
1730+
if (tok->filename == NULL) {
1731+
fclose(fp);
1732+
PyTokenizer_Free(tok);
1733+
return encoding;
1734+
}
1735+
}
17261736
#endif
17271737
while (tok->lineno < 2 && tok->done == E_OK) {
17281738
PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
17331743
if (encoding)
17341744
strcpy(encoding, tok->encoding);
17351745
}
1736-
#ifndef PGEN
1737-
error:
1738-
#endif
17391746
PyTokenizer_Free(tok);
17401747
return encoding;
17411748
}
17421749

1750+
char *
1751+
PyTokenizer_FindEncoding(int fd)
1752+
{
1753+
return PyTokenizer_FindEncodingFilename(fd, NULL);
1754+
}
1755+
17431756
#ifdef Py_DEBUG
17441757

17451758
void

Parser/tokenizer.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
7575
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
7676
extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
7777
int len, int *offset);
78-
extern char * PyTokenizer_FindEncoding(int);
7978

8079
#ifdef __cplusplus
8180
}

Python/import.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
124124
/* See _PyImport_FixupExtensionObject() below */
125125
static PyObject *extensions = NULL;
126126

127+
/* Function from Parser/tokenizer.c */
128+
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
129+
127130
/* This table is defined in config.c: */
128131
extern struct _inittab _PyImport_Inittab[];
129132

130-
/* Method from Parser/tokenizer.c */
131-
extern char * PyTokenizer_FindEncoding(int);
132-
133133
struct _inittab *PyImport_Inittab = _PyImport_Inittab;
134134

135135
/* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
35403540
}
35413541
if (fd != -1) {
35423542
if (strchr(fdp->mode, 'b') == NULL) {
3543-
/* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
3543+
/* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
35443544
memory. */
3545-
found_encoding = PyTokenizer_FindEncoding(fd);
3545+
found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
35463546
lseek(fd, 0, 0); /* Reset position */
35473547
if (found_encoding == NULL && PyErr_Occurred()) {
35483548
Py_XDECREF(pathobj);

Python/traceback.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
#define MAX_FRAME_DEPTH 100
1919
#define MAX_NTHREADS 100
2020

21-
/* Method from Parser/tokenizer.c */
22-
extern char * PyTokenizer_FindEncoding(int);
21+
/* Function from Parser/tokenizer.c */
22+
extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
2323

2424
static PyObject *
2525
tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)
251251

252252
/* use the right encoding to decode the file as unicode */
253253
fd = PyObject_AsFileDescriptor(binary);
254-
found_encoding = PyTokenizer_FindEncoding(fd);
254+
found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
255255
encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
256256
lseek(fd, 0, 0); /* Reset position */
257257
fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);

0 commit comments

Comments (0)