Skip to content

Commit 62ca60a

Browse files
authored
Merge pull request #7318 from youknowone/utf8source
Update test_utf8source from v3.14.3 and implement it
2 parents 3cbd08f + 53ddc7e commit 62ca60a

4 files changed

Lines changed: 142 additions & 14 deletions

File tree

Lib/test/test_importlib/source/test_source_encoding.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,12 @@ def test_default_encoding(self):
6262
self.run_test(self.source_line.encode('utf-8'))
6363

6464
# [encoding first line]
65-
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 17
6665
def test_encoding_on_first_line(self):
6766
encoding = 'Latin-1'
6867
source = self.create_source(encoding)
6968
self.run_test(source)
7069

7170
# [encoding second line]
72-
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 34
7371
def test_encoding_on_second_line(self):
7472
source = b"#/usr/bin/python\n" + self.create_source('Latin-1')
7573
self.run_test(source)
@@ -84,7 +82,6 @@ def test_bom_and_utf_8(self):
8482
self.run_test(source)
8583

8684
# [BOM conflict]
87-
@unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: invalid utf-8 sequence of 1 bytes from index 20
8885
def test_bom_conflict(self):
8986
source = codecs.BOM_UTF8 + self.create_source('latin-1')
9087
with self.assertRaises(SyntaxError):

Lib/test/test_runpy.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,6 @@ def test_main_recursion_error(self):
752752
with infinite_recursion(25):
753753
self.assertRaises(RecursionError, run_path, zip_name)
754754

755-
@unittest.expectedFailure # TODO: RUSTPYTHON; detect encoding comments in files
756755
def test_encoding(self):
757756
with temp_dir() as script_dir:
758757
filename = os.path.join(script_dir, 'script.py')

Lib/test/test_utf8source.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# This file is marked as binary in the CVS, to prevent MacCVS from recoding it.
2-
31
import unittest
42

53
class PEP3120Test(unittest.TestCase):
@@ -14,11 +12,9 @@ def test_pep3120(self):
1412
b'\\\xd0\x9f'
1513
)
1614

17-
# TODO: RUSTPYTHON
18-
@unittest.expectedFailure
1915
def test_badsyntax(self):
2016
try:
21-
import test.badsyntax_pep3120
17+
import test.tokenizedata.badsyntax_pep3120 # noqa: F401
2218
except SyntaxError as msg:
2319
msg = str(msg).lower()
2420
self.assertTrue('utf-8' in msg)
@@ -28,8 +24,6 @@ def test_badsyntax(self):
2824

2925
class BuiltinCompileTests(unittest.TestCase):
3026

31-
# TODO: RUSTPYTHON
32-
@unittest.expectedFailure
3327
# Issue 3574.
3428
def test_latin1(self):
3529
# Allow compile() to read Latin-1 source.

crates/vm/src/stdlib/builtins.rs

Lines changed: 141 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,145 @@ mod builtins {
111111
_feature_version: OptionalArg<i32>,
112112
}
113113

114+
/// Detect PEP 263 encoding cookie from source bytes.
115+
/// Checks first two lines for `# coding[:=] <encoding>` pattern.
116+
/// Returns the encoding name if found, or None for default (UTF-8).
117+
#[cfg(feature = "parser")]
118+
fn detect_source_encoding(source: &[u8]) -> Option<String> {
    // Scans at most the first two lines of `source` for a PEP 263 cookie
    // (`# ... coding[:=] <name>`) and returns `<name>` verbatim (no
    // normalization). Returns `None` when no cookie is found.
    //
    // A leading UTF-8 BOM on the first line is ignored. The second line is
    // consulted only when the first line is blank or a pure comment.

    /// Bytes allowed to precede `#` on a cookie line. `\r` is accepted as
    /// well so that a CRLF-terminated blank first line still counts as blank.
    fn is_leading_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\x0c' | b'\r')
    }

    /// Extract the encoding name from a single line, if it carries a cookie.
    fn find_encoding_in_line(line: &[u8]) -> Option<String> {
        const KEYWORD: &[u8] = b"coding";

        // PEP 263: '#' must be preceded only by whitespace/formfeed.
        let hash_pos = line.iter().position(|&b| b == b'#')?;
        if !line[..hash_pos].iter().copied().all(is_leading_blank) {
            return None;
        }
        let comment = &line[hash_pos..];

        // Try every occurrence of "coding", not just the first one: an
        // earlier occurrence that is not followed by ':' / '=' and a name
        // (e.g. the tail of the word "encoding") must not hide a real cookie
        // later in the same comment.
        comment
            .windows(KEYWORD.len())
            .enumerate()
            .find_map(|(pos, window)| {
                if window != KEYWORD {
                    return None;
                }
                let (&sep, value) = comment[pos + KEYWORD.len()..].split_first()?;
                if sep != b':' && sep != b'=' {
                    return None;
                }
                // Encoding name: optional blanks, then [-\w.]+ (ASCII only).
                let name: String = value
                    .iter()
                    .copied()
                    .skip_while(|&b| b == b' ' || b == b'\t')
                    .take_while(|&b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.'))
                    .map(char::from)
                    .collect();
                (!name.is_empty()).then_some(name)
            })
    }

    // Only the first two lines matter; the third piece is the unread rest.
    let mut lines = source.splitn(3, |&b| b == b'\n');

    let first = lines.next()?;
    // Strip BOM if present so it does not count as content before '#'.
    let first = first.strip_prefix(b"\xef\xbb\xbf").unwrap_or(first);
    if let Some(enc) = find_encoding_in_line(first) {
        return Some(enc);
    }

    // The second line may carry the cookie only if the first line is blank
    // or a comment.
    match first.iter().copied().find(|&b| !is_leading_blank(b)) {
        None | Some(b'#') => lines.next().and_then(find_encoding_in_line),
        Some(_) => None,
    }
}
181+
182+
/// Decode source bytes to a string, handling PEP 263 encoding declarations
183+
/// and BOM. Raises SyntaxError for invalid UTF-8 without an encoding
184+
/// declaration (matching CPython behavior).
185+
/// Check if an encoding name is a UTF-8 variant after normalization.
186+
/// Matches: utf-8, utf_8, utf8, UTF-8, etc.
187+
#[cfg(feature = "parser")]
188+
fn is_utf8_encoding(name: &str) -> bool {
    // Compare case-insensitively and treat '_' and '-' as the same separator.
    let canonical = name.to_ascii_lowercase().replace('_', "-");

    // "utf-8" itself, plus suffixed spellings such as "utf-8-unix" or
    // "utf-8-sig", which CPython's tokenizer also normalizes to UTF-8.
    if canonical == "utf-8" || canonical.starts_with("utf-8-") {
        return true;
    }

    // Separator-insensitive spellings ("utf8", "UTF8", ...): drop the
    // separators and compare against the bare name.
    canonical
        .bytes()
        .filter(|&b| b != b'-')
        .eq(b"utf8".iter().copied())
}
192+
193+
/// Convert raw source bytes into the text handed to the compiler.
///
/// The encoding is taken from `detect_source_encoding`; with no declaration
/// the bytes are treated as UTF-8. On the UTF-8 path a leading BOM is dropped
/// before decoding. Any other declared encoding is decoded through the VM's
/// codec registry.
///
/// # Errors
/// Returns a `SyntaxError` when a BOM is combined with a non-UTF-8
/// declaration, when the UTF-8 path meets an invalid byte, or when the codec
/// registry reports a `LookupError` for the declared name. Any other codec
/// error is passed through unchanged.
#[cfg(feature = "parser")]
fn decode_source_bytes(source: &[u8], filename: &str, vm: &VirtualMachine) -> PyResult<String> {
    let declared = detect_source_encoding(source);
    let after_bom = source.strip_prefix(b"\xef\xbb\xbf");

    // Only a declared, non-UTF-8 encoding needs the codec registry.
    let foreign = declared
        .as_deref()
        .filter(|&name| !is_utf8_encoding(name));

    let Some(enc) = foreign else {
        // UTF-8 (declared or default): decode directly, without the BOM.
        let src = after_bom.unwrap_or(source);
        return core::str::from_utf8(src).map(str::to_owned).map_err(|e| {
            let valid_len = e.valid_up_to();
            let bad_byte = src[valid_len];
            // 1-based line of the offending byte: newlines before it, plus one.
            let line = 1 + src[..valid_len].iter().filter(|&&b| b == b'\n').count();
            vm.new_exception_msg(
                vm.ctx.exceptions.syntax_error.to_owned(),
                format!(
                    "Non-UTF-8 code starting with '\\x{bad_byte:02x}' \
                     on line {line}, but no encoding declared; \
                     see https://peps.python.org/pep-0263/ for details \
                     ({filename}, line {line})"
                )
                .into(),
            )
        });
    };

    // A UTF-8 BOM contradicts a non-UTF-8 declaration.
    if after_bom.is_some() {
        return Err(vm.new_exception_msg(
            vm.ctx.exceptions.syntax_error.to_owned(),
            format!("encoding problem for '{filename}': utf-8").into(),
        ));
    }

    // Use codec registry for non-UTF-8 encodings.
    let payload = vm.ctx.new_bytes(source.to_vec());
    let text = vm
        .state
        .codec_registry
        .decode_text(payload.into(), enc, None, vm)
        .map_err(|exc| {
            // An unknown codec name surfaces as SyntaxError; everything else
            // propagates as raised.
            if !exc.fast_isinstance(vm.ctx.exceptions.lookup_error) {
                return exc;
            }
            vm.new_exception_msg(
                vm.ctx.exceptions.syntax_error.to_owned(),
                format!("unknown encoding for '{filename}': {enc}").into(),
            )
        })?;
    Ok(text.to_string_lossy().into_owned())
}
252+
114253
#[cfg(any(feature = "parser", feature = "compiler"))]
115254
#[pyfunction]
116255
fn compile(args: CompileArgs, vm: &VirtualMachine) -> PyResult {
@@ -203,9 +342,8 @@ mod builtins {
203342
let source = ArgStrOrBytesLike::try_from_object(vm, args.source)?;
204343
let source = source.borrow_bytes();
205344

206-
// TODO: compiler::compile should probably get bytes
207-
let source = core::str::from_utf8(&source)
208-
.map_err(|e| vm.new_unicode_decode_error(e.to_string()))?;
345+
let source = decode_source_bytes(&source, &args.filename.to_string_lossy(), vm)?;
346+
let source = source.as_str();
209347

210348
let flags = args.flags.map_or(Ok(0), |v| v.try_to_primitive(vm))?;
211349

0 commit comments

Comments
 (0)