Skip to content

Commit aac2070

Browse files
fix: Python-Rust combining char diff in isalnum (#7612)
* fix: Python-Rust combining char diff in isalnum. Related to: #7518. Rust and Python differ on which characters count as alphanumeric. Rust follows the Unicode standard more closely than Python does, so `char::is_alphanumeric` (Rust) gives different results from `str.isalnum` (Python). To fix the discrepancy, RustPython needs to mimic Python by rejecting certain characters. Some classes of combining characters count as alphanumeric in Rust but not in Python. Combining characters are marks, such as accents, that combine with a preceding base character to form a single grapheme; for example, U+006E (n) followed by U+0303 (combining tilde) renders as ñ. This PR may not be exhaustive: it fixes the combining-character issue, but the full range of discrepancies is not known. * fix: Ignore combining characters in SRE. Closes: #7518
1 parent f82b8d8 commit aac2070

6 files changed

Lines changed: 26 additions & 3 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/sre_engine/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
1919
num_enum = { workspace = true }
2020
bitflags = { workspace = true }
2121
optional = { workspace = true }
22+
icu_properties = { workspace = true }
2223

2324
[dev-dependencies]
2425
criterion = { workspace = true }

crates/sre_engine/src/string.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty};
12
use rustpython_wtf8::Wtf8;
23

34
#[derive(Debug, Clone, Copy)]
@@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
443444
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
444445
// TODO: verify against CPython's isalnum; the combining-class check below may not cover every discrepancy
445446
char::try_from(ch)
446-
.map(|x| x.is_alphanumeric())
447+
.map(|x| {
448+
x.is_alphanumeric()
449+
&& CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered
450+
})
447451
.unwrap_or(false)
448452
}
449453
#[inline]

crates/vm/src/builtins/str.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ use rustpython_common::{
4545
};
4646

4747
use icu_properties::props::{
48-
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
48+
BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory,
49+
XidContinue, XidStart,
4950
};
5051
use unicode_casing::CharExt;
5152

@@ -946,7 +947,11 @@ impl PyStr {
946947

947948
#[pymethod]
948949
fn isalnum(&self) -> bool {
949-
!self.data.is_empty() && self.char_all(char::is_alphanumeric)
950+
!self.data.is_empty()
951+
&& self.char_all(|c| {
952+
c.is_alphanumeric()
953+
&& CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered
954+
})
950955
}
951956

952957
#[pymethod]

extra_tests/snippets/builtin_str.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,15 @@
7373
# assert "DZ".title() == "Dz"
7474
assert a.isalpha()
7575

76+
# Rust's char::is_alphanumeric accepts some combining marks that Python's str.isalnum rejects
77+
assert "\u006e".isalnum()
78+
assert not "\u0303".isalnum()
79+
assert not "\u006e\u0303".isalnum()
80+
assert "\u00f1".isalnum()
81+
assert not "\u0345".isalnum()
82+
for raw in range(0x0363, 0x036F):
83+
assert not chr(raw).isalnum()
84+
7685
s = "1 2 3"
7786
assert s.split(" ", 1) == ["1", "2 3"]
7887
assert s.rsplit(" ", 1) == ["1 2", "3"]

extra_tests/snippets/stdlib_re.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,6 @@
7979
# Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
8080
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
8181
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"
82+
83+
# Combining characters; issue #7518
84+
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"

0 commit comments

Comments
 (0)