Skip to content

Commit da427d8

Browse files
Fix most of unicodedata; unmask multiple tests
Python bundles an old version of Unicode for compatibility. RustPython tries to mimic support for that old version by checking the age of individual chars. This is a problem for a few reasons. First, the age check adds an extra lookup for each char queried in the Unicode data. Second, the check is outdated, because the `unic-ucd-age` crate is several versions behind the current Unicode version. Third, the check rejects valid chars because of those version differences. Finally, the check is subtly wrong: it returns Unicode 16.0.0 properties when Unicode 3.2.0 is requested, while checking ages against a Unicode 10.0.0 database. Unfortunately, there isn't a crate that can help us here. `icu4x` targets modern Unicode versions, and writing an `icu4x` data provider for Unicode 3.2.0 is a lot of work for a legacy path. I opted to parse the Unicode 3.2.0 data myself and, (mostly) bypassing `icu4x`, to write small lookup tables instead. As of this commit, Unicode names are still wrong for 3.2.0. Luckily, the crate RustPython uses for names is fast and robust for modern Unicode.
1 parent fb422da commit da427d8

19 files changed

Lines changed: 51605 additions & 207 deletions

Cargo.lock

Lines changed: 0 additions & 49 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,6 @@ icu_locale = "2"
318318
icu_properties = "2"
319319
icu_normalizer = "2"
320320
uuid = "1.23.1"
321-
ucd = "0.1.1"
322-
unic-ucd-age = "0.9.0"
323321
unicode_names2 = "2.0.0"
324322
widestring = "1.2.0"
325323
windows-sys = "0.61.2"

Lib/test/test_str.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -853,7 +853,6 @@ def test_isprintable(self):
853853
self.assertTrue('\U0001F46F'.isprintable())
854854
self.assertFalse('\U000E0020'.isprintable())
855855

856-
@unittest.expectedFailure # TODO: RUSTPYTHON
857856
@support.requires_resource('cpu')
858857
def test_isprintable_invariant(self):
859858
for codepoint in range(sys.maxunicode + 1):

Lib/test/test_unicodedata.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ def test_function_checksum(self):
118118
result = h.hexdigest()
119119
self.assertEqual(result, self.expectedchecksum)
120120

121-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: None != 'EGYPTIAN HIEROGLYPH-13460'
122121
def test_name(self):
123122
name = self.db.name
124123
self.assertRaises(ValueError, name, '\0')
@@ -207,7 +206,6 @@ def test_lookup_nonexistant(self):
207206
]:
208207
self.assertRaises(KeyError, self.db.lookup, nonexistent)
209208

210-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: None != 9
211209
def test_digit(self):
212210
self.assertEqual(self.db.digit('A', None), None)
213211
self.assertEqual(self.db.digit('9'), 9)
@@ -227,7 +225,6 @@ def test_digit(self):
227225
self.assertRaises(TypeError, self.db.digit, 'xx')
228226
self.assertRaises(ValueError, self.db.digit, 'x')
229227

230-
@unittest.expectedFailure # TODO: RUSTPYTHON
231228
def test_numeric(self):
232229
self.assertEqual(self.db.numeric('A',None), None)
233230
self.assertEqual(self.db.numeric('9'), 9)
@@ -260,7 +257,6 @@ def test_numeric(self):
260257
self.assertRaises(TypeError, self.db.numeric, 'xx')
261258
self.assertRaises(ValueError, self.db.numeric, 'x')
262259

263-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: None != 2
264260
def test_decimal(self):
265261
self.assertEqual(self.db.decimal('A',None), None)
266262
self.assertEqual(self.db.decimal('9'), 9)
@@ -285,7 +281,6 @@ def test_decimal(self):
285281
self.assertRaises(TypeError, self.db.decimal, 'xx')
286282
self.assertRaises(ValueError, self.db.decimal, 'x')
287283

288-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 'Cn' != 'Mn'
289284
def test_category(self):
290285
self.assertEqual(self.db.category('\uFFFE'), 'Cn')
291286
self.assertEqual(self.db.category('a'), 'Ll')
@@ -365,7 +360,6 @@ def test_decomposition(self):
365360
self.assertRaises(TypeError, self.db.decomposition)
366361
self.assertRaises(TypeError, self.db.decomposition, 'xx')
367362

368-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 1 != 0
369363
def test_mirrored(self):
370364
self.assertEqual(self.db.mirrored('\uFFFE'), 0)
371365
self.assertEqual(self.db.mirrored('a'), 0)
@@ -385,7 +379,6 @@ def test_mirrored(self):
385379
self.assertRaises(TypeError, self.db.mirrored)
386380
self.assertRaises(TypeError, self.db.mirrored, 'xx')
387381

388-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 0 != 220
389382
def test_combining(self):
390383
self.assertEqual(self.db.combining('\uFFFE'), 0)
391384
self.assertEqual(self.db.combining('a'), 0)
@@ -605,7 +598,6 @@ def test_issue29456(self):
605598
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
606599
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
607600

608-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 'W' != 'N'
609601
def test_east_asian_width(self):
610602
eaw = self.db.east_asian_width
611603
self.assertRaises(TypeError, eaw, b'a')
@@ -646,7 +638,6 @@ def test_east_asian_width(self):
646638
self.assertEqual(eaw('\u2630'), 'N' if self.old else 'W')
647639
self.assertEqual(eaw('\U0001FAE9'), 'N' if self.old else 'W')
648640

649-
@unittest.expectedFailure # TODO: RUSTPYTHON; + W
650641
def test_east_asian_width_unassigned(self):
651642
eaw = self.db.east_asian_width
652643
# unassigned
@@ -685,7 +676,7 @@ def test_normalization(self):
685676
def test_combining(self):
686677
return super().test_combining()
687678

688-
@unittest.expectedSuccess # TODO: RUSTPYTHON
679+
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: 'LATIN SMALL LETTER D WITH CURL' != None
689680
def test_name(self):
690681
return super().test_name()
691682

@@ -753,7 +744,6 @@ def test_normalize_consistent(self):
753744
def test_bug_1704793(self):
754745
self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
755746

756-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true
757747
def test_ucd_510(self):
758748
# In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
759749
self.assertTrue(unicodedata.mirrored("\u0f3a"))
@@ -805,7 +795,6 @@ def unistr(data):
805795
data = [int(x, 16) for x in data.split(" ")]
806796
return "".join([chr(x) for x in data])
807797

808-
@unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055
809798
@requires_resource('network')
810799
@requires_resource('cpu')
811800
def test_normalization(self):

crates/stdlib/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,6 @@ unicode_names2 = { workspace = true }
8181
# update version all at the same time
8282
icu_properties = { workspace = true }
8383
icu_normalizer = { workspace = true }
84-
unic-ucd-age = { workspace = true }
85-
ucd = { workspace = true }
8684

8785
# compression
8886
adler32 = { workspace = true }
@@ -143,6 +141,9 @@ system-configuration = { workspace = true }
143141
insta = { workspace = true }
144142
rustpython-pylib = { workspace = true, features = [ "freeze-stdlib" ] }
145143

144+
[build-dependencies]
145+
icu_normalizer = { workspace = true }
146+
icu_properties = { workspace = true }
146147

147148
[lints]
148149
workspace = true

0 commit comments

Comments
 (0)