Skip to content

Commit ce75f32

Browse files
joshuamegnauth54youknowone
authored andcommitted
Match CPython's title() exactly
1 parent 58d8b7c commit ce75f32

3 files changed

Lines changed: 59 additions & 33 deletions

File tree

Lib/test/test_str.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,6 @@ def test_capitalize(self):
955955
self.assertEqual('finnish'.capitalize(), 'Finnish')
956956
self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
957957

958-
@unittest.expectedFailure # TODO: RUSTPYTHON; ? ^
959958
def test_title(self):
960959
super().test_title()
961960
self.assertEqual('\U0001044F'.title(), '\U00010427')

crates/vm/src/builtins/str.rs

Lines changed: 58 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,9 @@ impl PyStr {
782782
let mut chars = s.char_indices();
783783
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
784784
titlecase_first(s, &mut chars, &mut out);
785-
lowercase_and_sigma(s, &mut chars, &mut out);
785+
for (i, ch) in chars {
786+
lowercase_or_sigma(ch, s, i, &mut out);
787+
}
786788
unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }
787789
}
788790
PyKindStr::Wtf8(s) => {
@@ -793,15 +795,18 @@ impl PyStr {
793795
let s = first.valid();
794796
let mut chars = s.char_indices();
795797
titlecase_first(s, &mut chars, &mut out);
796-
lowercase_and_sigma(s, &mut chars, &mut out);
798+
for (i, ch) in chars {
799+
lowercase_or_sigma(ch, s, i, &mut out);
800+
}
797801
out.0.extend(first.invalid());
798802
}
799803
// This loop is only hit if the WTF-8 buffer contains invalid Unicode. Otherwise,
800804
// everything is handled above without chunking.
801805
for chunk in chunks {
802806
let s = chunk.valid();
803-
let mut chars = s.char_indices();
804-
lowercase_and_sigma(s, &mut chars, &mut out);
807+
for (i, ch) in s.char_indices() {
808+
lowercase_or_sigma(ch, s, i, &mut out);
809+
}
805810
out.0.extend(chunk.invalid());
806811
}
807812

@@ -1076,28 +1081,22 @@ impl PyStr {
10761081
PyKindStr::Ascii(_) => unsafe {
10771082
Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes()))
10781083
},
1079-
PyKindStr::Utf8(s) => TitlecaseMapper::new()
1080-
.titlecase_segment_to_string(s, &LanguageIdentifier::UNKNOWN, Default::default())
1081-
.to_string()
1082-
.into(),
1084+
PyKindStr::Utf8(s) => {
1085+
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
1086+
titlecase_string(s, &mut out);
1087+
// SAFETY: `s` is valid UTF-8 and `titlecase_string` only writes valid UTF-8 to `out`.
1088+
unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }
1089+
}
10831090
PyKindStr::Wtf8(s) => {
1084-
let mut buf = VecFmtWriter(Vec::with_capacity(s.len()));
1085-
let mapper = TitlecaseMapper::new();
1091+
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
10861092
for chunk in s.as_bytes().utf8_chunks() {
1087-
mapper
1088-
.titlecase_segment(
1089-
chunk.valid(),
1090-
&LanguageIdentifier::UNKNOWN,
1091-
Default::default(),
1092-
)
1093-
.write_to(&mut buf)
1094-
.expect("Writing to an in-memory buffer cannot fail.");
1095-
buf.0.extend(chunk.invalid());
1093+
titlecase_string(chunk.valid(), &mut out);
1094+
out.0.extend(chunk.invalid());
10961095
}
10971096
// SAFETY:
10981097
// * `s` is valid WTF-8; surrogate bytes were appended without processing.
10991098
// * `titlecase_string` only writes valid UTF-8 for each valid chunk.
1100-
unsafe { Wtf8Buf::from_bytes_unchecked(buf.0) }
1099+
unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }
11011100
}
11021101
}
11031102
}
@@ -1572,6 +1571,11 @@ impl PyStr {
15721571
}
15731572
}
15741573

1574+
/// Title case the first char if it is cased; otherwise write it as is.
1575+
///
1576+
/// This matches CPython's behavior:
1577+
/// "123abc" -> "123abc"
1578+
/// "abc" -> "Abc"
15751579
fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) {
15761580
if let Some((first_pos, first_ch)) = chars.next() {
15771581
let first = &s[..first_pos + first_ch.len_utf8()];
@@ -1582,20 +1586,43 @@ fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut Ve
15821586
}
15831587
}
15841588

1585-
fn lowercase_and_sigma(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) {
1589+
/// Title case a string following CPython conventions.
1590+
///
1591+
/// CPython title cases the first char of each segment and lowercases the rest. A "segment" is a run of cased characters, split by uncased characters
1592+
/// rather than whitespace.
1593+
/// "123abc" -> "123Abc"
1594+
/// "123abc456def" -> "123Abc456Def"
1595+
/// "123 abc" -> "123 Abc"
1596+
fn titlecase_string(s: &str, out: &mut VecFmtWriter) {
1597+
let mut previous_is_cased = false;
1598+
let mapper = TitlecaseMapper::new();
1599+
for (i, ch) in s.char_indices() {
1600+
if previous_is_cased {
1601+
lowercase_or_sigma(ch, s, i, out);
1602+
} else {
1603+
let s = &s[i..i + ch.len_utf8()];
1604+
mapper
1605+
.titlecase_segment(s, &LanguageIdentifier::UNKNOWN, Default::default())
1606+
.write_to(out)
1607+
.expect("Writing to an in-memory buffer cannot fail.");
1608+
}
1609+
1610+
previous_is_cased = Cased::for_char(ch);
1611+
}
1612+
}
1613+
1614+
fn lowercase_or_sigma(ch: char, s: &str, i: usize, out: &mut VecFmtWriter) {
15861615
let sigma = 'Σ';
1587-
for (i, ch) in chars {
1588-
if ch == sigma {
1589-
let sigma_cased = handle_capital_sigma(s, i);
1616+
if ch == sigma {
1617+
let sigma_cased = handle_capital_sigma(s, i);
1618+
let mut buf = [0u8; 4];
1619+
let s = sigma_cased.encode_utf8(&mut buf);
1620+
out.0.extend(s.as_bytes());
1621+
} else {
1622+
for ch in ch.to_lowercase() {
15901623
let mut buf = [0u8; 4];
1591-
let s = sigma_cased.encode_utf8(&mut buf);
1624+
let s = ch.encode_utf8(&mut buf);
15921625
out.0.extend(s.as_bytes());
1593-
} else {
1594-
for ch in ch.to_lowercase() {
1595-
let mut buf = [0u8; 4];
1596-
let s = ch.encode_utf8(&mut buf);
1597-
out.0.extend(s.as_bytes());
1598-
}
15991626
}
16001627
}
16011628
}

crates/vm/src/bytes_inner.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ pub(crate) const fn is_py_ascii_whitespace(b: u8) -> bool {
12211221
/// ASCII-only title casing.
12221222
///
12231223
/// This is purposely naive as is CPython's implementation.
1224-
pub fn title_ascii(bytes: &[u8]) -> Vec<u8> {
1224+
pub(crate) fn title_ascii(bytes: &[u8]) -> Vec<u8> {
12251225
let mut next_upper = true;
12261226
let mut out = Vec::with_capacity(bytes.len());
12271227
for &b in bytes {

0 commit comments

Comments
 (0)