@@ -782,7 +782,9 @@ impl PyStr {
782782 let mut chars = s. char_indices ( ) ;
783783 let mut out = VecFmtWriter ( Vec :: with_capacity ( s. len ( ) ) ) ;
784784 titlecase_first ( s, & mut chars, & mut out) ;
785- lowercase_and_sigma ( s, & mut chars, & mut out) ;
785+ for ( i, ch) in chars {
786+ lowercase_or_sigma ( ch, s, i, & mut out) ;
787+ }
786788 unsafe { Wtf8Buf :: from_bytes_unchecked ( out. 0 ) }
787789 }
788790 PyKindStr :: Wtf8 ( s) => {
@@ -793,15 +795,18 @@ impl PyStr {
793795 let s = first. valid ( ) ;
794796 let mut chars = s. char_indices ( ) ;
795797 titlecase_first ( s, & mut chars, & mut out) ;
796- lowercase_and_sigma ( s, & mut chars, & mut out) ;
798+ for ( i, ch) in chars {
799+ lowercase_or_sigma ( ch, s, i, & mut out) ;
800+ }
797801 out. 0 . extend ( first. invalid ( ) ) ;
798802 }
799803 // This loop is only hit if the WTF-8 buffer contains invalid Unicode. Otherwise,
800804 // everything is handled above without chunking.
801805 for chunk in chunks {
802806 let s = chunk. valid ( ) ;
803- let mut chars = s. char_indices ( ) ;
804- lowercase_and_sigma ( s, & mut chars, & mut out) ;
807+ for ( i, ch) in s. char_indices ( ) {
808+ lowercase_or_sigma ( ch, s, i, & mut out) ;
809+ }
805810 out. 0 . extend ( chunk. invalid ( ) ) ;
806811 }
807812
@@ -1076,28 +1081,22 @@ impl PyStr {
10761081 PyKindStr :: Ascii ( _) => unsafe {
10771082 Wtf8Buf :: from_bytes_unchecked ( crate :: bytes_inner:: title_ascii ( self . as_bytes ( ) ) )
10781083 } ,
1079- PyKindStr :: Utf8 ( s) => TitlecaseMapper :: new ( )
1080- . titlecase_segment_to_string ( s, & LanguageIdentifier :: UNKNOWN , Default :: default ( ) )
1081- . to_string ( )
1082- . into ( ) ,
1084+ PyKindStr :: Utf8 ( s) => {
1085+ let mut out = VecFmtWriter ( Vec :: with_capacity ( s. len ( ) ) ) ;
1086+ titlecase_string ( s, & mut out) ;
1087+ // SAFETY: `s` is valid UTF-8 and `titlecase_string` only appends valid UTF-8 to `out`.
1088+ unsafe { Wtf8Buf :: from_bytes_unchecked ( out. 0 ) }
1089+ }
10831090 PyKindStr :: Wtf8 ( s) => {
1084- let mut buf = VecFmtWriter ( Vec :: with_capacity ( s. len ( ) ) ) ;
1085- let mapper = TitlecaseMapper :: new ( ) ;
1091+ let mut out = VecFmtWriter ( Vec :: with_capacity ( s. len ( ) ) ) ;
10861092 for chunk in s. as_bytes ( ) . utf8_chunks ( ) {
1087- mapper
1088- . titlecase_segment (
1089- chunk. valid ( ) ,
1090- & LanguageIdentifier :: UNKNOWN ,
1091- Default :: default ( ) ,
1092- )
1093- . write_to ( & mut buf)
1094- . expect ( "Writing to an in-memory buffer cannot fail." ) ;
1095- buf. 0 . extend ( chunk. invalid ( ) ) ;
1093+ titlecase_string ( chunk. valid ( ) , & mut out) ;
1094+ out. 0 . extend ( chunk. invalid ( ) ) ;
10961095 }
10971096 // SAFETY:
10981097 // * `s` is valid WTF-8; surrogate bytes were appended without processing.
10991098 // * `titlecase_string` only appends valid UTF-8.
1100- unsafe { Wtf8Buf :: from_bytes_unchecked ( buf . 0 ) }
1099+ unsafe { Wtf8Buf :: from_bytes_unchecked ( out . 0 ) }
11011100 }
11021101 }
11031102 }
@@ -1572,6 +1571,11 @@ impl PyStr {
15721571 }
15731572}
15741573
1574+ /// Title cases the first char if it is cased; otherwise writes it as is.
1575+ ///
1576+ /// This matches CPython's behavior:
1577+ /// "123abc" -> "123abc"
1578+ /// "abc" -> "Abc"
15751579fn titlecase_first ( s : & str , chars : & mut core:: str:: CharIndices < ' _ > , out : & mut VecFmtWriter ) {
15761580 if let Some ( ( first_pos, first_ch) ) = chars. next ( ) {
15771581 let first = & s[ ..first_pos + first_ch. len_utf8 ( ) ] ;
@@ -1582,20 +1586,43 @@ fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut Ve
15821586 }
15831587}
15841588
1585- fn lowercase_and_sigma ( s : & str , chars : & mut core:: str:: CharIndices < ' _ > , out : & mut VecFmtWriter ) {
1589+ /// Title case a string following CPython conventions.
1590+ ///
1591+ /// A char that follows a cased char is lowercased (with special handling for 'Σ'); any other char
1592+ /// is title cased. Segments are therefore split by uncased characters rather than whitespace.
1593+ /// "123abc" -> "123Abc"
1594+ /// "123abc456def" -> "123Abc456Def"
1595+ /// "123 abc" -> "123 Abc"
1596+ fn titlecase_string ( s : & str , out : & mut VecFmtWriter ) {
1597+ let mut previous_is_cased = false ;
1598+ let mapper = TitlecaseMapper :: new ( ) ;
1599+ for ( i, ch) in s. char_indices ( ) {
1600+ if previous_is_cased {
1601+ lowercase_or_sigma ( ch, s, i, out) ;
1602+ } else {
1603+ let s = & s[ i..i + ch. len_utf8 ( ) ] ; // one-char slice holding just `ch`
1604+ mapper
1605+ . titlecase_segment ( s, & LanguageIdentifier :: UNKNOWN , Default :: default ( ) )
1606+ . write_to ( out)
1607+ . expect ( "Writing to an in-memory buffer cannot fail." ) ;
1608+ }
1609+
1610+ previous_is_cased = Cased :: for_char ( ch) ; // decides how the next char is handled
1611+ }
1612+ }
1613+
1614+ fn lowercase_or_sigma ( ch : char , s : & str , i : usize , out : & mut VecFmtWriter ) {
15861615 let sigma = 'Σ' ;
1587- for ( i, ch) in chars {
1588- if ch == sigma {
1589- let sigma_cased = handle_capital_sigma ( s, i) ;
1616+ if ch == sigma {
1617+ let sigma_cased = handle_capital_sigma ( s, i) ;
1618+ let mut buf = [ 0u8 ; 4 ] ;
1619+ let s = sigma_cased. encode_utf8 ( & mut buf) ;
1620+ out. 0 . extend ( s. as_bytes ( ) ) ;
1621+ } else {
1622+ for ch in ch. to_lowercase ( ) {
15901623 let mut buf = [ 0u8 ; 4 ] ;
1591- let s = sigma_cased . encode_utf8 ( & mut buf) ;
1624+ let s = ch . encode_utf8 ( & mut buf) ;
15921625 out. 0 . extend ( s. as_bytes ( ) ) ;
1593- } else {
1594- for ch in ch. to_lowercase ( ) {
1595- let mut buf = [ 0u8 ; 4 ] ;
1596- let s = ch. encode_utf8 ( & mut buf) ;
1597- out. 0 . extend ( s. as_bytes ( ) ) ;
1598- }
15991626 }
16001627 }
16011628}
0 commit comments