Skip to content

Commit ef375be

Browse files
Fix swapcase() (RustPython#7788)
The tests for swapcase() were failing for two reasons. The first is the casing of '𐐧', which should be fixed by using modern Unicode tables. The second is CPython's capital-sigma override, which I implemented in PR RustPython#7717.
1 parent 4059a03 commit ef375be

3 files changed

Lines changed: 53 additions & 24 deletions

File tree

Lib/test/test_str.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -972,7 +972,6 @@ def test_title(self):
972972
self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
973973
self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
974974

975-
@unittest.expectedFailure # TODO: RUSTPYTHON; + 𐐧
976975
def test_swapcase(self):
977976
string_tests.StringLikeTest.test_swapcase(self)
978977
self.assertEqual('\U0001044F'.swapcase(), '\U00010427')

crates/vm/src/builtins/str.rs

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@ use crate::{
1111
TryFromBorrowedObject, VirtualMachine,
1212
anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
1313
atomic_func,
14+
bytes_inner::{swapcase_ascii, title_ascii},
1415
cformat::cformat_string,
1516
class::PyClassImpl,
16-
common::lock::LazyLock,
17-
common::str::{PyKindStr, StrData, StrKind},
17+
common::{
18+
lock::LazyLock,
19+
str::{PyKindStr, StrData, StrKind},
20+
},
1821
convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
1922
format::{format, format_map},
2023
function::{ArgIterable, ArgSize, FuncArgs, OptionalArg, OptionalOption, PyComparisonValue},
@@ -1078,7 +1081,7 @@ impl PyStr {
10781081
fn title(&self) -> Wtf8Buf {
10791082
match self.as_str_kind() {
10801083
PyKindStr::Ascii(_) => unsafe {
1081-
Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes()))
1084+
Wtf8Buf::from_bytes_unchecked(title_ascii(self.as_bytes()))
10821085
},
10831086
PyKindStr::Utf8(s) => {
10841087
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
@@ -1102,19 +1105,29 @@ impl PyStr {
11021105

11031106
#[pymethod]
11041107
fn swapcase(&self) -> Wtf8Buf {
1105-
let mut swapped_str = Wtf8Buf::with_capacity(self.data.len());
1106-
for c_orig in self.as_wtf8().code_points() {
1107-
let c = c_orig.to_char_lossy();
1108-
// to_uppercase returns an iterator because case changes may be multiple bytes
1109-
if c.is_lowercase() {
1110-
swapped_str.extend(c.to_uppercase());
1111-
} else if c.is_uppercase() {
1112-
swapped_str.extend(c.to_lowercase());
1113-
} else {
1114-
swapped_str.push(c_orig);
1108+
match self.as_str_kind() {
1109+
PyKindStr::Ascii(s) => unsafe {
1110+
// SAFETY: ASCII is valid Unicode and swapcase_ascii does not produce non-ASCII.
1111+
Wtf8Buf::from_bytes_unchecked(swapcase_ascii(s.as_bytes()))
1112+
},
1113+
PyKindStr::Utf8(s) => {
1114+
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
1115+
swapcase_utf8(s, &mut out);
1116+
// SAFETY: `s` is valid UTF-8 and swapcase_utf8 only works on Unicode.
1117+
unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }
1118+
}
1119+
PyKindStr::Wtf8(s) => {
1120+
let mut out = VecFmtWriter(Vec::with_capacity(s.len()));
1121+
for chunk in s.as_bytes().utf8_chunks() {
1122+
swapcase_utf8(chunk.valid(), &mut out);
1123+
out.0.extend(chunk.invalid());
1124+
}
1125+
// SAFETY:
1126+
// * `s` is valid WTF-8; surrogate bytes were appended without processing.
1127+
// * swapcase_utf8 produces valid UTF-8.
1128+
unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }
11151129
}
11161130
}
1117-
swapped_str
11181131
}
11191132

11201133
#[pymethod]
@@ -1648,6 +1661,24 @@ fn handle_capital_sigma(s: &str, i: usize) -> char {
16481661
if before && !after { 'ς' } else { 'σ' }
16491662
}
16501663

1664+
fn swapcase_utf8(s: &str, out: &mut VecFmtWriter) {
1665+
for (i, ch) in s.char_indices() {
1666+
if ch.is_uppercase() {
1667+
lowercase_or_sigma(ch, s, i, out);
1668+
} else if ch.is_lowercase() {
1669+
for ch in ch.to_uppercase() {
1670+
let mut buf = [0u8; 4];
1671+
let s = ch.encode_utf8(&mut buf);
1672+
out.0.extend(s.as_bytes());
1673+
}
1674+
} else {
1675+
let mut buf = [0u8; 4];
1676+
let s = ch.encode_utf8(&mut buf);
1677+
out.0.extend(s.as_bytes());
1678+
}
1679+
}
1680+
}
1681+
16511682
impl PyRef<PyStr> {
16521683
#[must_use]
16531684
pub fn is_empty(&self) -> bool {

crates/vm/src/bytes_inner.rs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -413,15 +413,7 @@ impl PyBytesInner {
413413
}
414414

415415
pub fn swapcase(&self) -> Vec<u8> {
416-
let mut new: Vec<u8> = Vec::with_capacity(self.elements.len());
417-
for w in &self.elements {
418-
match w {
419-
b'A'..=b'Z' => new.push(w.to_ascii_lowercase()),
420-
b'a'..=b'z' => new.push(w.to_ascii_uppercase()),
421-
x => new.push(*x),
422-
}
423-
}
424-
new
416+
swapcase_ascii(self.as_bytes())
425417
}
426418

427419
pub fn hex(
@@ -1238,3 +1230,10 @@ pub(crate) fn title_ascii(bytes: &[u8]) -> Vec<u8> {
12381230
}
12391231
out
12401232
}
1233+
1234+
pub(crate) fn swapcase_ascii(bytes: &[u8]) -> Vec<u8> {
1235+
bytes
1236+
.iter()
1237+
.map(|&b| if b.is_ascii_alphabetic() { b ^ 0x20 } else { b })
1238+
.collect()
1239+
}

0 commit comments

Comments
 (0)