Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion tests/snippets/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@

# make trans
# fmt: off
assert (
assert (
bytes.maketrans(memoryview(b"abc"), bytearray(b"zzz"))
== bytes([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 122, 122, 122, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255])
)
Expand Down Expand Up @@ -597,3 +597,11 @@
assert a * 1 == b'abcd'
assert a * 3 == b'abcdabcdabcd'
assert 3 * a == b'abcdabcdabcd'

# decode
# Pure-ASCII input decodes unchanged.
assert b'\x72\x75\x73\x74'.decode('ascii') == 'rust'
# With the 'ascii' codec, each of the two non-ASCII bytes (0xC2, 0xAE) is
# replaced by its own U+FFFD under errors='replace' ...
assert b'\xc2\xae\x75\x73\x74'.decode('ascii', 'replace') == '��ust'
# ... and dropped entirely under errors='ignore'.
assert b'\xc2\xae\x75\x73\x74'.decode('ascii', 'ignore') == 'ust'
# The same bytes are a valid two-byte UTF-8 sequence for U+00AE.
assert b'\xc2\xae\x75\x73\x74'.decode('utf-8') == '®ust'
# Omitting the encoding gives the same result as 'utf-8'.
assert b'\xc2\xae\x75\x73\x74'.decode() == '®ust'
# Three-byte UTF-8 sequences.
assert b'\xe4\xb8\xad\xe6\x96\x87\xe5\xad\x97'.decode('utf-8') == '中文字'
1 change: 1 addition & 0 deletions vm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ proc-macro-hack = { version = "0.5", optional = true }
bitflags = "1.1"
libc = "0.2"
nix = "0.14.1"
wtf8 = "0.0.3"

flame = { version = "0.2", optional = true }
flamer = { version = "0.3", optional = true }
Expand Down
109 changes: 109 additions & 0 deletions vm/src/obj/objbytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ use super::objiter;

use super::objtype::PyClassRef;

use wtf8;

/// "bytes(iterable_of_ints) -> bytes\n\
/// bytes(string, encoding[, errors]) -> bytes\n\
/// bytes(bytes_or_buffer) -> immutable copy of bytes_or_buffer\n\
Expand Down Expand Up @@ -420,6 +422,113 @@ impl PyBytesRef {
/// Reflected multiplication: forwards to `repeat` with the same count `n`,
/// so the result is whatever `repeat` returns (including its errors).
// NOTE(review): presumably exposed to Python as `__rmul__`; the attribute is
// outside the visible hunk — confirm.
fn rmul(self, n: PyIntRef, vm: &VirtualMachine) -> PyResult {
self.repeat(n, vm)
}

/// Return a string decoded from the given bytes.
/// Default encoding is 'utf-8'.
/// Default errors is 'strict', meaning that encoding errors raise a UnicodeError.
/// Other possible values are 'ignore', 'replace'
/// For a list of possible encodings,
/// see https://docs.python.org/3/library/codecs.html#standard-encodings
/// currently, only 'utf-8' and 'ascii' emplemented
#[pymethod(name = "decode")]
fn decode(
self,
encoding: OptionalArg<PyStringRef>,
errors: OptionalArg<PyStringRef>,
vm: &VirtualMachine,
) -> PyResult<String> {
let mut strict_mod = true;
let replacing_char = match errors {
OptionalArg::Present(ref input) => match input.as_str() {
"replace" => {
strict_mod = false;
Some('\u{FFFD}')
}
"ignore" => {
strict_mod = false;
None
}
_ => None,
},
OptionalArg::Missing => None,
};
let encoding_type = match encoding {
OptionalArg::Present(ref input) => input.as_str(),
OptionalArg::Missing => "utf-8",
};

let decode_error = Err(vm.new_value_error("DecodeError".to_string()));

let mut decode_content = String::new();
match encoding_type {
"ascii" => {
for &b in self.get_value() {
if b.is_ascii() {
decode_content.push(b as char)
} else if !strict_mod && replacing_char.is_some() {
decode_content.push(replacing_char.unwrap())
}
}
}
"utf-8" | "utf8" | "" => {
let mut p: u32 = 0u32;
let mut remaining_bytes = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

utf-8 is the default, but cpython throws a LookupError if you give it an invalid encoding.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, I will implement this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done :)

for &b in self.get_value() {
if (b as u8) & 128 == 0 {
if b.is_ascii() {
decode_content.push(b as char)
} else if !strict_mod && replacing_char.is_some() {
decode_content.push(replacing_char.unwrap())
}
} else if (b as u8) & 192 == 128 {
remaining_bytes -= 1;

p += u32::from(b as u8 & 63) << (6 * remaining_bytes);

if remaining_bytes == 0 {
match wtf8::CodePoint::from_u32(p) {
Some(cp) => {
if !strict_mod && replacing_char.is_some() {
decode_content.push(cp.to_char_lossy());
} else {
match cp.to_char() {
Some(c) => decode_content.push(c),
None => {
if strict_mod {
return decode_error;
}
}
}
}
}
None => {
if replacing_char.is_none() {
decode_content.push(replacing_char.unwrap())
}
}
}
p = 0u32;
}
} else if (b as u8) & 224 == 192 {
remaining_bytes = 1;
p = u32::from(b as u8 & 31) << 6;
} else if (b as u8) & 240 == 224 {
remaining_bytes = 2;
p = u32::from(b as u8 & 15) << 12;
} else if (b as u8) & 248 == 240 {
remaining_bytes = 3;
p = u32::from(b as u8 & 7) << 18;
} else if !strict_mod && replacing_char.is_some() {
decode_content.push(replacing_char.unwrap())
}
}
}
_ => {
return Err(vm.new_lookup_error(format!("unknown encoding: {}", encoding_type)));
}
}
Ok(decode_content)
}
}

#[pyclass]
Expand Down
5 changes: 5 additions & 0 deletions vm/src/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ impl VirtualMachine {
self.new_exception_obj(exc_type, vec![pystr_msg]).unwrap()
}

/// Build an exception object of the VM's `lookup_error` type carrying `msg`
/// as its message.
pub fn new_lookup_error(&self, msg: String) -> PyObjectRef {
    self.new_exception(self.ctx.exceptions.lookup_error.clone(), msg)
}

pub fn new_attribute_error(&self, msg: String) -> PyObjectRef {
let attribute_error = self.ctx.exceptions.attribute_error.clone();
self.new_exception(attribute_error, msg)
Expand Down