Skip to content

Commit 8929a3e

Browse files
committed
create elementary functions for utf8
1 parent 5e0f88e commit 8929a3e

1 file changed

Lines changed: 188 additions & 0 deletions

File tree

src/rust/src/utf8.rs

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
2+
/// Returns the number of Unicode code points encoded in `string`.
///
/// Every UTF-8 sequence is one leading byte followed by zero or more
/// continuation bytes of the form `0b10xx_xxxx`. Counting the bytes that are
/// *not* continuation bytes therefore yields one hit per encoded code point.
pub fn length_of(string: String) -> usize {
    string
        .bytes()
        // Keep only leading bytes; continuation bytes carry the `10` prefix.
        .filter(|&byte| (byte & 0b1100_0000) != 0b1000_0000)
        .count()
}
34+
35+
/// Reports whether the bytes held by `string` are well-formed UTF-8.
///
/// The check is delegated to [`std::str::from_utf8`], so the function returns
/// `false` for stray continuation bytes, truncated multi-byte sequences,
/// overlong encodings, encoded surrogates and values above U+10FFFF.
///
/// An empty string is considered valid.
///
/// A `String` built through safe APIs is always valid UTF-8, so this can only
/// return `false` for values whose bytes were produced by `unsafe` code.
pub fn validate(string: String) -> bool {
    // Re-examine the raw bytes rather than trusting the `String` invariant.
    std::str::from_utf8(string.as_bytes()).is_ok()
}
73+
74+
/// Decodes `string` into its sequence of Unicode code points.
///
/// Each element of the returned vector is the scalar value of one character,
/// in the order the characters appear in `string`. U+0000 is preserved like
/// any other character, including at the start or end of the input.
///
/// Returns an empty vector when `string` is empty or when its bytes are not
/// well-formed UTF-8.
pub fn convert_utf8_from(string: String) -> Vec<u32> {
    // Validate the raw bytes first so malformed input yields an empty result
    // instead of garbage code points.
    match std::str::from_utf8(string.as_bytes()) {
        // `chars()` handles every sequence length (1 to 4 bytes) correctly.
        Ok(text) => text.chars().map(u32::from).collect(),
        Err(_) => Vec::new(),
    }
}
137+
138+
/// Encodes a sequence of Unicode code points as a UTF-8 `String`.
///
/// Code points are appended in order. Two kinds of invalid input are handled:
///
/// * values above U+10FFFF are skipped and the remaining code points are
///   still encoded;
/// * surrogate values (U+D800 to U+DFFF) cannot be represented in UTF-8, so
///   the whole conversion yields an empty string.
///
/// An empty `vec` produces an empty string.
pub fn convert_utf8_to(vec: Vec<u32>) -> String {
    // Every code point needs at least one byte, so this is a lower bound.
    let mut result = String::with_capacity(vec.len());

    for v in vec {
        if v > 0x0010_FFFF {
            // Outside the Unicode range: ignore this value.
            continue;
        }
        match char::from_u32(v) {
            // `String::push` emits the correct 1- to 4-byte encoding.
            Some(ch) => result.push(ch),
            // Only surrogates reach this arm; the output would not be valid UTF-8.
            None => return String::new(),
        }
    }

    result
}
172+
173+
/// Round-trips the Hangul syllable "가" (U+AC00, three UTF-8 bytes) through
/// `convert_utf8_from` and `convert_utf8_to`, checking both directions.
#[test]
fn test() {
    let a = "가".to_string();
    // Dump the input bytes in binary for easier debugging on failure.
    for i in a.as_bytes() {
        print!("{:#010b} ", i);
    }
    println!();

    let c = convert_utf8_from(a.clone());
    println!("{:?}", c);
    // Decoding must yield exactly one code point: U+AC00.
    assert_eq!(c, vec![0xAC00]);

    let s = convert_utf8_to(c);
    for i in s.as_bytes() {
        print!("{:#010b} ", i);
    }
    println!();
    println!("{}", s);
    // Encoding the decoded code points must reproduce the original text.
    assert_eq!(s, a);
}

0 commit comments

Comments
 (0)