Skip to content

Commit e69fa48

Browse files
committed
we don't need those
1 parent cf79654 commit e69fa48

1 file changed

Lines changed: 0 additions & 250 deletions

File tree

ws4py/utf8validator.py

Lines changed: 0 additions & 250 deletions
Original file line number | Diff line number | Diff line change
@@ -112,253 +112,3 @@ def validate(self, ba):
112112
self.i += l
113113
return True, self.state == Utf8Validator.UTF8_ACCEPT, l, self.i
114114

115-
116-
# Table of UTF-8 test vectors, filled in by setTestSequences() below.
# Each entry is a two-item list [description, cases]; cases is a list of
# (expected_valid, byte_string) tuples.
UTF8_TEST_SEQUENCES = []


def setTestSequences():
    """
    Setup test sequences for UTF-8 decoder tests from
    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    Fills the module-level UTF8_TEST_SEQUENCES list in place.  The list is
    emptied first, so calling this function more than once does not
    duplicate the groups.

    All byte strings are written as b'' literals so that they are real
    byte strings on both Python 2.6+ and Python 3.
    """
    # Start from a clean table (in place, so existing references stay valid).
    del UTF8_TEST_SEQUENCES[:]

    def add_group(title, cases):
        # Keep the [description, cases] list shape that consumers index into.
        UTF8_TEST_SEQUENCES.append([title, cases])

    def all_valid(*seqs):
        return [(True, s) for s in seqs]

    def all_invalid(*seqs):
        return [(False, s) for s in seqs]

    def decodes_as_utf8(data):
        # Independent oracle: ask Python's own codec rather than the
        # validator under test, otherwise the expectation is circular.
        try:
            data.decode("utf-8")
        except UnicodeDecodeError:
            return False
        return True

    # 1 Some correct UTF-8 text
    vss = b'\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5'
    add_group("Some valid UTF-8 sequences", all_valid(vss))

    # All prefixes of correct UTF-8 text; a prefix is only valid when it
    # ends on a code point boundary.
    add_group(
        "All prefixes of a valid UTF-8 string that contains multi-byte code points",
        [(decodes_as_utf8(vss[:i]), vss[:i]) for i in range(1, len(vss) + 1)],
    )

    # 2.1 First possible sequence of a certain length
    add_group(
        "First possible sequence of a certain length",
        all_valid(b'\x00', b'\xc2\x80', b'\xe0\xa0\x80', b'\xf0\x90\x80\x80'),
    )

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group(
        "First possible sequence length 5/6 (invalid codepoints)",
        all_invalid(b'\xf8\x88\x80\x80\x80', b'\xfc\x84\x80\x80\x80\x80'),
    )

    # 2.2 Last possible sequence of a certain length
    add_group(
        "Last possible sequence of a certain length",
        all_valid(b'\x7f', b'\xdf\xbf', b'\xef\xbf\xbf', b'\xf4\x8f\xbf\xbf'),
    )

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group(
        "Last possible sequence length 4/5/6 (invalid codepoints)",
        all_invalid(
            b'\xf7\xbf\xbf\xbf',
            b'\xfb\xbf\xbf\xbf\xbf',
            b'\xfd\xbf\xbf\xbf\xbf\xbf',
        ),
    )

    # 2.3 Other boundary conditions
    add_group(
        "Other boundary conditions",
        [
            (True, b'\xed\x9f\xbf'),
            (True, b'\xee\x80\x80'),
            (True, b'\xef\xbf\xbd'),
            (True, b'\xf4\x8f\xbf\xbf'),
            (False, b'\xf4\x90\x80\x80'),
        ],
    )

    # 3.1 Unexpected continuation bytes
    cases = all_invalid(
        b'\x80',
        b'\xbf',
        b'\x80\xbf',
        b'\x80\xbf\x80',
        b'\x80\xbf\x80\xbf',
        b'\x80\xbf\x80\xbf\x80',
        b'\x80\xbf\x80\xbf\x80\xbf',
    )
    # Every continuation byte 0x80..0xbf; the upper bound is inclusive,
    # hence 0xbf + 1 as the range stop.
    cases.append((False, bytes(bytearray(range(0x80, 0xbf + 1)))))
    add_group("Unexpected continuation bytes", cases)

    # 3.2 Lonely start characters: every lead byte of each class (bounds
    # inclusive), each one followed by a space.
    lead_byte_ranges = [(0xc0, 0xdf), (0xe0, 0xef), (0xf0, 0xf7), (0xf8, 0xfb), (0xfc, 0xfd)]
    cases = []
    for first, last in lead_byte_ranges:
        octets = bytearray()
        for lead in range(first, last + 1):
            octets.append(lead)
            octets.append(0x20)
        cases.append((False, bytes(octets)))
    add_group("Lonely start characters", cases)

    # 3.3 Sequences with last continuation byte missing
    k = [b'\xc0', b'\xe0\x80', b'\xf0\x80\x80', b'\xf8\x80\x80\x80', b'\xfc\x80\x80\x80\x80',
         b'\xdf', b'\xef\xbf', b'\xf7\xbf\xbf', b'\xfb\xbf\xbf\xbf', b'\xfd\xbf\xbf\xbf\xbf']
    add_group("Sequences with last continuation byte missing", all_invalid(*k))

    # 3.4 Concatenation of incomplete sequences
    add_group("Concatenation of incomplete sequences", all_invalid(b''.join(k)))

    # 3.5 Impossible bytes
    add_group("Impossible bytes", all_invalid(b'\xfe', b'\xff', b'\xfe\xfe\xff\xff'))

    # 4.1 Examples of an overlong ASCII character
    add_group(
        "Examples of an overlong ASCII character",
        all_invalid(
            b'\xc0\xaf',
            b'\xe0\x80\xaf',
            b'\xf0\x80\x80\xaf',
            b'\xf8\x80\x80\x80\xaf',
            b'\xfc\x80\x80\x80\x80\xaf',
        ),
    )

    # 4.2 Maximum overlong sequences
    add_group(
        "Maximum overlong sequences",
        all_invalid(
            b'\xc1\xbf',
            b'\xe0\x9f\xbf',
            b'\xf0\x8f\xbf\xbf',
            b'\xf8\x87\xbf\xbf\xbf',
            b'\xfc\x83\xbf\xbf\xbf\xbf',
        ),
    )

    # 4.3 Overlong representation of the NUL character
    add_group(
        "Overlong representation of the NUL character",
        all_invalid(
            b'\xc0\x80',
            b'\xe0\x80\x80',
            b'\xf0\x80\x80\x80',
            b'\xf8\x80\x80\x80\x80',
            b'\xfc\x80\x80\x80\x80\x80',
        ),
    )

    # 5.1 Single UTF-16 surrogates
    add_group(
        "Single UTF-16 surrogates",
        all_invalid(
            b'\xed\xa0\x80',
            b'\xed\xad\xbf',
            b'\xed\xae\x80',
            b'\xed\xaf\xbf',
            b'\xed\xb0\x80',
            b'\xed\xbe\x80',
            b'\xed\xbf\xbf',
        ),
    )

    # 5.2 Paired UTF-16 surrogates
    add_group(
        "Paired UTF-16 surrogates",
        all_invalid(
            b'\xed\xa0\x80\xed\xb0\x80',
            b'\xed\xa0\x80\xed\xbf\xbf',
            b'\xed\xad\xbf\xed\xb0\x80',
            b'\xed\xad\xbf\xed\xbf\xbf',
            b'\xed\xae\x80\xed\xb0\x80',
            b'\xed\xae\x80\xed\xbf\xbf',
            b'\xed\xaf\xbf\xed\xb0\x80',
            b'\xed\xaf\xbf\xed\xbf\xbf',
        ),
    )

    # 5.3 Other illegal code positions
    # Those are non-character code points and valid UTF-8 by RFC 3629
    add_group(
        "Non-character code points (valid UTF-8)",
        all_valid(b'\xef\xbf\xbe', b'\xef\xbf\xbf'),
    )

    # Unicode replacement character
    add_group("Unicode replacement character", all_valid(b'\xef\xbf\xbd'))


# Build the table once at import time.
setTestSequences()
290-
291-
292-
def test_utf8():
    """
    These tests verify the UTF-8 decoder/validator on the various test cases from
    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    On top of the table in UTF8_TEST_SEQUENCES, every code point of the
    Basic Multilingual Plane is checked: non-surrogates must be accepted,
    while single surrogates and surrogate pairs (in either order) must be
    rejected.

    Raises AssertionError naming the first byte sequence whose verdict
    differs from the expectation.
    """
    v = Utf8Validator()

    def encode_bmp(cp):
        # Hand-rolled UTF-8 encoding for code points up to U+FFFF.  Python's
        # own codec refuses to encode surrogates on Python 3, and those are
        # exactly the byte sequences that have to be fed to the validator.
        if cp < 0x80:
            octets = [cp]
        elif cp < 0x800:
            octets = [0xC0 | (cp >> 6), 0x80 | (cp & 0x3F)]
        else:
            octets = [0xE0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3F), 0x80 | (cp & 0x3F)]
        return bytes(bytearray(octets))

    def check(expected, data):
        v.reset()
        r = v.validate(bytearray(data))
        res = r[0] and r[1]  # no UTF-8 decode error and everything consumed
        assert res == expected, "expected valid=%r for %r, validator returned %r" % (expected, data, r)

    for group in UTF8_TEST_SEQUENCES:
        for expected, data in group[1]:
            check(expected, data)

    # All Unicode code points of the BMP, U+0000..U+FFFF inclusive.  Code
    # points above the BMP are only covered by the boundary cases in
    # UTF8_TEST_SEQUENCES, not exhaustively.
    for cp in range(0x0000, 0xFFFF + 1):
        if cp < 0xD800 or cp > 0xDFFF:  # filter surrogate code points, which are disallowed to encode in UTF-8
            check(True, encode_bmp(cp))

    # Encode each surrogate once; upper bounds are inclusive.
    high_surrogates = [encode_bmp(cp) for cp in range(0xD800, 0xDBFF + 1)]
    low_surrogates = [encode_bmp(cp) for cp in range(0xDC00, 0xDFFF + 1)]

    # 5.1 Single UTF-16 surrogates
    for ss in high_surrogates:
        check(False, ss)
    for ss in low_surrogates:
        check(False, ss)

    # 5.2 Paired UTF-16 surrogates
    for ss1 in high_surrogates:
        for ss2 in low_surrogates:
            check(False, ss1 + ss2)
            check(False, ss2 + ss1)
330-
331-
332-
def test_utf8_incremental():
    """
    These tests verify that the UTF-8 decoder/validator can operate incrementally.

    Each assertion compares the complete 4-tuple returned by validate().
    Going by how the other tests use it, the first item means "no decode
    error so far" and the second "input ended on a code point boundary";
    the last two are positions within the current chunk and within the
    whole stream since the last reset().
    """
    v = Utf8Validator()

    v.reset()
    # Encode explicitly so the 15 input bytes do not depend on the encoding
    # of this source file, and so a byte string is passed on Python 3 too.
    assert (True, True, 15, 15) == v.validate(bytearray(u"µ@ßöäüàá".encode("utf-8")))

    v.reset()
    assert (False, False, 0, 0) == v.validate(bytearray([0xF5]))

    ## the following 3 all fail on eating byte 7 (0xA0)
    v.reset()
    assert (True, True, 6, 6) == v.validate(bytearray([0x65, 0x64, 0x69, 0x74, 0x65, 0x64]))
    assert (False, False, 1, 7) == v.validate(bytearray([0xED, 0xA0, 0x80]))

    v.reset()
    assert (True, True, 4, 4) == v.validate(bytearray([0x65, 0x64, 0x69, 0x74]))
    assert (False, False, 3, 7) == v.validate(bytearray([0x65, 0x64, 0xED, 0xA0, 0x80]))

    v.reset()
    # The first chunk ends in the middle of a multi-byte sequence: no error
    # yet, but not on a code point boundary either.
    assert (True, False, 7, 7) == v.validate(bytearray([0x65, 0x64, 0x69, 0x74, 0x65, 0x64, 0xED]))
    assert (False, False, 0, 7) == v.validate(bytearray([0xA0, 0x80]))
357-
358-
359-
if __name__ == '__main__':
    # Run unit tests: the incremental checks first, then the full table.
    for unit_test in (test_utf8_incremental, test_utf8):
        unit_test()

0 commit comments

Comments
 (0)