@@ -112,253 +112,3 @@ def validate(self, ba):
112112 self .i += l
113113 return True , self .state == Utf8Validator .UTF8_ACCEPT , l , self .i
114114
115-
# Registry of UTF-8 test groups, filled in by setTestSequences().
# Each entry is a two-element list [description, cases], where cases is a
# list of (expected_valid, byte_string) tuples.
UTF8_TEST_SEQUENCES = []
117-
118-
def setTestSequences():
    """
    Setup test sequences for UTF-8 decoder tests from
    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    Fills the module-level list UTF8_TEST_SEQUENCES. Every appended entry is
    a two-element list ``[description, cases]`` where ``cases`` is a list of
    ``(expected_valid, byte_string)`` tuples.

    The list is emptied (in place) before it is filled, so calling this
    function more than once does not duplicate entries.
    """

    # Clear in place so that existing references to the list stay valid.
    del UTF8_TEST_SEQUENCES[:]

    def add_group(title, cases):
        # Register one named group of (expected_valid, byte_string) cases.
        UTF8_TEST_SEQUENCES.append([title, list(cases)])

    def valid(*sequences):
        # Tag every given byte string as expected-valid.
        return [(True, seq) for seq in sequences]

    def invalid(*sequences):
        # Tag every given byte string as expected-invalid.
        return [(False, seq) for seq in sequences]

    # 1 Some correct UTF-8 text
    vss = '\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5'
    add_group("Some valid UTF-8 sequences", valid(vss))

    # All prefixes of correct UTF-8 text
    # NOTE: the expected flag of each prefix is whatever the validator itself
    # reports (no error and ended in the accept state), not an independently
    # computed value.
    v = Utf8Validator()
    prefix_cases = []
    for end in xrange(1, len(vss) + 1):
        prefix = vss[:end]
        v.reset()
        res = v.validate(bytearray(prefix))
        prefix_cases.append((res[0] and res[1], prefix))
    add_group("All prefixes of a valid UTF-8 string that contains multi-byte code points",
              prefix_cases)

    # 2.1 First possible sequence of a certain length
    add_group("First possible sequence of a certain length",
              valid('\x00',
                    '\xc2\x80',
                    '\xe0\xa0\x80',
                    '\xf0\x90\x80\x80'))

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group("First possible sequence length 5/6 (invalid codepoints)",
              invalid('\xf8\x88\x80\x80\x80',
                      '\xfc\x84\x80\x80\x80\x80'))

    # 2.2 Last possible sequence of a certain length
    add_group("Last possible sequence of a certain length",
              valid('\x7f',
                    '\xdf\xbf',
                    '\xef\xbf\xbf',
                    '\xf4\x8f\xbf\xbf'))

    # the following conform to the UTF-8 integer encoding scheme, but
    # valid UTF-8 only allows for Unicode code points up to U+10FFFF
    add_group("Last possible sequence length 4/5/6 (invalid codepoints)",
              invalid('\xf7\xbf\xbf\xbf',
                      '\xfb\xbf\xbf\xbf\xbf',
                      '\xfd\xbf\xbf\xbf\xbf\xbf'))

    # 2.3 Other boundary conditions
    add_group("Other boundary conditions",
              valid('\xed\x9f\xbf',
                    '\xee\x80\x80',
                    '\xef\xbf\xbd',
                    '\xf4\x8f\xbf\xbf')
              + invalid('\xf4\x90\x80\x80'))

    # 3.1 Unexpected continuation bytes
    continuation_cases = invalid('\x80',
                                 '\xbf',
                                 '\x80\xbf',
                                 '\x80\xbf\x80',
                                 '\x80\xbf\x80\xbf',
                                 '\x80\xbf\x80\xbf\x80',
                                 '\x80\xbf\x80\xbf\x80\xbf')
    # All 64 continuation bytes 0x80..0xbf in one string; xrange excludes its
    # stop value, hence 0xc0.
    all_continuation = ''.join(chr(i) for i in xrange(0x80, 0xc0))
    continuation_cases.append((False, all_continuation))
    add_group("Unexpected continuation bytes", continuation_cases)

    # 3.2 Lonely start characters
    # Inclusive (first, last) lead byte of 2-, 3-, 4-, 5- and 6-byte sequences.
    lead_ranges = [(0xc0, 0xdf), (0xe0, 0xef), (0xf0, 0xf7), (0xf8, 0xfb), (0xfc, 0xfd)]
    lonely_cases = []
    for first, last in lead_ranges:
        # Every lead byte of the range, each followed by a space; "last + 1"
        # because xrange excludes its stop value.
        s = ''.join(chr(i) + chr(0x20) for i in xrange(first, last + 1))
        lonely_cases.append((False, s))
    add_group("Lonely start characters", lonely_cases)

    # 3.3 Sequences with last continuation byte missing
    k = ['\xc0', '\xe0\x80', '\xf0\x80\x80', '\xf8\x80\x80\x80', '\xfc\x80\x80\x80\x80',
         '\xdf', '\xef\xbf', '\xf7\xbf\xbf', '\xfb\xbf\xbf\xbf', '\xfd\xbf\xbf\xbf\xbf']
    add_group("Sequences with last continuation byte missing", invalid(*k))

    # 3.4 Concatenation of incomplete sequences
    add_group("Concatenation of incomplete sequences", invalid(''.join(k)))

    # 3.5 Impossible bytes
    add_group("Impossible bytes",
              invalid('\xfe',
                      '\xff',
                      '\xfe\xfe\xff\xff'))

    # 4.1 Examples of an overlong ASCII character
    add_group("Examples of an overlong ASCII character",
              invalid('\xc0\xaf',
                      '\xe0\x80\xaf',
                      '\xf0\x80\x80\xaf',
                      '\xf8\x80\x80\x80\xaf',
                      '\xfc\x80\x80\x80\x80\xaf'))

    # 4.2 Maximum overlong sequences
    add_group("Maximum overlong sequences",
              invalid('\xc1\xbf',
                      '\xe0\x9f\xbf',
                      '\xf0\x8f\xbf\xbf',
                      '\xf8\x87\xbf\xbf\xbf',
                      '\xfc\x83\xbf\xbf\xbf\xbf'))

    # 4.3 Overlong representation of the NUL character
    add_group("Overlong representation of the NUL character",
              invalid('\xc0\x80',
                      '\xe0\x80\x80',
                      '\xf0\x80\x80\x80',
                      '\xf8\x80\x80\x80\x80',
                      '\xfc\x80\x80\x80\x80\x80'))

    # 5.1 Single UTF-16 surrogates
    add_group("Single UTF-16 surrogates",
              invalid('\xed\xa0\x80',
                      '\xed\xad\xbf',
                      '\xed\xae\x80',
                      '\xed\xaf\xbf',
                      '\xed\xb0\x80',
                      '\xed\xbe\x80',
                      '\xed\xbf\xbf'))

    # 5.2 Paired UTF-16 surrogates
    add_group("Paired UTF-16 surrogates",
              invalid('\xed\xa0\x80\xed\xb0\x80',
                      '\xed\xa0\x80\xed\xbf\xbf',
                      '\xed\xad\xbf\xed\xb0\x80',
                      '\xed\xad\xbf\xed\xbf\xbf',
                      '\xed\xae\x80\xed\xb0\x80',
                      '\xed\xae\x80\xed\xbf\xbf',
                      '\xed\xaf\xbf\xed\xb0\x80',
                      '\xed\xaf\xbf\xed\xbf\xbf'))

    # 5.3 Other illegal code positions
    # Those are non-character code points and valid UTF-8 by RFC 3629
    add_group("Non-character code points (valid UTF-8)",
              valid('\xef\xbf\xbe',
                    '\xef\xbf\xbf'))

    # Unicode replacement character
    add_group("Unicode replacement character", valid('\xef\xbf\xbd'))
287-
288-
289- setTestSequences ()
290-
291-
def test_utf8():
    """
    These tests verify the UTF-8 decoder/validator on the various test cases from
    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    In addition to the sequences registered in UTF8_TEST_SEQUENCES, the
    encoding of every code point U+0000..U+FFFF is checked: non-surrogates
    must be accepted, while single surrogates and surrogate pairs (in both
    orders) must be rejected.

    Raises AssertionError naming the offending byte sequence on the first
    mismatch.
    """

    v = Utf8Validator()

    # Flatten the (expected_valid, byte_string) cases of all registered groups.
    vs = []
    for group in UTF8_TEST_SEQUENCES:
        vs.extend(group[1])

    # All Unicode code points U+0000..U+FFFF (xrange excludes its stop value).
    # Should be up to 0x10ffff, but a non-wide Python build is limited to 16 bits.
    for i in xrange(0, 0x10000):
        # filter surrogate code points, which are disallowed to encode in UTF-8
        if i < 0xD800 or i > 0xDFFF:
            vs.append((True, unichr(i).encode("utf-8")))

    # Encode every surrogate once up front instead of inside the nested loop.
    # The stop values are one past U+DBFF / U+DFFF so both are included.
    high_surrogates = [unichr(i).encode("utf-8") for i in xrange(0xD800, 0xDC00)]
    low_surrogates = [unichr(i).encode("utf-8") for i in xrange(0xDC00, 0xE000)]

    # 5.1 Single UTF-16 surrogates
    vs.extend((False, ss) for ss in high_surrogates)
    vs.extend((False, ss) for ss in low_surrogates)

    # 5.2 Paired UTF-16 surrogates (high+low and low+high)
    for ss1 in high_surrogates:
        for ss2 in low_surrogates:
            vs.append((False, ss1 + ss2))
            vs.append((False, ss2 + ss1))

    # now test and assert ..
    for expected, data in vs:
        v.reset()
        r = v.validate(bytearray(data))
        res = r[0] and r[1]  # no UTF-8 decode error and everything consumed
        assert res == expected, \
            "expected valid=%r for %r, but validator returned %r" % (expected, data, r)
330-
331-
def test_utf8_incremental():
    """
    Check that the UTF-8 decoder/validator can be fed its input in several
    chunks, comparing the tuple returned by validate() after every chunk.
    """

    validator = Utf8Validator()

    # Each scenario starts from a freshly reset validator; its steps are
    # (expected validate() result, raw input for bytearray()) pairs that are
    # fed in order without resetting in between.
    scenarios = [
        [((True, True, 15, 15), "µ@ßöäüàá")],

        [((False, False, 0, 0), [0xF5])],

        ## the following 3 all fail on eating byte 7 (0xA0)
        [((True, True, 6, 6), [0x65, 0x64, 0x69, 0x74, 0x65, 0x64]),
         ((False, False, 1, 7), [0xED, 0xA0, 0x80])],

        [((True, True, 4, 4), [0x65, 0x64, 0x69, 0x74]),
         ((False, False, 3, 7), [0x65, 0x64, 0xED, 0xA0, 0x80])],

        [((True, False, 7, 7), [0x65, 0x64, 0x69, 0x74, 0x65, 0x64, 0xED]),
         ((False, False, 0, 7), [0xA0, 0x80])],
    ]

    for steps in scenarios:
        validator.reset()
        for expected, raw in steps:
            assert expected == validator.validate(bytearray(raw))
357-
358-
if __name__ == "__main__":
    # Run unit tests: the incremental test first, then the full test.
    for run_test in (test_utf8_incremental, test_utf8):
        run_test()
0 commit comments