@@ -300,8 +300,46 @@ def test_bug1098990_b(self):
300300 self .assertEqual (reader .readline (), s5 )
301301 self .assertEqual (reader .readline (), "" )
302302
303+ ill_formed_sequence_replace = "\ufffd "
304+
305+ def test_lone_surrogates (self ):
306+ self .assertRaises (UnicodeEncodeError , "\ud800 " .encode , self .encoding )
307+ self .assertEqual ("[\uDC80 ]" .encode (self .encoding , "backslashreplace" ),
308+ "[\\ udc80]" .encode (self .encoding ))
309+ self .assertEqual ("[\uDC80 ]" .encode (self .encoding , "xmlcharrefreplace" ),
310+ "[�]" .encode (self .encoding ))
311+ self .assertEqual ("[\uDC80 ]" .encode (self .encoding , "ignore" ),
312+ "[]" .encode (self .encoding ))
313+ self .assertEqual ("[\uDC80 ]" .encode (self .encoding , "replace" ),
314+ "[?]" .encode (self .encoding ))
315+
316+ bom = "" .encode (self .encoding )
317+ for before , after in [("\U00010fff " , "A" ), ("[" , "]" ),
318+ ("A" , "\U00010fff " )]:
319+ before_sequence = before .encode (self .encoding )[len (bom ):]
320+ after_sequence = after .encode (self .encoding )[len (bom ):]
321+ test_string = before + "\uDC80 " + after
322+ test_sequence = (bom + before_sequence +
323+ self .ill_formed_sequence + after_sequence )
324+ self .assertRaises (UnicodeDecodeError , test_sequence .decode ,
325+ self .encoding )
326+ self .assertEqual (test_string .encode (self .encoding ,
327+ "surrogatepass" ),
328+ test_sequence )
329+ self .assertEqual (test_sequence .decode (self .encoding ,
330+ "surrogatepass" ),
331+ test_string )
332+ self .assertEqual (test_sequence .decode (self .encoding , "ignore" ),
333+ before + after )
334+ self .assertEqual (test_sequence .decode (self .encoding , "replace" ),
335+ before + self .ill_formed_sequence_replace + after )
336+
303337class UTF32Test (ReadTest , unittest .TestCase ):
304338 encoding = "utf-32"
339+ if sys .byteorder == 'little' :
340+ ill_formed_sequence = b"\x80 \xdc \x00 \x00 "
341+ else :
342+ ill_formed_sequence = b"\x00 \x00 \xdc \x80 "
305343
306344 spamle = (b'\xff \xfe \x00 \x00 '
307345 b's\x00 \x00 \x00 p\x00 \x00 \x00 a\x00 \x00 \x00 m\x00 \x00 \x00 '
@@ -393,6 +431,7 @@ def test_issue8941(self):
393431
394432class UTF32LETest (ReadTest , unittest .TestCase ):
395433 encoding = "utf-32-le"
434+ ill_formed_sequence = b"\x80 \xdc \x00 \x00 "
396435
397436 def test_partial (self ):
398437 self .check_partial (
@@ -437,6 +476,7 @@ def test_issue8941(self):
437476
438477class UTF32BETest (ReadTest , unittest .TestCase ):
439478 encoding = "utf-32-be"
479+ ill_formed_sequence = b"\x00 \x00 \xdc \x80 "
440480
441481 def test_partial (self ):
442482 self .check_partial (
@@ -482,6 +522,10 @@ def test_issue8941(self):
482522
483523class UTF16Test (ReadTest , unittest .TestCase ):
484524 encoding = "utf-16"
525+ if sys .byteorder == 'little' :
526+ ill_formed_sequence = b"\x80 \xdc "
527+ else :
528+ ill_formed_sequence = b"\xdc \x80 "
485529
486530 spamle = b'\xff \xfe s\x00 p\x00 a\x00 m\x00 s\x00 p\x00 a\x00 m\x00 '
487531 spambe = b'\xfe \xff \x00 s\x00 p\x00 a\x00 m\x00 s\x00 p\x00 a\x00 m'
@@ -562,6 +606,7 @@ def test_bug691291(self):
562606
563607class UTF16LETest (ReadTest , unittest .TestCase ):
564608 encoding = "utf-16-le"
609+ ill_formed_sequence = b"\x80 \xdc "
565610
566611 def test_partial (self ):
567612 self .check_partial (
@@ -605,6 +650,7 @@ def test_nonbmp(self):
605650
606651class UTF16BETest (ReadTest , unittest .TestCase ):
607652 encoding = "utf-16-be"
653+ ill_formed_sequence = b"\xdc \x80 "
608654
609655 def test_partial (self ):
610656 self .check_partial (
@@ -648,6 +694,8 @@ def test_nonbmp(self):
648694
649695class UTF8Test (ReadTest , unittest .TestCase ):
650696 encoding = "utf-8"
697+ ill_formed_sequence = b"\xed \xb2 \x80 "
698+ ill_formed_sequence_replace = "\ufffd " * 3
651699
652700 def test_partial (self ):
653701 self .check_partial (
@@ -677,18 +725,11 @@ def test_decoder_state(self):
677725 u , u .encode (self .encoding ))
678726
def test_lone_surrogates(self):
    """Run the inherited lone-surrogate checks, then one UTF-8-only check."""
    super().test_lone_surrogates()
    # "surrogateescape" is asserted for UTF-8 only; whether an equivalent
    # expectation is meaningful for UTF-16 and UTF-32 is an open question.
    self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
                     b'[\x80]')
693734 def test_surrogatepass_handler (self ):
694735 self .assertEqual ("abc\ud800 def" .encode ("utf-8" , "surrogatepass" ),
@@ -851,6 +892,9 @@ def test_nonbmp(self):
851892 self .assertEqual ('\ud801 \udca0 ' .encode (self .encoding ), b'+2AHcoA-' )
852893 self .assertEqual (b'+2AHcoA-' .decode (self .encoding ), '\U000104A0 ' )
853894
895+ test_lone_surrogates = None
896+
897+
854898class UTF16ExTest (unittest .TestCase ):
855899
856900 def test_errors (self ):
@@ -875,7 +919,7 @@ def test_bad_args(self):
875919 self .assertRaises (TypeError , codecs .readbuffer_encode )
876920 self .assertRaises (TypeError , codecs .readbuffer_encode , 42 )
877921
878- class UTF8SigTest (ReadTest , unittest .TestCase ):
922+ class UTF8SigTest (UTF8Test , unittest .TestCase ):
879923 encoding = "utf-8-sig"
880924
881925 def test_partial (self ):
0 commit comments