Skip to content

Commit 58cf607

Browse files
Issue python#12892: The utf-16* and utf-32* codecs now reject (lone) surrogates.
The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass error handler now works with the utf-16* and utf-32* codecs. Based on patches by Victor Stinner and Kang-Hao (Kenny) Lu.
1 parent a938bcf commit 58cf607

File tree

8 files changed

+637
-76
lines changed

8 files changed

+637
-76
lines changed

Doc/library/codecs.rst

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -365,18 +365,23 @@ and implemented by all standard Python codecs:
365365
| | in :pep:`383`. |
366366
+-------------------------+-----------------------------------------------+
367367

368-
In addition, the following error handlers are specific to a single codec:
368+
In addition, the following error handlers are specific to Unicode encoding
369+
schemes:
369370

370-
+-------------------+---------+-------------------------------------------+
371-
| Value | Codec | Meaning |
372-
+===================+=========+===========================================+
373-
|``'surrogatepass'``| utf-8 | Allow encoding and decoding of surrogate |
374-
| | | codes in UTF-8. |
375-
+-------------------+---------+-------------------------------------------+
371+
+-------------------+------------------------+-------------------------------------------+
372+
| Value | Codec | Meaning |
373+
+===================+========================+===========================================+
374+
|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate |
375+
| | utf-16-be, utf-16-le, | codes in all the Unicode encoding schemes.|
376+
| | utf-32-be, utf-32-le | |
377+
+-------------------+------------------------+-------------------------------------------+
376378

377379
.. versionadded:: 3.1
378380
The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
379381

382+
.. versionchanged:: 3.4
383+
The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
384+
380385
The set of allowed values can be extended via :meth:`register_error`.
381386

382387

@@ -1167,6 +1172,12 @@ particular, the following variants typically exist:
11671172
| utf_8_sig | | all languages |
11681173
+-----------------+--------------------------------+--------------------------------+
11691174

1175+
.. versionchanged:: 3.4
1176+
The utf-16\* and utf-32\* encoders no longer allow surrogate code points
1177+
(U+D800--U+DFFF) to be encoded. The utf-32\* decoders no longer decode
1178+
byte sequences that correspond to surrogate code points.
1179+
1180+
11701181
Python Specific Encodings
11711182
-------------------------
11721183

Doc/whatsnew/3.4.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are:
253253
``__main__.__file__`` when a script has been executed directly using
254254
a relative path (Contributed by Brett Cannon in :issue:`18416`).
255255

256+
* Now all the UTF-\* codecs (except UTF-7) reject surrogates during both
257+
encoding and decoding unless the ``surrogatepass`` error handler is used,
258+
with the exception of the UTF-16 decoder that accepts valid surrogate pairs,
259+
and the UTF-16 encoder that produces them while encoding non-BMP characters.
260+
Contributed by Victor Stinner, Kang-Hao (Kenny) Lu and Serhiy Storchaka in
261+
:issue:`12892`.
262+
256263

257264
New Modules
258265
===========

Lib/test/test_codecs.py

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,46 @@ def test_bug1098990_b(self):
300300
self.assertEqual(reader.readline(), s5)
301301
self.assertEqual(reader.readline(), "")
302302

303+
ill_formed_sequence_replace = "\ufffd"
304+
305+
def test_lone_surrogates(self):
306+
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
307+
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
308+
"[\\udc80]".encode(self.encoding))
309+
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
309+
"[&#56448;]".encode(self.encoding))
311+
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
312+
"[]".encode(self.encoding))
313+
self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
314+
"[?]".encode(self.encoding))
315+
316+
bom = "".encode(self.encoding)
317+
for before, after in [("\U00010fff", "A"), ("[", "]"),
318+
("A", "\U00010fff")]:
319+
before_sequence = before.encode(self.encoding)[len(bom):]
320+
after_sequence = after.encode(self.encoding)[len(bom):]
321+
test_string = before + "\uDC80" + after
322+
test_sequence = (bom + before_sequence +
323+
self.ill_formed_sequence + after_sequence)
324+
self.assertRaises(UnicodeDecodeError, test_sequence.decode,
325+
self.encoding)
326+
self.assertEqual(test_string.encode(self.encoding,
327+
"surrogatepass"),
328+
test_sequence)
329+
self.assertEqual(test_sequence.decode(self.encoding,
330+
"surrogatepass"),
331+
test_string)
332+
self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
333+
before + after)
334+
self.assertEqual(test_sequence.decode(self.encoding, "replace"),
335+
before + self.ill_formed_sequence_replace + after)
336+
303337
class UTF32Test(ReadTest, unittest.TestCase):
304338
encoding = "utf-32"
339+
if sys.byteorder == 'little':
340+
ill_formed_sequence = b"\x80\xdc\x00\x00"
341+
else:
342+
ill_formed_sequence = b"\x00\x00\xdc\x80"
305343

306344
spamle = (b'\xff\xfe\x00\x00'
307345
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
@@ -393,6 +431,7 @@ def test_issue8941(self):
393431

394432
class UTF32LETest(ReadTest, unittest.TestCase):
395433
encoding = "utf-32-le"
434+
ill_formed_sequence = b"\x80\xdc\x00\x00"
396435

397436
def test_partial(self):
398437
self.check_partial(
@@ -437,6 +476,7 @@ def test_issue8941(self):
437476

438477
class UTF32BETest(ReadTest, unittest.TestCase):
439478
encoding = "utf-32-be"
479+
ill_formed_sequence = b"\x00\x00\xdc\x80"
440480

441481
def test_partial(self):
442482
self.check_partial(
@@ -482,6 +522,10 @@ def test_issue8941(self):
482522

483523
class UTF16Test(ReadTest, unittest.TestCase):
484524
encoding = "utf-16"
525+
if sys.byteorder == 'little':
526+
ill_formed_sequence = b"\x80\xdc"
527+
else:
528+
ill_formed_sequence = b"\xdc\x80"
485529

486530
spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
487531
spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@@ -562,6 +606,7 @@ def test_bug691291(self):
562606

563607
class UTF16LETest(ReadTest, unittest.TestCase):
564608
encoding = "utf-16-le"
609+
ill_formed_sequence = b"\x80\xdc"
565610

566611
def test_partial(self):
567612
self.check_partial(
@@ -605,6 +650,7 @@ def test_nonbmp(self):
605650

606651
class UTF16BETest(ReadTest, unittest.TestCase):
607652
encoding = "utf-16-be"
653+
ill_formed_sequence = b"\xdc\x80"
608654

609655
def test_partial(self):
610656
self.check_partial(
@@ -648,6 +694,8 @@ def test_nonbmp(self):
648694

649695
class UTF8Test(ReadTest, unittest.TestCase):
650696
encoding = "utf-8"
697+
ill_formed_sequence = b"\xed\xb2\x80"
698+
ill_formed_sequence_replace = "\ufffd" * 3
651699

652700
def test_partial(self):
653701
self.check_partial(
@@ -677,18 +725,11 @@ def test_decoder_state(self):
677725
u, u.encode(self.encoding))
678726

679727
def test_lone_surrogates(self):
680-
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
681-
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
682-
self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
683-
b'[\\udc80]')
684-
self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
685-
b'[&#56448;]')
686-
self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
728+
super().test_lone_surrogates()
729+
# not sure if this is making sense for
730+
# UTF-16 and UTF-32
731+
self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
687732
b'[\x80]')
688-
self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
689-
b'[]')
690-
self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
691-
b'[?]')
692733

693734
def test_surrogatepass_handler(self):
694735
self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
@@ -851,6 +892,9 @@ def test_nonbmp(self):
851892
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
852893
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
853894

895+
test_lone_surrogates = None
896+
897+
854898
class UTF16ExTest(unittest.TestCase):
855899

856900
def test_errors(self):
@@ -875,7 +919,7 @@ def test_bad_args(self):
875919
self.assertRaises(TypeError, codecs.readbuffer_encode)
876920
self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
877921

878-
class UTF8SigTest(ReadTest, unittest.TestCase):
922+
class UTF8SigTest(UTF8Test, unittest.TestCase):
879923
encoding = "utf-8-sig"
880924

881925
def test_partial(self):

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,7 @@ Ned Jackson Lovely
783783
Jason Lowe
784784
Tony Lownds
785785
Ray Loyzaga
786+
Kang-Hao (Kenny) Lu
786787
Lukas Lueg
787788
Loren Luke
788789
Fredrik Lundh

Misc/NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ Projected release date: 2013-11-24
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
14+
points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode
15+
byte sequences that correspond to surrogate code points. The surrogatepass
16+
error handler now works with the utf-16* and utf-32* codecs. Based on
17+
patches by Victor Stinner and Kang-Hao (Kenny) Lu.
18+
1319
- Issue #17806: Added keyword-argument support for "tabsize" to
1420
str/bytes.expandtabs().
1521

0 commit comments

Comments
 (0)