https-github-com-nzysoft
diff --git a/‎Doc/library/codecs.rst‎
Lines changed: 18 additions & 7 deletions b/‎Doc/library/codecs.rst‎
Lines changed: 18 additions & 7 deletions
diff --git a/‎Doc/whatsnew/3.4.rst‎
Lines changed: 7 additions & 0 deletions b/‎Doc/whatsnew/3.4.rst‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎Lib/test/test_codecs.py‎
Lines changed: 56 additions & 12 deletions b/‎Lib/test/test_codecs.py‎
Lines changed: 56 additions & 12 deletions
diff --git a/‎Misc/ACKS‎
Lines changed: 1 addition & 0 deletions b/‎Misc/ACKS‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Misc/NEWS‎
Lines changed: 6 additions & 0 deletions b/‎Misc/NEWS‎
Lines changed: 6 additions & 0 deletions
@@ -365,18 +365,23 @@ and implemented by all standard Python codecs:
 |                         | in :pep:`383`.                                |
 +-------------------------+-----------------------------------------------+
 
-In addition, the following error handlers are specific to a single codec:
+In addition, the following error handlers are specific to Unicode encoding
+schemes:
 
-+-------------------+---------+-------------------------------------------+
-| Value             | Codec   | Meaning                                   |
-+===================+=========+===========================================+
-|``'surrogatepass'``| utf-8   | Allow encoding and decoding of surrogate  |
-|                   |         | codes in UTF-8.                           |
-+-------------------+---------+-------------------------------------------+
++-------------------+------------------------+-------------------------------------------+
+| Value             | Codec                  | Meaning                                   |
++===================+========================+===========================================+
+|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate  |
+|                   | utf-16-be, utf-16-le,  | codes in all the Unicode encoding schemes.|
+|                   | utf-32-be, utf-32-le   |                                           |
++-------------------+------------------------+-------------------------------------------+
 
 .. versionadded:: 3.1
    The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
 
+.. versionchanged:: 3.4
+   The ``'surrogatepass'`` error handler now works with utf-16\* and utf-32\* codecs.
+
 The set of allowed values can be extended via :meth:`register_error`.
 
 
@@ -1167,6 +1172,12 @@ particular, the following variants typically exist:
 | utf_8_sig       |                                | all languages                  |
 +-----------------+--------------------------------+--------------------------------+
 
+.. versionchanged:: 3.4
+   The utf-16\* and utf-32\* encoders no longer allow surrogate code points
+   (U+D800--U+DFFF) to be encoded.  The utf-32\* decoders no longer decode
+   byte sequences that correspond to surrogate code points.
+
+
 Python Specific Encodings
 -------------------------
 
 
@@ -253,6 +253,13 @@ Some smaller changes made to the core Python language are:
   ``__main__.__file__`` when a script has been executed directly using
   a relative path (Contributed by Brett Cannon in :issue:`18416`).
 
+* All the UTF-\* codecs (except UTF-7) now reject surrogates during both
+  encoding and decoding unless the ``surrogatepass`` error handler is used,
+  with the exception of the UTF-16 decoder, which accepts valid surrogate
+  pairs, and the UTF-16 encoder, which produces them while encoding non-BMP
+  characters.  (Contributed by Victor Stinner, Kang-Hao (Kenny) Lu and Serhiy
+  Storchaka in :issue:`12892`.)
+
 
 New Modules
 ===========
 
@@ -300,8 +300,46 @@ def test_bug1098990_b(self):
         self.assertEqual(reader.readline(), s5)
         self.assertEqual(reader.readline(), "")
 
+    ill_formed_sequence_replace = "\ufffd"
+
+    def test_lone_surrogates(self):
+        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
+        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
+                         "[\\udc80]".encode(self.encoding))
+        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
+                         "[&#56448;]".encode(self.encoding))
+        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
+                         "[]".encode(self.encoding))
+        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
+                         "[?]".encode(self.encoding))
+
+        bom = "".encode(self.encoding)
+        for before, after in [("\U00010fff", "A"), ("[", "]"),
+                              ("A", "\U00010fff")]:
+            before_sequence = before.encode(self.encoding)[len(bom):]
+            after_sequence = after.encode(self.encoding)[len(bom):]
+            test_string = before + "\uDC80" + after
+            test_sequence = (bom + before_sequence +
+                             self.ill_formed_sequence + after_sequence)
+            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
+                              self.encoding)
+            self.assertEqual(test_string.encode(self.encoding,
+                                                "surrogatepass"),
+                             test_sequence)
+            self.assertEqual(test_sequence.decode(self.encoding,
+                                                  "surrogatepass"),
+                             test_string)
+            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
+                             before + after)
+            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
+                             before + self.ill_formed_sequence_replace + after)
+
 class UTF32Test(ReadTest, unittest.TestCase):
     encoding = "utf-32"
+    if sys.byteorder == 'little':
+        ill_formed_sequence = b"\x80\xdc\x00\x00"
+    else:
+        ill_formed_sequence = b"\x00\x00\xdc\x80"
 
     spamle = (b'\xff\xfe\x00\x00'
               b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
@@ -393,6 +431,7 @@ def test_issue8941(self):
 
 class UTF32LETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-le"
+    ill_formed_sequence = b"\x80\xdc\x00\x00"
 
     def test_partial(self):
         self.check_partial(
@@ -437,6 +476,7 @@ def test_issue8941(self):
 
 class UTF32BETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-be"
+    ill_formed_sequence = b"\x00\x00\xdc\x80"
 
     def test_partial(self):
         self.check_partial(
@@ -482,6 +522,10 @@ def test_issue8941(self):
 
 class UTF16Test(ReadTest, unittest.TestCase):
     encoding = "utf-16"
+    if sys.byteorder == 'little':
+        ill_formed_sequence = b"\x80\xdc"
+    else:
+        ill_formed_sequence = b"\xdc\x80"
 
     spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
     spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@@ -562,6 +606,7 @@ def test_bug691291(self):
 
 class UTF16LETest(ReadTest, unittest.TestCase):
     encoding = "utf-16-le"
+    ill_formed_sequence = b"\x80\xdc"
 
     def test_partial(self):
         self.check_partial(
@@ -605,6 +650,7 @@ def test_nonbmp(self):
 
 class UTF16BETest(ReadTest, unittest.TestCase):
     encoding = "utf-16-be"
+    ill_formed_sequence = b"\xdc\x80"
 
     def test_partial(self):
         self.check_partial(
@@ -648,6 +694,8 @@ def test_nonbmp(self):
 
 class UTF8Test(ReadTest, unittest.TestCase):
     encoding = "utf-8"
+    ill_formed_sequence = b"\xed\xb2\x80"
+    ill_formed_sequence_replace = "\ufffd" * 3
 
     def test_partial(self):
         self.check_partial(
@@ -677,18 +725,11 @@ def test_decoder_state(self):
                                          u, u.encode(self.encoding))
 
     def test_lone_surrogates(self):
-        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
-        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
-        self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
-                         b'[\\udc80]')
-        self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
-                         b'[&#56448;]')
-        self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
+        super().test_lone_surrogates()
+        # It is unclear whether this surrogateescape check makes sense for
+        # UTF-16 and UTF-32, so it is done for UTF-8 only.
+        self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
                          b'[\x80]')
-        self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
-                         b'[]')
-        self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
-                         b'[?]')
 
     def test_surrogatepass_handler(self):
         self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
@@ -851,6 +892,9 @@ def test_nonbmp(self):
         self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
         self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
 
+    test_lone_surrogates = None
+
+
 class UTF16ExTest(unittest.TestCase):
 
     def test_errors(self):
@@ -875,7 +919,7 @@ def test_bad_args(self):
         self.assertRaises(TypeError, codecs.readbuffer_encode)
         self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
 
-class UTF8SigTest(ReadTest, unittest.TestCase):
+class UTF8SigTest(UTF8Test, unittest.TestCase):
     encoding = "utf-8-sig"
 
     def test_partial(self):
 
@@ -783,6 +783,7 @@ Ned Jackson Lovely
 Jason Lowe
 Tony Lownds
 Ray Loyzaga
+Kang-Hao (Kenny) Lu
 Lukas Lueg
 Loren Luke
 Fredrik Lundh
 
@@ -10,6 +10,12 @@ Projected release date: 2013-11-24
 Core and Builtins
 -----------------
 
+- Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code
+  points (U+D800-U+DFFF) to be encoded.  The utf-32* decoders no longer decode
+  byte sequences that correspond to surrogate code points.  The surrogatepass
+  error handler now works with the utf-16* and utf-32* codecs.  Based on
+  patches by Victor Stinner and Kang-Hao (Kenny) Lu.
+
 - Issue #17806: Added keyword-argument support for "tabsize" to
   str/bytes.expandtabs().