Skip to content

Commit c72e4e6

Browse files
committed
Issue #19619: Blacklist non-text codecs in method API
str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings.
1 parent 322f5ba commit c72e4e6

File tree

13 files changed

+285
-87
lines changed

13 files changed

+285
-87
lines changed

Include/codecs.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
9494
const char *errors
9595
);
9696

97+
#ifndef PY_LIMITED_API
98+
/* Text codec specific encoding and decoding API.
99+
100+
Checks the encoding against a list of codecs which do not
101+
implement a str<->bytes encoding before attempting the
102+
operation.
103+
104+
Please note that these APIs are internal and should not
105+
be used in Python C extensions.
106+
107+
*/
108+
109+
PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
110+
PyObject *object,
111+
const char *encoding,
112+
const char *errors
113+
);
114+
115+
PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
116+
PyObject *object,
117+
const char *encoding,
118+
const char *errors
119+
);
120+
#endif
121+
122+
123+
97124
/* --- Codec Lookup APIs --------------------------------------------------
98125
99126
All APIs return a codec object with incremented refcount and are

Lib/codecs.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,19 @@
7373
### Codec base classes (defining the API)
7474

7575
class CodecInfo(tuple):
76+
"""Codec details when looking up the codec registry"""
77+
78+
# Private API to allow Python 3.4 to blacklist the known non-Unicode
79+
# codecs in the standard library. A more general mechanism to
80+
# reliably distinguish text encodings from other codecs will hopefully
81+
# be defined for Python 3.5
82+
#
83+
# See http://bugs.python.org/issue19619
84+
_is_text_encoding = True # Assume codecs are text encodings by default
7685

7786
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
78-
incrementalencoder=None, incrementaldecoder=None, name=None):
87+
incrementalencoder=None, incrementaldecoder=None, name=None,
88+
*, _is_text_encoding=None):
7989
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
8090
self.name = name
8191
self.encode = encode
@@ -84,6 +94,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
8494
self.incrementaldecoder = incrementaldecoder
8595
self.streamwriter = streamwriter
8696
self.streamreader = streamreader
97+
if _is_text_encoding is not None:
98+
self._is_text_encoding = _is_text_encoding
8799
return self
88100

89101
def __repr__(self):

Lib/encodings/base64_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ def getregentry():
5252
incrementaldecoder=IncrementalDecoder,
5353
streamwriter=StreamWriter,
5454
streamreader=StreamReader,
55+
_is_text_encoding=False,
5556
)

Lib/encodings/bz2_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,5 @@ def getregentry():
7474
incrementaldecoder=IncrementalDecoder,
7575
streamwriter=StreamWriter,
7676
streamreader=StreamReader,
77+
_is_text_encoding=False,
7778
)

Lib/encodings/hex_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ def getregentry():
5252
incrementaldecoder=IncrementalDecoder,
5353
streamwriter=StreamWriter,
5454
streamreader=StreamReader,
55+
_is_text_encoding=False,
5556
)

Lib/encodings/quopri_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,5 @@ def getregentry():
5353
incrementaldecoder=IncrementalDecoder,
5454
streamwriter=StreamWriter,
5555
streamreader=StreamReader,
56+
_is_text_encoding=False,
5657
)

Lib/encodings/rot_13.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def getregentry():
4343
incrementaldecoder=IncrementalDecoder,
4444
streamwriter=StreamWriter,
4545
streamreader=StreamReader,
46+
_is_text_encoding=False,
4647
)
4748

4849
### Map

Lib/encodings/uu_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,5 @@ def getregentry():
9696
incrementaldecoder=IncrementalDecoder,
9797
streamreader=StreamReader,
9898
streamwriter=StreamWriter,
99+
_is_text_encoding=False,
99100
)

Lib/encodings/zlib_codec.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,5 @@ def getregentry():
7474
incrementaldecoder=IncrementalDecoder,
7575
streamreader=StreamReader,
7676
streamwriter=StreamWriter,
77+
_is_text_encoding=False,
7778
)

Lib/test/test_codecs.py

Lines changed: 108 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import sys
77
import unittest
88
import warnings
9+
import encodings
910

1011
from test import support
1112

@@ -2381,68 +2382,69 @@ def test_buffer_api_usage(self):
23812382
view_decoded = codecs.decode(view, encoding)
23822383
self.assertEqual(view_decoded, data)
23832384

2384-
def test_type_error_for_text_input(self):
2385+
def test_text_to_binary_blacklists_binary_transforms(self):
23852386
# Check binary -> binary codecs give a good error for str input
23862387
bad_input = "bad input type"
23872388
for encoding in bytes_transform_encodings:
23882389
with self.subTest(encoding=encoding):
2389-
msg = "^encoding with '{}' codec failed".format(encoding)
2390-
with self.assertRaisesRegex(TypeError, msg) as failure:
2390+
fmt = ( "{!r} is not a text encoding; "
2391+
"use codecs.encode\(\) to handle arbitrary codecs")
2392+
msg = fmt.format(encoding)
2393+
with self.assertRaisesRegex(LookupError, msg) as failure:
23912394
bad_input.encode(encoding)
2392-
self.assertTrue(isinstance(failure.exception.__cause__,
2393-
TypeError))
2395+
self.assertIsNone(failure.exception.__cause__)
2396+
2397+
def test_text_to_binary_blacklists_text_transforms(self):
2398+
# Check str.encode gives a good error message for str -> str codecs
2399+
msg = (r"^'rot_13' is not a text encoding; "
2400+
"use codecs.encode\(\) to handle arbitrary codecs")
2401+
with self.assertRaisesRegex(LookupError, msg):
2402+
"just an example message".encode("rot_13")
23942403

2395-
def test_type_error_for_binary_input(self):
2404+
def test_binary_to_text_blacklists_binary_transforms(self):
2405+
# Check bytes.decode and bytearray.decode give a good error
2406+
# message for binary -> binary codecs
2407+
data = b"encode first to ensure we meet any format restrictions"
2408+
for encoding in bytes_transform_encodings:
2409+
with self.subTest(encoding=encoding):
2410+
encoded_data = codecs.encode(data, encoding)
2411+
fmt = (r"{!r} is not a text encoding; "
2412+
"use codecs.decode\(\) to handle arbitrary codecs")
2413+
msg = fmt.format(encoding)
2414+
with self.assertRaisesRegex(LookupError, msg):
2415+
encoded_data.decode(encoding)
2416+
with self.assertRaisesRegex(LookupError, msg):
2417+
bytearray(encoded_data).decode(encoding)
2418+
2419+
def test_binary_to_text_blacklists_text_transforms(self):
23962420
# Check str -> str codec gives a good error for binary input
23972421
for bad_input in (b"immutable", bytearray(b"mutable")):
23982422
with self.subTest(bad_input=bad_input):
2399-
msg = "^decoding with 'rot_13' codec failed"
2400-
with self.assertRaisesRegex(AttributeError, msg) as failure:
2423+
msg = (r"^'rot_13' is not a text encoding; "
2424+
"use codecs.decode\(\) to handle arbitrary codecs")
2425+
with self.assertRaisesRegex(LookupError, msg) as failure:
24012426
bad_input.decode("rot_13")
2402-
self.assertTrue(isinstance(failure.exception.__cause__,
2403-
AttributeError))
2427+
self.assertIsNone(failure.exception.__cause__)
24042428

24052429
def test_custom_zlib_error_is_wrapped(self):
24062430
# Check zlib codec gives a good error for malformed input
24072431
msg = "^decoding with 'zlib_codec' codec failed"
24082432
with self.assertRaisesRegex(Exception, msg) as failure:
2409-
b"hello".decode("zlib_codec")
2410-
self.assertTrue(isinstance(failure.exception.__cause__,
2411-
type(failure.exception)))
2433+
codecs.decode(b"hello", "zlib_codec")
2434+
self.assertIsInstance(failure.exception.__cause__,
2435+
type(failure.exception))
24122436

24132437
def test_custom_hex_error_is_wrapped(self):
24142438
# Check hex codec gives a good error for malformed input
24152439
msg = "^decoding with 'hex_codec' codec failed"
24162440
with self.assertRaisesRegex(Exception, msg) as failure:
2417-
b"hello".decode("hex_codec")
2418-
self.assertTrue(isinstance(failure.exception.__cause__,
2419-
type(failure.exception)))
2441+
codecs.decode(b"hello", "hex_codec")
2442+
self.assertIsInstance(failure.exception.__cause__,
2443+
type(failure.exception))
24202444

24212445
# Unfortunately, the bz2 module throws OSError, which the codec
24222446
# machinery currently can't wrap :(
24232447

2424-
def test_bad_decoding_output_type(self):
2425-
# Check bytes.decode and bytearray.decode give a good error
2426-
# message for binary -> binary codecs
2427-
data = b"encode first to ensure we meet any format restrictions"
2428-
for encoding in bytes_transform_encodings:
2429-
with self.subTest(encoding=encoding):
2430-
encoded_data = codecs.encode(data, encoding)
2431-
fmt = ("'{}' decoder returned 'bytes' instead of 'str'; "
2432-
"use codecs.decode\(\) to decode to arbitrary types")
2433-
msg = fmt.format(encoding)
2434-
with self.assertRaisesRegex(TypeError, msg):
2435-
encoded_data.decode(encoding)
2436-
with self.assertRaisesRegex(TypeError, msg):
2437-
bytearray(encoded_data).decode(encoding)
2438-
2439-
def test_bad_encoding_output_type(self):
2440-
# Check str.encode gives a good error message for str -> str codecs
2441-
msg = ("'rot_13' encoder returned 'str' instead of 'bytes'; "
2442-
"use codecs.encode\(\) to encode to arbitrary types")
2443-
with self.assertRaisesRegex(TypeError, msg):
2444-
"just an example message".encode("rot_13")
2445-
24462448

24472449
# The codec system tries to wrap exceptions in order to ensure the error
24482450
# mentions the operation being performed and the codec involved. We
@@ -2466,27 +2468,44 @@ def setUp(self):
24662468
# case finishes by using the test case repr as the codec name
24672469
# The codecs module normalizes codec names, although this doesn't
24682470
# appear to be formally documented...
2469-
self.codec_name = repr(self).lower().replace(" ", "-")
2471+
# We also make sure we use a truly unique id for the custom codec
2472+
# to avoid issues with the codec cache when running these tests
2473+
# multiple times (e.g. when hunting for refleaks)
2474+
unique_id = repr(self) + str(id(self))
2475+
self.codec_name = encodings.normalize_encoding(unique_id).lower()
2476+
2477+
# We store the object to raise on the instance because of a bad
2478+
# interaction between the codec caching (which means we can't
2479+
# recreate the codec entry) and regrtest refleak hunting (which
2480+
# runs the same test instance multiple times). This means we
2481+
# need to ensure the codecs call back in to the instance to find
2482+
# out which exception to raise rather than binding them in a
2483+
# closure to an object that may change on the next run
2484+
self.obj_to_raise = RuntimeError
24702485

24712486
def tearDown(self):
24722487
_TEST_CODECS.pop(self.codec_name, None)
24732488

2474-
def set_codec(self, obj_to_raise):
2475-
def raise_obj(*args, **kwds):
2476-
raise obj_to_raise
2477-
codec_info = codecs.CodecInfo(raise_obj, raise_obj,
2489+
def set_codec(self, encode, decode):
2490+
codec_info = codecs.CodecInfo(encode, decode,
24782491
name=self.codec_name)
24792492
_TEST_CODECS[self.codec_name] = codec_info
24802493

24812494
@contextlib.contextmanager
24822495
def assertWrapped(self, operation, exc_type, msg):
2483-
full_msg = "{} with '{}' codec failed \({}: {}\)".format(
2496+
full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
24842497
operation, self.codec_name, exc_type.__name__, msg)
24852498
with self.assertRaisesRegex(exc_type, full_msg) as caught:
24862499
yield caught
2500+
self.assertIsInstance(caught.exception.__cause__, exc_type)
2501+
2502+
def raise_obj(self, *args, **kwds):
2503+
# Helper to dynamically change the object raised by a test codec
2504+
raise self.obj_to_raise
24872505

24882506
def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
2489-
self.set_codec(obj_to_raise)
2507+
self.obj_to_raise = obj_to_raise
2508+
self.set_codec(self.raise_obj, self.raise_obj)
24902509
with self.assertWrapped("encoding", exc_type, msg):
24912510
"str_input".encode(self.codec_name)
24922511
with self.assertWrapped("encoding", exc_type, msg):
@@ -2515,23 +2534,17 @@ class MyRuntimeError(RuntimeError):
25152534
pass
25162535
self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
25172536

2518-
@contextlib.contextmanager
2519-
def assertNotWrapped(self, operation, exc_type, msg_re, msg=None):
2520-
if msg is None:
2521-
msg = msg_re
2522-
with self.assertRaisesRegex(exc_type, msg) as caught:
2523-
yield caught
2524-
self.assertEqual(str(caught.exception), msg)
2525-
2526-
def check_not_wrapped(self, obj_to_raise, msg_re, msg=None):
2527-
self.set_codec(obj_to_raise)
2528-
with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
2537+
def check_not_wrapped(self, obj_to_raise, msg):
2538+
def raise_obj(*args, **kwds):
2539+
raise obj_to_raise
2540+
self.set_codec(raise_obj, raise_obj)
2541+
with self.assertRaisesRegex(RuntimeError, msg):
25292542
"str input".encode(self.codec_name)
2530-
with self.assertNotWrapped("encoding", RuntimeError, msg_re, msg):
2543+
with self.assertRaisesRegex(RuntimeError, msg):
25312544
codecs.encode("str input", self.codec_name)
2532-
with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
2545+
with self.assertRaisesRegex(RuntimeError, msg):
25332546
b"bytes input".decode(self.codec_name)
2534-
with self.assertNotWrapped("decoding", RuntimeError, msg_re, msg):
2547+
with self.assertRaisesRegex(RuntimeError, msg):
25352548
codecs.decode(b"bytes input", self.codec_name)
25362549

25372550
def test_init_override_is_not_wrapped(self):
@@ -2550,29 +2563,56 @@ def test_instance_attribute_is_not_wrapped(self):
25502563
msg = "This should NOT be wrapped"
25512564
exc = RuntimeError(msg)
25522565
exc.attr = 1
2553-
self.check_not_wrapped(exc, msg)
2566+
self.check_not_wrapped(exc, "^{}$".format(msg))
25542567

25552568
def test_non_str_arg_is_not_wrapped(self):
25562569
self.check_not_wrapped(RuntimeError(1), "1")
25572570

25582571
def test_multiple_args_is_not_wrapped(self):
2559-
msg_re = "\('a', 'b', 'c'\)"
2560-
msg = "('a', 'b', 'c')"
2561-
self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re, msg)
2572+
msg_re = r"^\('a', 'b', 'c'\)$"
2573+
self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
25622574

25632575
# http://bugs.python.org/issue19609
25642576
def test_codec_lookup_failure_not_wrapped(self):
2565-
msg = "unknown encoding: %s" % self.codec_name
2577+
msg = "^unknown encoding: {}$".format(self.codec_name)
25662578
# The initial codec lookup should not be wrapped
2567-
with self.assertNotWrapped("encoding", LookupError, msg):
2579+
with self.assertRaisesRegex(LookupError, msg):
25682580
"str input".encode(self.codec_name)
2569-
with self.assertNotWrapped("encoding", LookupError, msg):
2581+
with self.assertRaisesRegex(LookupError, msg):
25702582
codecs.encode("str input", self.codec_name)
2571-
with self.assertNotWrapped("decoding", LookupError, msg):
2583+
with self.assertRaisesRegex(LookupError, msg):
25722584
b"bytes input".decode(self.codec_name)
2573-
with self.assertNotWrapped("decoding", LookupError, msg):
2585+
with self.assertRaisesRegex(LookupError, msg):
25742586
codecs.decode(b"bytes input", self.codec_name)
25752587

2588+
def test_unflagged_non_text_codec_handling(self):
2589+
# The stdlib non-text codecs are now marked so they're
2590+
# pre-emptively skipped by the text model related methods.
2591+
# However, third party codecs won't be flagged, so we still make
2592+
# sure the case where an inappropriate output type is produced is
2593+
# handled appropriately
2594+
def encode_to_str(*args, **kwds):
2595+
return "not bytes!", 0
2596+
def decode_to_bytes(*args, **kwds):
2597+
return b"not str!", 0
2598+
self.set_codec(encode_to_str, decode_to_bytes)
2599+
# No input or output type checks on the codecs module functions
2600+
encoded = codecs.encode(None, self.codec_name)
2601+
self.assertEqual(encoded, "not bytes!")
2602+
decoded = codecs.decode(None, self.codec_name)
2603+
self.assertEqual(decoded, b"not str!")
2604+
# Text model methods should complain
2605+
fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
2606+
"use codecs.encode\(\) to encode to arbitrary types$")
2607+
msg = fmt.format(self.codec_name)
2608+
with self.assertRaisesRegex(TypeError, msg):
2609+
"str_input".encode(self.codec_name)
2610+
fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
2611+
"use codecs.decode\(\) to decode to arbitrary types$")
2612+
msg = fmt.format(self.codec_name)
2613+
with self.assertRaisesRegex(TypeError, msg):
2614+
b"bytes input".decode(self.codec_name)
2615+
25762616

25772617

25782618
@unittest.skipUnless(sys.platform == 'win32',

0 commit comments

Comments
 (0)