Skip to content

Commit f5aba58

Browse files
committed
Issue python#27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup
1 parent 22d0698 commit f5aba58

File tree

8 files changed

+198
-51
lines changed

8 files changed

+198
-51
lines changed

Include/unicodeobject.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
16631663

16641664
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
16651665
const char *string, /* MBCS encoded string */
1666-
Py_ssize_t length, /* size of string */
1666+
Py_ssize_t length, /* size of string */
16671667
const char *errors /* error handling */
16681668
);
16691669

Lib/encodings/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"""#"
3030

3131
import codecs
32+
import sys
3233
from . import aliases
3334

3435
_cache = {}
@@ -151,3 +152,12 @@ def search_function(encoding):
151152

152153
# Register the search_function in the Python codec registry
153154
codecs.register(search_function)
155+
if sys.platform == 'win32':
    def _alias_mbcs(encoding):
        """Codec search function aliasing the preferred encoding to 'mbcs'.

        Returns the registry entry of the ``mbcs`` codec when *encoding*
        equals ``_bootlocale.getpreferredencoding(False)``.  In every other
        case it returns None, which tells the codec registry that this
        search function does not know the encoding.
        """
        try:
            import _bootlocale
            if encoding == _bootlocale.getpreferredencoding(False):
                import encodings.mbcs
                return encodings.mbcs.getregentry()
        except ImportError:
            # Imports may fail while the interpreter is shutting down.  A
            # search function must then report "not found" rather than let
            # the ImportError escape from an unrelated codec lookup.
            pass
        return None

    # Registered after search_function, so it is only consulted for names
    # that the regular encodings search could not resolve.
    codecs.register(_alias_mbcs)

Lib/encodings/aliases.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,7 @@
458458
'macturkish' : 'mac_turkish',
459459

460460
# mbcs codec
461+
'ansi' : 'mbcs',
461462
'dbcs' : 'mbcs',
462463

463464
# ptcp154 codec

Lib/encodings/oem.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
""" Python 'oem' Codec for Windows

Thin wrapper exposing the ``oem_encode`` / ``oem_decode`` functions of the
``codecs`` module through the standard codec interfaces (stateless,
incremental and stream based).

"""
# Import them explicitly to cause an ImportError
# on non-Windows systems
from codecs import oem_encode, oem_decode
# for IncrementalDecoder, IncrementalEncoder, ...
import codecs

### Codec APIs

# Stateless encoding is exactly the low-level function.
encode = oem_encode


def decode(input, errors='strict'):
    """Decode *input* in one shot, always treating it as the final chunk."""
    is_final = True
    return oem_decode(input, errors, is_final)


class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each call encodes its input independently."""

    def encode(self, input, final=False):
        # oem_encode returns (encoded_bytes, length); only the bytes are
        # part of the incremental encoder contract.
        encoded, _length = oem_encode(input, self.errors)
        return encoded


class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder built on the buffering base class."""

    # Invoked by the base class as _buffer_decode(data, errors, final).
    _buffer_decode = oem_decode


class StreamWriter(codecs.StreamWriter):
    """Stream writer that encodes with oem_encode."""

    encode = oem_encode


class StreamReader(codecs.StreamReader):
    """Stream reader that decodes with oem_decode."""

    decode = oem_decode


### encodings module API

def getregentry():
    """Return the CodecInfo describing the 'oem' codec."""
    entry = codecs.CodecInfo(
        name='oem',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
    return entry

Lib/site.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -423,21 +423,6 @@ def register_readline():
423423

424424
sys.__interactivehook__ = register_readline
425425

426-
def aliasmbcs():
427-
"""On Windows, some default encodings are not provided by Python,
428-
while they are always available as "mbcs" in each locale. Make
429-
them usable by aliasing to "mbcs" in such a case."""
430-
if sys.platform == 'win32':
431-
import _bootlocale, codecs
432-
enc = _bootlocale.getpreferredencoding(False)
433-
if enc.startswith('cp'): # "cp***" ?
434-
try:
435-
codecs.lookup(enc)
436-
except LookupError:
437-
import encodings
438-
encodings._cache[enc] = encodings._unknown
439-
encodings.aliases.aliases[enc] = 'mbcs'
440-
441426
CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$'
442427

443428
def venv(known_paths):
@@ -560,7 +545,6 @@ def main():
560545
setcopyright()
561546
sethelper()
562547
enablerlcompleter()
563-
aliasmbcs()
564548
execsitecustomize()
565549
if ENABLE_USER_SITE:
566550
execusercustomize()

Lib/test/test_codecs.py

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@
88

99
from test import support
1010

11-
if sys.platform == 'win32':
12-
VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
13-
else:
14-
VISTA_OR_LATER = False
15-
1611
try:
1712
import ctypes
1813
except ImportError:
@@ -841,18 +836,13 @@ def test_encode(self):
841836
('abc', 'strict', b'abc'),
842837
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
843838
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
839+
('\udc80', 'strict', None),
840+
('\udc80', 'ignore', b''),
841+
('\udc80', 'replace', b'?'),
842+
('\udc80', 'backslashreplace', b'\\udc80'),
843+
('\udc80', 'namereplace', b'\\udc80'),
844+
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
844845
]
845-
if VISTA_OR_LATER:
846-
tests.extend((
847-
('\udc80', 'strict', None),
848-
('\udc80', 'ignore', b''),
849-
('\udc80', 'replace', b'?'),
850-
('\udc80', 'backslashreplace', b'\\udc80'),
851-
('\udc80', 'namereplace', b'\\udc80'),
852-
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
853-
))
854-
else:
855-
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
856846
for text, errors, expected in tests:
857847
if expected is not None:
858848
try:
@@ -879,17 +869,10 @@ def test_decode(self):
879869
(b'[\xff]', 'ignore', '[]'),
880870
(b'[\xff]', 'replace', '[\ufffd]'),
881871
(b'[\xff]', 'surrogateescape', '[\udcff]'),
872+
(b'[\xed\xb2\x80]', 'strict', None),
873+
(b'[\xed\xb2\x80]', 'ignore', '[]'),
874+
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
882875
]
883-
if VISTA_OR_LATER:
884-
tests.extend((
885-
(b'[\xed\xb2\x80]', 'strict', None),
886-
(b'[\xed\xb2\x80]', 'ignore', '[]'),
887-
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
888-
))
889-
else:
890-
tests.extend((
891-
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
892-
))
893876
for raw, errors, expected in tests:
894877
if expected is not None:
895878
try:
@@ -904,7 +887,6 @@ def test_decode(self):
904887
self.assertRaises(UnicodeDecodeError,
905888
raw.decode, 'cp65001', errors)
906889

907-
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
908890
def test_lone_surrogates(self):
909891
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
910892
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
@@ -921,7 +903,6 @@ def test_lone_surrogates(self):
921903
self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
922904
b'[?]')
923905

924-
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
925906
def test_surrogatepass_handler(self):
926907
self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
927908
b"abc\xed\xa0\x80def")
@@ -1951,6 +1932,8 @@ def test_basic(self):
19511932

19521933
if hasattr(codecs, "mbcs_encode"):
19531934
all_unicode_encodings.append("mbcs")
1935+
if hasattr(codecs, "oem_encode"):
1936+
all_unicode_encodings.append("oem")
19541937

19551938
# The following encoding is not tested, because it's not supposed
19561939
# to work:
@@ -3119,11 +3102,10 @@ def test_multibyte_encoding(self):
31193102
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
31203103
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
31213104
))
3122-
if VISTA_OR_LATER:
3123-
self.check_encode(self.CP_UTF8, (
3124-
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
3125-
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
3126-
))
3105+
self.check_encode(self.CP_UTF8, (
3106+
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
3107+
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
3108+
))
31273109

31283110
def test_incremental(self):
31293111
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
@@ -3144,6 +3126,20 @@ def test_incremental(self):
31443126
False)
31453127
self.assertEqual(decoded, ('abc', 3))
31463128

def test_mbcs_alias(self):
    """Looking up the 'default' code page falls back to the mbcs codec."""
    # Report a preferred encoding for which no more specific codec is
    # available, so the lookup has to be answered with mbcs.
    import _bootlocale
    fake_codepage = 'cp123'
    saved_getpreferredencoding = _bootlocale.getpreferredencoding
    _bootlocale.getpreferredencoding = lambda *a: fake_codepage
    try:
        found = codecs.lookup('cp123')
        self.assertEqual(found.name, 'mbcs')
    finally:
        # Always restore the real function, even if the assertion fails.
        _bootlocale.getpreferredencoding = saved_getpreferredencoding
3142+
31473143

31483144
class ASCIITest(unittest.TestCase):
31493145
def test_encode(self):

Modules/_codecsmodule.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,25 @@ _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
625625
return codec_tuple(decoded, consumed);
626626
}
627627

628+
/*[clinic input]
629+
_codecs.oem_decode
630+
data: Py_buffer
631+
errors: str(accept={str, NoneType}) = NULL
632+
final: int(c_default="0") = False
633+
/
634+
[clinic start generated code]*/
635+
636+
static PyObject *
637+
_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
638+
const char *errors, int final)
639+
/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
640+
{
641+
Py_ssize_t consumed = data->len;
642+
PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
643+
data->buf, data->len, errors, final ? NULL : &consumed);
644+
return codec_tuple(decoded, consumed);
645+
}
646+
628647
/*[clinic input]
629648
_codecs.code_page_decode
630649
codepage: int
@@ -970,6 +989,21 @@ _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
970989
PyUnicode_GET_LENGTH(str));
971990
}
972991

992+
/*[clinic input]
993+
_codecs.oem_encode
994+
str: unicode
995+
errors: str(accept={str, NoneType}) = NULL
996+
/
997+
[clinic start generated code]*/
998+
999+
static PyObject *
1000+
_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
1001+
/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
1002+
{
1003+
return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
1004+
PyUnicode_GET_LENGTH(str));
1005+
}
1006+
9731007
/*[clinic input]
9741008
_codecs.code_page_encode
9751009
code_page: int
@@ -1075,6 +1109,8 @@ static PyMethodDef _codecs_functions[] = {
10751109
_CODECS_READBUFFER_ENCODE_METHODDEF
10761110
_CODECS_MBCS_ENCODE_METHODDEF
10771111
_CODECS_MBCS_DECODE_METHODDEF
1112+
_CODECS_OEM_ENCODE_METHODDEF
1113+
_CODECS_OEM_DECODE_METHODDEF
10781114
_CODECS_CODE_PAGE_ENCODE_METHODDEF
10791115
_CODECS_CODE_PAGE_DECODE_METHODDEF
10801116
_CODECS_REGISTER_ERROR_METHODDEF

Modules/clinic/_codecsmodule.c.h

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,45 @@ _codecs_mbcs_decode(PyObject *module, PyObject *args)
805805

806806
#if defined(HAVE_MBCS)
807807

808+
PyDoc_STRVAR(_codecs_oem_decode__doc__,
809+
"oem_decode($module, data, errors=None, final=False, /)\n"
810+
"--\n"
811+
"\n");
812+
813+
#define _CODECS_OEM_DECODE_METHODDEF \
814+
{"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__},
815+
816+
static PyObject *
817+
_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
818+
const char *errors, int final);
819+
820+
static PyObject *
821+
_codecs_oem_decode(PyObject *module, PyObject *args)
822+
{
823+
PyObject *return_value = NULL;
824+
Py_buffer data = {NULL, NULL};
825+
const char *errors = NULL;
826+
int final = 0;
827+
828+
if (!PyArg_ParseTuple(args, "y*|zi:oem_decode",
829+
&data, &errors, &final)) {
830+
goto exit;
831+
}
832+
return_value = _codecs_oem_decode_impl(module, &data, errors, final);
833+
834+
exit:
835+
/* Cleanup for data */
836+
if (data.obj) {
837+
PyBuffer_Release(&data);
838+
}
839+
840+
return return_value;
841+
}
842+
843+
#endif /* defined(HAVE_MBCS) */
844+
845+
#if defined(HAVE_MBCS)
846+
808847
PyDoc_STRVAR(_codecs_code_page_decode__doc__,
809848
"code_page_decode($module, codepage, data, errors=None, final=False, /)\n"
810849
"--\n"
@@ -1346,6 +1385,38 @@ _codecs_mbcs_encode(PyObject *module, PyObject *args)
13461385

13471386
#if defined(HAVE_MBCS)
13481387

1388+
PyDoc_STRVAR(_codecs_oem_encode__doc__,
1389+
"oem_encode($module, str, errors=None, /)\n"
1390+
"--\n"
1391+
"\n");
1392+
1393+
#define _CODECS_OEM_ENCODE_METHODDEF \
1394+
{"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__},
1395+
1396+
static PyObject *
1397+
_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors);
1398+
1399+
static PyObject *
1400+
_codecs_oem_encode(PyObject *module, PyObject *args)
1401+
{
1402+
PyObject *return_value = NULL;
1403+
PyObject *str;
1404+
const char *errors = NULL;
1405+
1406+
if (!PyArg_ParseTuple(args, "U|z:oem_encode",
1407+
&str, &errors)) {
1408+
goto exit;
1409+
}
1410+
return_value = _codecs_oem_encode_impl(module, str, errors);
1411+
1412+
exit:
1413+
return return_value;
1414+
}
1415+
1416+
#endif /* defined(HAVE_MBCS) */
1417+
1418+
#if defined(HAVE_MBCS)
1419+
13491420
PyDoc_STRVAR(_codecs_code_page_encode__doc__,
13501421
"code_page_encode($module, code_page, str, errors=None, /)\n"
13511422
"--\n"
@@ -1446,6 +1517,10 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
14461517
#define _CODECS_MBCS_DECODE_METHODDEF
14471518
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
14481519

1520+
#ifndef _CODECS_OEM_DECODE_METHODDEF
1521+
#define _CODECS_OEM_DECODE_METHODDEF
1522+
#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */
1523+
14491524
#ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF
14501525
#define _CODECS_CODE_PAGE_DECODE_METHODDEF
14511526
#endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */
@@ -1454,7 +1529,11 @@ _codecs_lookup_error(PyObject *module, PyObject *arg)
14541529
#define _CODECS_MBCS_ENCODE_METHODDEF
14551530
#endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */
14561531

1532+
#ifndef _CODECS_OEM_ENCODE_METHODDEF
1533+
#define _CODECS_OEM_ENCODE_METHODDEF
1534+
#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */
1535+
14571536
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
14581537
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
14591538
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
1460-
/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/
1539+
/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/

0 commit comments

Comments
 (0)