Skip to content

Commit d3fe16f

Browse files
committed
Fix unicode_decode_utf8() perf regression
1 parent 994e4fa commit d3fe16f

File tree

1 file changed

+75
-33
lines changed

1 file changed

+75
-33
lines changed

Objects/unicodeobject.c

Lines changed: 75 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4749,35 +4749,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47494749

47504750

47514751
static int
4752-
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4753-
const char *s, Py_ssize_t size,
4754-
_Py_error_handler error_handler, const char *errors,
4755-
Py_ssize_t *consumed)
4752+
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
4753+
const char *starts, const char *s, const char *end,
4754+
_Py_error_handler error_handler,
4755+
const char *errors,
4756+
Py_ssize_t *consumed)
47564757
{
4757-
const char *starts = s;
4758-
const char *end = s + size;
4759-
4760-
// fast path: try ASCII string.
4761-
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4762-
return -1;
4763-
}
4764-
4765-
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4766-
if (writer->kind == PyUnicode_1BYTE_KIND
4767-
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4768-
{
4769-
Py_ssize_t decoded = ascii_decode(s, end, dest);
4770-
writer->pos += decoded;
4771-
4772-
if (decoded == size) {
4773-
if (consumed) {
4774-
*consumed = size;
4775-
}
4776-
return 0;
4777-
}
4778-
s += decoded;
4779-
}
4780-
47814758
Py_ssize_t startinpos, endinpos;
47824759
const char *errmsg = "";
47834760
PyObject *error_handler_obj = NULL;
@@ -4827,6 +4804,8 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
48274804
endinpos = startinpos + ch - 1;
48284805
break;
48294806
default:
4807+
// ch doesn't fit into kind, so change the buffer kind to write
4808+
// the character
48304809
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
48314810
goto onError;
48324811
continue;
@@ -4898,8 +4877,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48984877
Py_ssize_t *consumed)
48994878
{
49004879
if (size == 0) {
4901-
if (consumed)
4880+
if (consumed) {
49024881
*consumed = 0;
4882+
}
49034883
_Py_RETURN_UNICODE_EMPTY();
49044884
}
49054885

@@ -4911,19 +4891,81 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
49114891
return get_latin1_char((unsigned char)s[0]);
49124892
}
49134893

4894+
// fast path: try ASCII string.
4895+
const char *starts = s;
4896+
const char *end = s + size;
4897+
PyObject *u = PyUnicode_New(size, 127);
4898+
if (u == NULL) {
4899+
return NULL;
4900+
}
4901+
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4902+
if (decoded == size) {
4903+
if (consumed) {
4904+
*consumed = size;
4905+
}
4906+
return u;
4907+
}
4908+
s += decoded;
4909+
size -= decoded;
4910+
4911+
// Fall back to _PyUnicodeWriter once the ASCII fast path has failed.
49144912
_PyUnicodeWriter writer;
4915-
_PyUnicodeWriter_Init(&writer);
4913+
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4914+
writer.pos = decoded;
49164915

4917-
if (unicode_decode_utf8_writer(&writer, s, size,
4918-
error_handler, errors,
4919-
consumed) < 0) {
4916+
if (unicode_decode_utf8_impl(&writer, starts, s, end,
4917+
error_handler, errors,
4918+
consumed) < 0) {
49204919
_PyUnicodeWriter_Dealloc(&writer);
49214920
return NULL;
49224921
}
49234922
return _PyUnicodeWriter_Finish(&writer);
49244923
}
49254924

49264925

4926+
static int
4927+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4928+
const char *s, Py_ssize_t size,
4929+
_Py_error_handler error_handler, const char *errors,
4930+
Py_ssize_t *consumed)
4931+
{
4932+
if (size == 0) {
4933+
if (consumed) {
4934+
*consumed = 0;
4935+
}
4936+
return 0;
4937+
}
4938+
4939+
// fast path: try ASCII string.
4940+
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4941+
return -1;
4942+
}
4943+
4944+
const char *starts = s;
4945+
const char *end = s + size;
4946+
Py_ssize_t decoded = 0;
4947+
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4948+
if (writer->kind == PyUnicode_1BYTE_KIND
4949+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4950+
{
4951+
decoded = ascii_decode(s, end, dest);
4952+
writer->pos += decoded;
4953+
4954+
if (decoded == size) {
4955+
if (consumed) {
4956+
*consumed = size;
4957+
}
4958+
return 0;
4959+
}
4960+
s += decoded;
4961+
size -= decoded;
4962+
}
4963+
4964+
return unicode_decode_utf8_impl(writer, starts, s, end,
4965+
error_handler, errors, consumed);
4966+
}
4967+
4968+
49274969
PyObject *
49284970
PyUnicode_DecodeUTF8Stateful(const char *s,
49294971
Py_ssize_t size,

0 commit comments

Comments
 (0)