Skip to content

Commit 0d4df75

Browse files
Issue python#15027: The UTF-32 encoder is now 3x to 7x faster.
1 parent fdba838 commit 0d4df75

File tree

4 files changed

+133
-61
lines changed

4 files changed

+133
-61
lines changed

Doc/whatsnew/3.5.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,9 @@ The following performance enhancements have been added:
629629
versions 0--2 on typical data, and up to 5x in best cases).
630630
(Contributed by Serhiy Storchaka in :issue:`20416` and :issue:`23344`.)
631631

632+
* The UTF-32 encoder is now 3x to 7x faster. (Contributed by Serhiy Storchaka
633+
in :issue:`15027`.)
634+
632635

633636
Build and C API Changes
634637
=======================

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Release date: 2015-04-24
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #15027: The UTF-32 encoder is now 3x to 7x faster.
14+
1315
- Issue #20274: When calling a _sqlite.Connection, it now complains if passed
1416
any keyword arguments. Previously it silently ignored them.
1517

Objects/stringlib/codecs.h

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,93 @@ STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
718718
return len - (end - in + 1);
719719
#endif
720720
}
721+
722+
#if STRINGLIB_SIZEOF_CHAR == 1
723+
# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */
724+
#elif STRINGLIB_SIZEOF_CHAR == 2
725+
# define SWAB4(CH, tmp) (tmp = (CH), \
726+
((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
727+
/* high bytes are zero */
728+
#else
729+
# define SWAB4(CH, tmp) (tmp = (CH), \
730+
tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
731+
((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
732+
#endif
733+
Py_LOCAL_INLINE(Py_ssize_t)
734+
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
735+
Py_ssize_t len,
736+
PY_UINT32_T **outptr,
737+
int native_ordering)
738+
{
739+
PY_UINT32_T *out = *outptr;
740+
const STRINGLIB_CHAR *end = in + len;
741+
if (native_ordering) {
742+
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
743+
while (in < unrolled_end) {
744+
#if STRINGLIB_SIZEOF_CHAR > 1
745+
/* check if any character is a surrogate character */
746+
if (((in[0] ^ 0xd800) &
747+
(in[1] ^ 0xd800) &
748+
(in[2] ^ 0xd800) &
749+
(in[3] ^ 0xd800) & 0xf800) == 0)
750+
break;
751+
#endif
752+
out[0] = in[0];
753+
out[1] = in[1];
754+
out[2] = in[2];
755+
out[3] = in[3];
756+
in += 4; out += 4;
757+
}
758+
while (in < end) {
759+
Py_UCS4 ch;
760+
ch = *in++;
761+
#if STRINGLIB_SIZEOF_CHAR > 1
762+
if (Py_UNICODE_IS_SURROGATE(ch)) {
763+
/* reject surrogate characters (U+D800-U+DFFF) */
764+
goto fail;
765+
}
766+
#endif
767+
*out++ = ch;
768+
}
769+
} else {
770+
const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
771+
while (in < unrolled_end) {
772+
#if STRINGLIB_SIZEOF_CHAR > 1
773+
Py_UCS4 ch1, ch2, ch3, ch4;
774+
/* check if any character is a surrogate character */
775+
if (((in[0] ^ 0xd800) &
776+
(in[1] ^ 0xd800) &
777+
(in[2] ^ 0xd800) &
778+
(in[3] ^ 0xd800) & 0xf800) == 0)
779+
break;
780+
#endif
781+
out[0] = SWAB4(in[0], ch1);
782+
out[1] = SWAB4(in[1], ch2);
783+
out[2] = SWAB4(in[2], ch3);
784+
out[3] = SWAB4(in[3], ch4);
785+
in += 4; out += 4;
786+
}
787+
while (in < end) {
788+
Py_UCS4 ch = *in++;
789+
#if STRINGLIB_SIZEOF_CHAR > 1
790+
if (Py_UNICODE_IS_SURROGATE(ch)) {
791+
/* reject surrogate characters (U+D800-U+DFFF) */
792+
goto fail;
793+
}
794+
#endif
795+
*out++ = SWAB4(ch, ch);
796+
}
797+
}
798+
*outptr = out;
799+
return len;
800+
#if STRINGLIB_SIZEOF_CHAR > 1
801+
fail:
802+
*outptr = out;
803+
return len - (end - in + 1);
804+
#endif
805+
}
806+
#undef SWAB4
807+
721808
#endif
722809

723810
#endif /* STRINGLIB_IS_UNICODE */

Objects/unicodeobject.c

Lines changed: 41 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -5051,32 +5051,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
50515051
const char *errors,
50525052
int byteorder)
50535053
{
5054-
int kind;
5055-
void *data;
5054+
enum PyUnicode_Kind kind;
5055+
const void *data;
50565056
Py_ssize_t len;
50575057
PyObject *v;
5058-
unsigned char *p;
5059-
Py_ssize_t nsize, i;
5060-
/* Offsets from p for storing byte pairs in the right order. */
5058+
PY_UINT32_T *out;
50615059
#if PY_LITTLE_ENDIAN
5062-
int iorder[] = {0, 1, 2, 3};
5060+
int native_ordering = byteorder <= 0;
50635061
#else
5064-
int iorder[] = {3, 2, 1, 0};
5062+
int native_ordering = byteorder >= 0;
50655063
#endif
50665064
const char *encoding;
5065+
Py_ssize_t nsize, pos;
50675066
PyObject *errorHandler = NULL;
50685067
PyObject *exc = NULL;
50695068
PyObject *rep = NULL;
50705069

5071-
#define STORECHAR(CH) \
5072-
do { \
5073-
p[iorder[3]] = ((CH) >> 24) & 0xff; \
5074-
p[iorder[2]] = ((CH) >> 16) & 0xff; \
5075-
p[iorder[1]] = ((CH) >> 8) & 0xff; \
5076-
p[iorder[0]] = (CH) & 0xff; \
5077-
p += 4; \
5078-
} while(0)
5079-
50805070
if (!PyUnicode_Check(str)) {
50815071
PyErr_BadArgument();
50825072
return NULL;
@@ -5087,67 +5077,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
50875077
data = PyUnicode_DATA(str);
50885078
len = PyUnicode_GET_LENGTH(str);
50895079

5090-
nsize = len + (byteorder == 0);
5091-
if (nsize > PY_SSIZE_T_MAX / 4)
5080+
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
50925081
return PyErr_NoMemory();
5082+
nsize = len + (byteorder == 0);
50935083
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
50945084
if (v == NULL)
50955085
return NULL;
50965086

5097-
p = (unsigned char *)PyBytes_AS_STRING(v);
5087+
/* output buffer is 4-byte aligned */
5088+
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5089+
out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
50985090
if (byteorder == 0)
5099-
STORECHAR(0xFEFF);
5091+
*out++ = 0xFEFF;
51005092
if (len == 0)
5101-
return v;
5093+
goto done;
51025094

5103-
if (byteorder == -1) {
5104-
/* force LE */
5105-
iorder[0] = 0;
5106-
iorder[1] = 1;
5107-
iorder[2] = 2;
5108-
iorder[3] = 3;
5095+
if (byteorder == -1)
51095096
encoding = "utf-32-le";
5110-
}
5111-
else if (byteorder == 1) {
5112-
/* force BE */
5113-
iorder[0] = 3;
5114-
iorder[1] = 2;
5115-
iorder[2] = 1;
5116-
iorder[3] = 0;
5097+
else if (byteorder == 1)
51175098
encoding = "utf-32-be";
5118-
}
51195099
else
51205100
encoding = "utf-32";
51215101

51225102
if (kind == PyUnicode_1BYTE_KIND) {
5123-
for (i = 0; i < len; i++)
5124-
STORECHAR(PyUnicode_READ(kind, data, i));
5125-
return v;
5103+
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5104+
goto done;
51265105
}
51275106

5128-
for (i = 0; i < len;) {
5107+
pos = 0;
5108+
while (pos < len) {
51295109
Py_ssize_t repsize, moreunits;
5130-
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5131-
i++;
5132-
assert(ch <= MAX_UNICODE);
5133-
if (!Py_UNICODE_IS_SURROGATE(ch)) {
5134-
STORECHAR(ch);
5135-
continue;
5110+
5111+
if (kind == PyUnicode_2BYTE_KIND) {
5112+
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5113+
&out, native_ordering);
51365114
}
5115+
else {
5116+
assert(kind == PyUnicode_4BYTE_KIND);
5117+
pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5118+
&out, native_ordering);
5119+
}
5120+
if (pos == len)
5121+
break;
51375122

51385123
rep = unicode_encode_call_errorhandler(
51395124
errors, &errorHandler,
51405125
encoding, "surrogates not allowed",
5141-
str, &exc, i-1, i, &i);
5142-
5126+
str, &exc, pos, pos + 1, &pos);
51435127
if (!rep)
51445128
goto error;
51455129

51465130
if (PyBytes_Check(rep)) {
51475131
repsize = PyBytes_GET_SIZE(rep);
51485132
if (repsize & 3) {
51495133
raise_encode_exception(&exc, encoding,
5150-
str, i - 1, i,
5134+
str, pos - 1, pos,
51515135
"surrogates not allowed");
51525136
goto error;
51535137
}
@@ -5160,15 +5144,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51605144
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
51615145
if (!PyUnicode_IS_ASCII(rep)) {
51625146
raise_encode_exception(&exc, encoding,
5163-
str, i - 1, i,
5147+
str, pos - 1, pos,
51645148
"surrogates not allowed");
51655149
goto error;
51665150
}
51675151
}
51685152

51695153
/* four bytes are reserved for each surrogate */
51705154
if (moreunits > 1) {
5171-
Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5155+
Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
51725156
Py_ssize_t morebytes = 4 * (moreunits - 1);
51735157
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
51745158
/* integer overflow */
@@ -5177,20 +5161,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51775161
}
51785162
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
51795163
goto error;
5180-
p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5164+
out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
51815165
}
51825166

51835167
if (PyBytes_Check(rep)) {
5184-
Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5185-
p += repsize;
5168+
Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5169+
out += moreunits;
51865170
} else /* rep is unicode */ {
5187-
const Py_UCS1 *repdata;
51885171
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5189-
repdata = PyUnicode_1BYTE_DATA(rep);
5190-
while (repsize--) {
5191-
Py_UCS4 ch = *repdata++;
5192-
STORECHAR(ch);
5193-
}
5172+
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5173+
&out, native_ordering);
51945174
}
51955175

51965176
Py_CLEAR(rep);
@@ -5199,19 +5179,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
51995179
/* Cut back to size actually needed. This is necessary when, for example,
52005180
a string containing isolated surrogates is encoded and the 'ignore'
52015181
handler is used. */
5202-
nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5182+
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
52035183
if (nsize != PyBytes_GET_SIZE(v))
52045184
_PyBytes_Resize(&v, nsize);
52055185
Py_XDECREF(errorHandler);
52065186
Py_XDECREF(exc);
5187+
done:
52075188
return v;
52085189
error:
52095190
Py_XDECREF(rep);
52105191
Py_XDECREF(errorHandler);
52115192
Py_XDECREF(exc);
52125193
Py_XDECREF(v);
52135194
return NULL;
5214-
#undef STORECHAR
52155195
}
52165196

52175197
PyObject *

0 commit comments

Comments
 (0)