Skip to content

Commit 2ec8063

Browse files
committed
Modify PyBytes_DecodeEscape() to use the _PyBytesWriter API
* Don't overallocate by 400% when recoding is needed: only overallocate on demand using _PyBytesWriter.
* Use _PyLong_DigitValue to convert a hexadecimal digit to an int.
* Create the _PyBytes_DecodeEscapeRecode() subfunction.
1 parent 1285e5c commit 2ec8063

File tree

2 files changed

+75
-59
lines changed

2 files changed

+75
-59
lines changed

Include/longobject.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
6565
# error "void* different in size from int, long and long long"
6666
#endif /* SIZEOF_VOID_P */
6767

68-
/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
68+
/* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
69+
_PyBytes_DecodeEscapeRecode(), etc. */
6970
#ifndef Py_LIMITED_API
7071
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
7172
#endif

Objects/bytesobject.c

Lines changed: 73 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,61 +1068,85 @@ bytes_dealloc(PyObject *op)
10681068
the string is UTF-8 encoded and should be re-encoded in the
10691069
specified encoding. */
10701070

1071+
static char *
1072+
_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
1073+
const char *errors, const char *recode_encoding,
1074+
_PyBytesWriter *writer, char *p)
1075+
{
1076+
PyObject *u, *w;
1077+
const char* t;
1078+
1079+
t = *s;
1080+
/* Decode non-ASCII bytes as UTF-8. */
1081+
while (t < end && (*t & 0x80))
1082+
t++;
1083+
u = PyUnicode_DecodeUTF8(*s, t - *s, errors);
1084+
if (u == NULL)
1085+
return NULL;
1086+
1087+
/* Recode them in target encoding. */
1088+
w = PyUnicode_AsEncodedString(u, recode_encoding, errors);
1089+
Py_DECREF(u);
1090+
if (w == NULL)
1091+
return NULL;
1092+
assert(PyBytes_Check(w));
1093+
1094+
/* Append bytes to output buffer. */
1095+
writer->min_size--; /* subtract 1 preallocated byte */
1096+
p = _PyBytesWriter_WriteBytes(writer, p,
1097+
PyBytes_AS_STRING(w),
1098+
PyBytes_GET_SIZE(w));
1099+
Py_DECREF(w);
1100+
if (p == NULL)
1101+
return NULL;
1102+
1103+
*s = t;
1104+
return p;
1105+
}
1106+
10711107
PyObject *PyBytes_DecodeEscape(const char *s,
10721108
Py_ssize_t len,
10731109
const char *errors,
10741110
Py_ssize_t unicode,
10751111
const char *recode_encoding)
10761112
{
10771113
int c;
1078-
char *p, *buf;
1114+
char *p;
10791115
const char *end;
1080-
PyObject *v;
1081-
Py_ssize_t newlen = recode_encoding ? 4*len:len;
1082-
v = PyBytes_FromStringAndSize((char *)NULL, newlen);
1083-
if (v == NULL)
1116+
_PyBytesWriter writer;
1117+
1118+
_PyBytesWriter_Init(&writer);
1119+
1120+
p = _PyBytesWriter_Alloc(&writer, len);
1121+
if (p == NULL)
10841122
return NULL;
1085-
p = buf = PyBytes_AsString(v);
1123+
writer.overallocate = 1;
1124+
10861125
end = s + len;
10871126
while (s < end) {
10881127
if (*s != '\\') {
10891128
non_esc:
1090-
if (recode_encoding && (*s & 0x80)) {
1091-
PyObject *u, *w;
1092-
char *r;
1093-
const char* t;
1094-
Py_ssize_t rn;
1095-
t = s;
1096-
/* Decode non-ASCII bytes as UTF-8. */
1097-
while (t < end && (*t & 0x80)) t++;
1098-
u = PyUnicode_DecodeUTF8(s, t - s, errors);
1099-
if(!u) goto failed;
1100-
1101-
/* Recode them in target encoding. */
1102-
w = PyUnicode_AsEncodedString(
1103-
u, recode_encoding, errors);
1104-
Py_DECREF(u);
1105-
if (!w) goto failed;
1106-
1107-
/* Append bytes to output buffer. */
1108-
assert(PyBytes_Check(w));
1109-
r = PyBytes_AS_STRING(w);
1110-
rn = PyBytes_GET_SIZE(w);
1111-
Py_MEMCPY(p, r, rn);
1112-
p += rn;
1113-
Py_DECREF(w);
1114-
s = t;
1115-
} else {
1129+
if (!(recode_encoding && (*s & 0x80))) {
11161130
*p++ = *s++;
11171131
}
1132+
else {
1133+
/* non-ASCII character and need to recode */
1134+
p = _PyBytes_DecodeEscapeRecode(&s, end,
1135+
errors, recode_encoding,
1136+
&writer, p);
1137+
if (p == NULL)
1138+
goto failed;
1139+
}
11181140
continue;
11191141
}
1142+
11201143
s++;
1121-
if (s==end) {
1144+
if (s == end) {
11221145
PyErr_SetString(PyExc_ValueError,
11231146
"Trailing \\ in string");
11241147
goto failed;
11251148
}
1149+
11261150
switch (*s++) {
11271151
/* XXX This assumes ASCII! */
11281152
case '\n': break;
@@ -1147,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11471171
*p++ = c;
11481172
break;
11491173
case 'x':
1150-
if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) {
1151-
unsigned int x = 0;
1152-
c = Py_CHARMASK(*s);
1153-
s++;
1154-
if (Py_ISDIGIT(c))
1155-
x = c - '0';
1156-
else if (Py_ISLOWER(c))
1157-
x = 10 + c - 'a';
1158-
else
1159-
x = 10 + c - 'A';
1160-
x = x << 4;
1161-
c = Py_CHARMASK(*s);
1162-
s++;
1163-
if (Py_ISDIGIT(c))
1164-
x += c - '0';
1165-
else if (Py_ISLOWER(c))
1166-
x += 10 + c - 'a';
1167-
else
1168-
x += 10 + c - 'A';
1169-
*p++ = x;
1170-
break;
1174+
if (s+1 < end) {
1175+
int digit1, digit2;
1176+
digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
1177+
digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
1178+
if (digit1 < 16 && digit2 < 16) {
1179+
*p++ = (unsigned char)((digit1 << 4) + digit2);
1180+
s += 2;
1181+
break;
1182+
}
11711183
}
1184+
/* invalid hexadecimal digits */
1185+
11721186
if (!errors || strcmp(errors, "strict") == 0) {
11731187
PyErr_Format(PyExc_ValueError,
11741188
"invalid \\x escape at position %d",
@@ -1190,18 +1204,19 @@ PyObject *PyBytes_DecodeEscape(const char *s,
11901204
if (s < end && Py_ISXDIGIT(s[0]))
11911205
s++; /* and a hexdigit */
11921206
break;
1207+
11931208
default:
11941209
*p++ = '\\';
11951210
s--;
11961211
goto non_esc; /* an arbitrary number of unescaped
11971212
UTF-8 bytes may follow. */
11981213
}
11991214
}
1200-
if (p-buf < newlen)
1201-
_PyBytes_Resize(&v, p - buf);
1202-
return v;
1215+
1216+
return _PyBytesWriter_Finish(&writer, p);
1217+
12031218
failed:
1204-
Py_DECREF(v);
1219+
_PyBytesWriter_Dealloc(&writer);
12051220
return NULL;
12061221
}
12071222

0 commit comments

Comments
 (0)