@@ -4749,35 +4749,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47494749
47504750
47514751static int
4752- unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4753- const char * s , Py_ssize_t size ,
4754- _Py_error_handler error_handler , const char * errors ,
4755- Py_ssize_t * consumed )
4752+ unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
4753+ const char * starts , const char * s , const char * end ,
4754+ _Py_error_handler error_handler ,
4755+ const char * errors ,
4756+ Py_ssize_t * consumed )
47564757{
4757- const char * starts = s ;
4758- const char * end = s + size ;
4759-
4760- // fast path: try ASCII string.
4761- if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4762- return -1 ;
4763- }
4764-
4765- Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4766- if (writer -> kind == PyUnicode_1BYTE_KIND
4767- && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4768- {
4769- Py_ssize_t decoded = ascii_decode (s , end , dest );
4770- writer -> pos += decoded ;
4771-
4772- if (decoded == size ) {
4773- if (consumed ) {
4774- * consumed = size ;
4775- }
4776- return 0 ;
4777- }
4778- s += decoded ;
4779- }
4780-
47814758 Py_ssize_t startinpos , endinpos ;
47824759 const char * errmsg = "" ;
47834760 PyObject * error_handler_obj = NULL ;
@@ -4827,6 +4804,8 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
48274804 endinpos = startinpos + ch - 1 ;
48284805 break ;
48294806 default :
4807+ // ch doesn't fit into kind, so change the buffer kind to write
4808+ // the character
48304809 if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
48314810 goto onError ;
48324811 continue ;
@@ -4898,8 +4877,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48984877 Py_ssize_t * consumed )
48994878{
49004879 if (size == 0 ) {
4901- if (consumed )
4880+ if (consumed ) {
49024881 * consumed = 0 ;
4882+ }
49034883 _Py_RETURN_UNICODE_EMPTY ();
49044884 }
49054885
@@ -4911,19 +4891,81 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
49114891 return get_latin1_char ((unsigned char )s [0 ]);
49124892 }
49134893
4894+ // fast path: try ASCII string.
4895+ const char * starts = s ;
4896+ const char * end = s + size ;
4897+ PyObject * u = PyUnicode_New (size , 127 );
4898+ if (u == NULL ) {
4899+ return NULL ;
4900+ }
4901+ Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4902+ if (decoded == size ) {
4903+ if (consumed ) {
4904+ * consumed = size ;
4905+ }
4906+ return u ;
4907+ }
4908+ s += decoded ;
4909+ size -= decoded ;
4910+
4911+ // Fall back to _PyUnicodeWriter once the ASCII fast path has failed.
49144912 _PyUnicodeWriter writer ;
4915- _PyUnicodeWriter_Init (& writer );
4913+ _PyUnicodeWriter_InitWithBuffer (& writer , u );
4914+ writer .pos = decoded ;
49164915
4917- if (unicode_decode_utf8_writer (& writer , s , size ,
4918- error_handler , errors ,
4919- consumed ) < 0 ) {
4916+ if (unicode_decode_utf8_impl (& writer , starts , s , end ,
4917+ error_handler , errors ,
4918+ consumed ) < 0 ) {
49204919 _PyUnicodeWriter_Dealloc (& writer );
49214920 return NULL ;
49224921 }
49234922 return _PyUnicodeWriter_Finish (& writer );
49244923}
49254924
49264925
4926+ static int
4927+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4928+ const char * s , Py_ssize_t size ,
4929+ _Py_error_handler error_handler , const char * errors ,
4930+ Py_ssize_t * consumed )
4931+ {
4932+ if (size == 0 ) {
4933+ if (consumed ) {
4934+ * consumed = 0 ;
4935+ }
4936+ return 0 ;
4937+ }
4938+
4939+ // fast path: try ASCII string.
4940+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4941+ return -1 ;
4942+ }
4943+
4944+ const char * starts = s ;
4945+ const char * end = s + size ;
4946+ Py_ssize_t decoded = 0 ;
4947+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4948+ if (writer -> kind == PyUnicode_1BYTE_KIND
4949+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4950+ {
4951+ decoded = ascii_decode (s , end , dest );
4952+ writer -> pos += decoded ;
4953+
4954+ if (decoded == size ) {
4955+ if (consumed ) {
4956+ * consumed = size ;
4957+ }
4958+ return 0 ;
4959+ }
4960+ s += decoded ;
4961+ size -= decoded ;
4962+ }
4963+
4964+ return unicode_decode_utf8_impl (writer , starts , s , end ,
4965+ error_handler , errors , consumed );
4966+ }
4967+
4968+
49274969PyObject *
49284970PyUnicode_DecodeUTF8Stateful (const char * s ,
49294971 Py_ssize_t size ,
0 commit comments