Skip to content

Commit bae46f3

Browse files
Unicode Codepoint Escape Syntax
1 parent 0f81564 commit bae46f3

15 files changed

+2965
-2705
lines changed

Zend/zend_language_scanner.c

Lines changed: 2785 additions & 2702 deletions
Large diffs are not rendered by default.

Zend/zend_language_scanner.l

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,6 +941,85 @@ static void zend_scan_escape_string(zval *zendlval, char *str, int len, char quo
941941
*t++ = *s;
942942
}
943943
break;
944+
/* UTF-8 codepoint escape, format: /\\u\{\x+\}/ */
945+
case 'u':
946+
{
947+
/* cache where we started so we can parse after validating */
948+
char *start = s + 1;
949+
size_t len = 0;
950+
zend_bool valid = 1;
951+
unsigned long codepoint;
952+
953+
if (*start != '{') {
954+
/* we silently let this pass to avoid breaking code
955+
* with JSON in string literals (e.g. "\"\u202e\"")
956+
*/
957+
*t++ = '\\';
958+
*t++ = 'u';
959+
break;
960+
} else {
961+
/* on the other hand, invalid \u{blah} errors */
962+
s++;
963+
len++;
964+
s++;
965+
while (*s != '}') {
966+
if (!ZEND_IS_HEX(*s)) {
967+
valid = 0;
968+
break;
969+
} else {
970+
len++;
971+
}
972+
s++;
973+
}
974+
if (*s == '}') {
975+
valid = 1;
976+
len++;
977+
}
978+
}
979+
980+
/* \u{} is invalid */
981+
if (len <= 2) {
982+
valid = 0;
983+
}
984+
985+
if (!valid) {
986+
zend_error(E_COMPILE_ERROR, "Invalid UTF-8 codepoint escape sequence");
987+
}
988+
989+
errno = 0;
990+
codepoint = strtoul(start + 1, NULL, 16);
991+
992+
/* per RFC 3629, UTF-8 can only represent 21 bits */
993+
if (codepoint > 0x10FFFF || errno) {
994+
zend_error_noreturn(E_COMPILE_ERROR, "Invalid UTF-8 codepoint escape sequence: Codepoint too large");
995+
}
996+
997+
/* based on https://en.wikipedia.org/wiki/UTF-8#Sample_code */
998+
size_t byte_len = 0;
999+
if (codepoint < 0x80) {
1000+
byte_len = 1;
1001+
*t++ = codepoint;
1002+
} else if (codepoint <= 0x7FF) {
1003+
byte_len = 2;
1004+
*t++ = (codepoint >> 6) + 0xC0;
1005+
*t++ = (codepoint & 0x3F) + 0x80;
1006+
} else if (codepoint <= 0xFFFF) {
1007+
byte_len = 3;
1008+
*t++ = (codepoint >> 12) + 0xE0;
1009+
*t++ = ((codepoint >> 6) & 0x3F) + 0x80;
1010+
*t++ = (codepoint & 0x3F) + 0x80;
1011+
} else if (codepoint <= 0x10FFFF) {
1012+
byte_len = 4;
1013+
*t++ = (codepoint >> 18) + 0xF0;
1014+
*t++ = ((codepoint >> 12) & 0x3F) + 0x80;
1015+
*t++ = ((codepoint >> 6) & 0x3F) + 0x80;
1016+
*t++ = (codepoint & 0x3F) + 0x80;
1017+
}
1018+
1019+
Z_STRLEN_P(zendlval) -= 2; /* \u */
1020+
Z_STRLEN_P(zendlval) -= (len - byte_len);
1021+
}
1022+
break;
9441023
default:
9451024
/* check for an octal */
9461025
if (ZEND_IS_OCT(*s)) {

Zend/zend_language_scanner_defs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Generated by re2c 0.13.5 */
1+
/* Generated by re2c 0.13.7.5 */
22
#line 3 "Zend/zend_language_scanner_defs.h"
33

44
enum YYCONDTYPE {

ext/standard/tests/array/compact.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ $çity = "San Francisco";
77
$state = "CA";
88
$event = "SIGGRAPH";
99

10-
$location_vars = array("c\u0327ity", "state");
10+
$location_vars = array("c\\u0327ity", "state");
1111

1212
$result = compact("event", $location_vars);
1313
var_dump($result);
1 Byte
Binary file not shown.

ext/standard/tests/strings/ucwords_variation4.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ $str_array = array(
4747
//using special chars in sentence
4848
"t@@#$% %test ^test &test *test +test -test",
4949
"!test ~test `test` =test= @test@test.com",
50-
"/test/r\test\ucwords\t\y\y\u\3 \yy\ /uu/",
50+
"/test/r\test\\ucwords\t\y\y\\u\3 \yy\ /uu/",
5151

5252
//only special chars
5353
"!@#$%^&*()_+=-`~"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
--TEST--
2+
Valid Unicode escape sequences
3+
--FILE--
4+
<?php
5+
6+
var_dump("\u{61}"); // ASCII "a" - characters below U+007F just encode as ASCII, as it's UTF-8
7+
var_dump("\u{FF}"); // y with diaeresis
8+
var_dump("\u{ff}"); // case-insensitive
9+
var_dump("\u{2603}"); // Unicode snowman
10+
var_dump("\u{1F602}"); // FACE WITH TEARS OF JOY emoji
11+
var_dump("\u{0000001F602}"); // Leading zeroes permitted
12+
13+
14+
--EXPECT--
15+
string(1) "a"
16+
string(2) "ÿ"
17+
string(2) "ÿ"
18+
string(3) "☃"
19+
string(4) "😂"
20+
string(4) "😂"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--TEST--
2+
Invalid Unicode escape sequence: Empty
3+
--FILE--
4+
<?php
5+
6+
var_dump("\u{}");
7+
--EXPECTF--
8+
Fatal error: Invalid UTF-8 codepoint escape sequence in %s on line %d
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--TEST--
2+
Invalid Unicode escape sequence: Incomplete
3+
--FILE--
4+
<?php
5+
6+
var_dunp("\u{blah");
7+
--EXPECTF--
8+
Fatal error: Invalid UTF-8 codepoint escape sequence in %s on line %d
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--TEST--
2+
Invalid Unicode escape sequence: Large codepoint
3+
--FILE--
4+
<?php
5+
6+
var_dump("\u{110000}"); // U+10FFFF + 1
7+
--EXPECTF--
8+
Fatal error: Invalid UTF-8 codepoint escape sequence: Codepoint too large in %s on line %d

0 commit comments

Comments
 (0)