@@ -82,7 +82,7 @@ static CharSequence replaceAnchors(CharSequence input) {
8282 }
8383
8484 private static boolean isValidHexSubSequence (CharSequence input , int i , int len ) {
85- if (i + len > input .length ()) {
85+ if (len < 1 || i + len > input .length ()) {
8686 return false ;
8787 }
8888 CharSequence maybeHex = input .subSequence (i , i + len );
@@ -100,37 +100,96 @@ static CharSequence expandEscapeCodes(CharSequence input) {
100100
101101 if (parse .inLiteral ) {
102102 result .append (c );
103- } else if (parse .escaped && ESCAPE_CODES .containsKey (c )) {
103+ continue ;
104+ }
105+
106+ if (parse .escaped && ESCAPE_CODES .containsKey (c )) {
104107 result .setLength (result .length () - 1 );
105108 result .append (ESCAPE_CODES .get (c ));
106- } else if (parse .escaped && c == 'u' && isValidHexSubSequence (input , i + 1 , 4 )) {
109+ continue ;
110+ }
111+
112+ if (parse .escaped && c == 'u' && i + 1 < input .length () && input .charAt (i + 1 ) == '{' ) {
113+ // Find closing brace
114+ int braceEnd = -1 ;
115+ for (int j = i + 2 ; j < input .length () && j < i + 10 ; j ++) { // limit search
116+ if (input .charAt (j ) == '}' ) {
117+ braceEnd = j ;
118+ break ;
119+ }
120+ }
121+ if (braceEnd < i + 2 ) {
122+ throw new IllegalArgumentException ("Missing closing brace of \\ u{...} escape sequence" );
123+ }
124+ if (!isValidHexSubSequence (input , i + 2 , braceEnd - i - 2 )) {
125+ throw new IllegalArgumentException ("Invalid hex content in \\ u{...} escape sequence" );
126+ }
127+
128+ String hex = input .subSequence (i + 2 , braceEnd ).toString ();
129+ int codePoint = Integer .parseInt (hex , 16 );
130+ // Java is very lenient here: values > 0x10FFFF will throw, but the surrogate range
131+ // 0xD800-0xDFFF does not, even though those are not valid code points.
132+ if (codePoint >= 0xD800 && codePoint <= 0xDFFF ) {
133+ throw new IllegalArgumentException ("Not a valid Unicode code point in \\ u{...} escape sequence" );
134+ }
135+ // Modern Java would fail this at Character.toChars, but CI still includes Java 8, which does
136+ // not. Check explicitly so that test cases behave consistently across versions.
137+ if (codePoint > 0x10FFFF ) {
138+ throw new IllegalArgumentException ("Not a valid Unicode code point in \\ u{...} escape sequence" );
139+ }
140+ result .setLength (result .length () - 1 ); // remove the \
141+ result .append ('\\' );
142+ result .append (Character .toChars (codePoint ));
143+ i = braceEnd ; // skip to closing brace
144+ continue ;
145+ }
146+
147+ if (parse .escaped && c == 'u' ) {
148+ if (!isValidHexSubSequence (input , i + 1 , 4 )) {
149+ throw new IllegalArgumentException ("Invalid hex content in \\ uHHHH escape sequence" );
150+ }
107151 String hex = input .subSequence (i + 1 , i + 5 ).toString ();
108152 char firstChar = (char ) Integer .parseInt (hex , 16 );
153+ if (Character .isLowSurrogate (firstChar )) {
154+ // low surrogate can only follow high surrogate
155+ throw new IllegalArgumentException ("Invalid low surrogate in \\ uHHHH escape sequence" );
156+ }
109157 result .setLength (result .length () - 1 );
110158 // prepending \ treats it as a literal value. Yes this is probably the same char that
111159 // was removed with setLength(n-1), but explicit seems better than implicit.
112160 result .append ('\\' );
113161 result .append (firstChar );
114162 i += 4 ;
115163
164+ if (!Character .isHighSurrogate (firstChar )) {
165+ continue ;
166+ }
116167 // directly handle paired surrogate, otherwise the above would
117168 // inject a \ inside the pair.
118- if (Character .isHighSurrogate (firstChar ) &&
119- i + 2 < input .length () &&
120- input .charAt (i + 1 ) == '\\' &&
121- input .charAt (i + 2 ) == 'u' &&
122- isValidHexSubSequence (input , i + 3 , 4 )
169+ if (
170+ i + 2 >= input .length () ||
171+ input .charAt (i + 1 ) != '\\' ||
172+ input .charAt (i + 2 ) != 'u'
123173 ) {
124- String lowHex = input .subSequence (i + 3 , i + 7 ).toString ();
125- char secondChar = (char ) Integer .parseInt (lowHex , 16 );
126- if (Character .isLowSurrogate (secondChar )) {
127- result .append (secondChar );
128- i += 6 ; // Skip the \\uHHHH for the low surrogate
129- }
174+ throw new IllegalArgumentException (
175+ "High surrogate must be followed with low surrogate in \\ uHHHH escape sequence" );
130176 }
131- } else {
132- result .append (c );
177+ if (!isValidHexSubSequence (input , i + 3 , 4 )) {
178+ throw new IllegalArgumentException ("Invalid hex content in \\ uHHHH escape sequence" );
179+ }
180+ String lowHex = input .subSequence (i + 3 , i + 7 ).toString ();
181+ char secondChar = (char ) Integer .parseInt (lowHex , 16 );
182+ if (!Character .isLowSurrogate (secondChar )) {
183+ throw new IllegalArgumentException (
184+ "High surrogate must be followed with low surrogate in \\ uHHHH escape sequence" );
185+ }
186+ result .append (secondChar );
187+ i += 6 ; // Skip the \\uHHHH for the low surrogate
188+ continue ;
133189 }
190+
191+ // Default action
192+ result .append (c );
134193 }
135194 return result .toString ();
136195 }
0 commit comments