@@ -35,7 +35,6 @@ of this software and associated documentation files (the "Software"), to deal
3535 */
3636@ SuppressWarnings ("boxing" )
3737public class XML {
38-
3938 /** The Character '&'. */
4039 public static final Character AMP = '&' ;
4140
@@ -62,6 +61,46 @@ public class XML {
6261
6362 /** The Character '/'. */
6463 public static final Character SLASH = '/' ;
64+
65+ /**
66+ * Creates an iterator for navigating Code Points in a string instead of
67+ * characters. Once Java7 support is dropped, this can be replaced with
68+ * <code>
69+ * string.codePoints()
70+ * </code>
71+ * which is available in Java8 and above.
72+ *
73+ * @see <a href=
74+ * "http://stackoverflow.com/a/21791059/6030888">http://stackoverflow.com/a/21791059/6030888</a>
75+ */
76+ private static Iterable <Integer > codePointIterator (final String string ) {
77+ return new Iterable <Integer >() {
78+ @ Override
79+ public Iterator <Integer > iterator () {
80+ return new Iterator <Integer >() {
81+ private int nextIndex = 0 ;
82+ private int length = string .length ();
83+
84+ @ Override
85+ public boolean hasNext () {
86+ return this .nextIndex < this .length ;
87+ }
88+
89+ @ Override
90+ public Integer next () {
91+ int result = string .codePointAt (this .nextIndex );
92+ this .nextIndex += Character .charCount (result );
93+ return result ;
94+ }
95+
96+ @ Override
97+ public void remove () {
98+ throw new UnsupportedOperationException ();
99+ }
100+ };
101+ }
102+ };
103+ }
65104
66105 /**
67106 * Replace special characters with XML escapes:
@@ -71,6 +110,7 @@ public class XML {
71110 * < <small>(less than)</small> is replaced by &lt;
72111 * > <small>(greater than)</small> is replaced by &gt;
73112 * " <small>(double quote)</small> is replaced by &quot;
113+ * ' <small>(single quote / apostrophe)</small> is replaced by &apos;
74114 * </pre>
75115 *
76116 * @param string
@@ -79,9 +119,8 @@ public class XML {
79119 */
80120 public static String escape (String string ) {
81121 StringBuilder sb = new StringBuilder (string .length ());
82- for (int i = 0 , length = string .length (); i < length ; i ++) {
83- char c = string .charAt (i );
84- switch (c ) {
122+ for (final int cp : codePointIterator (string )) {
123+ switch (cp ) {
85124 case '&' :
86125 sb .append ("&" );
87126 break ;
@@ -98,6 +137,93 @@ public static String escape(String string) {
98137 sb .append ("'" );
99138 break ;
100139 default :
140+ if (mustEscape (cp )) {
141+ sb .append ("&#x" );
142+ sb .append (Integer .toHexString (cp ));
143+ sb .append (";" );
144+ } else {
145+ sb .appendCodePoint (cp );
146+ }
147+ }
148+ }
149+ return sb .toString ();
150+ }
151+
152+ /**
153+ * @param cp code point to test
154+ * @return true if the code point is not valid for an XML
155+ */
156+ private static boolean mustEscape (int cp ) {
157+ /* Valid range from https://www.w3.org/TR/REC-xml/#charsets
158+ *
159+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
160+ *
161+ * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
162+ */
163+ // isISOControl is true when (cp >= 0 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F)
164+ // all ISO control characters are out of range except tabs and new lines
165+ return (Character .isISOControl (cp )
166+ && cp != 0x9
167+ && cp != 0xA
168+ && cp != 0xD
169+ ) || !(
170+ // valid the range of acceptable characters that aren't control
171+ (cp >= 0x20 && cp <= 0xD7FF )
172+ || (cp >= 0xE000 && cp <= 0xFFFD )
173+ || (cp >= 0x10000 && cp <= 0x10FFFF )
174+ )
175+ ;
176+ }
177+
178+ /**
179+ * Removes XML escapes from the string.
180+ *
181+ * @param string
182+ * string to remove escapes from
183+ * @return string with converted entities
184+ */
185+ public static String unescape (String string ) {
186+ StringBuilder sb = new StringBuilder (string .length ());
187+ for (int i = 0 , length = string .length (); i < length ; i ++) {
188+ char c = string .charAt (i );
189+ if (c == '&' ) {
190+ final int semic = string .indexOf (';' , i );
191+ if (semic > i ) {
192+ final String entity = string .substring (i + 1 , semic );
193+ if (entity .charAt (0 ) == '#' ) {
194+ int cp ;
195+ if (entity .charAt (1 ) == 'x' ) {
196+ // hex encoded unicode
197+ cp = Integer .parseInt (entity .substring (2 ), 16 );
198+ } else {
199+ // decimal encoded unicode
200+ cp = Integer .parseInt (entity .substring (1 ));
201+ }
202+ sb .appendCodePoint (cp );
203+ } else {
204+ if ("quot" .equalsIgnoreCase (entity )) {
205+ sb .append ('"' );
206+ } else if ("amp" .equalsIgnoreCase (entity )) {
207+ sb .append ('&' );
208+ } else if ("apos" .equalsIgnoreCase (entity )) {
209+ sb .append ('\'' );
210+ } else if ("lt" .equalsIgnoreCase (entity )) {
211+ sb .append ('<' );
212+ } else if ("gt" .equalsIgnoreCase (entity )) {
213+ sb .append ('>' );
214+ } else {
215+ sb .append ('&' ).append (entity ).append (';' );
216+ }
217+ }
218+ // skip past the entity we just parsed.
219+ i += entity .length () + 1 ;
220+ } else {
221+ // this shouldn't happen in most cases since the parser
222+ // errors on unclosed entities.
223+ sb .append (c );
224+ }
225+ } else {
226+ // not part of an entity
101227 sb .append (c );
102228 }
103229 }
@@ -227,7 +353,6 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
227353 if (token == null ) {
228354 token = x .nextToken ();
229355 }
230-
231356 // attribute = value
232357 if (token instanceof String ) {
233358 string = (String ) token ;
@@ -238,7 +363,7 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
238363 throw x .syntaxError ("Missing value" );
239364 }
240365 jsonobject .accumulate (string ,
241- keepStrings ? token : JSONObject . stringToValue ((String ) token ));
366+ keepStrings ? unescape (( String ) token ) : stringToValue ((String ) token ));
242367 token = null ;
243368 } else {
244369 jsonobject .accumulate (string , "" );
@@ -270,7 +395,7 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
270395 string = (String ) token ;
271396 if (string .length () > 0 ) {
272397 jsonobject .accumulate ("content" ,
273- keepStrings ? token : JSONObject . stringToValue (string ));
398+ keepStrings ? unescape ( string ) : stringToValue (string ));
274399 }
275400
276401 } else if (token == LT ) {
@@ -297,16 +422,18 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
297422 }
298423
299424 /**
300- * This method has been deprecated in favor of the
301- * {@link JSONObject.stringToValue(String)} method. Use it instead .
425+ * This method is the same as {@link JSONObject.stringToValue(String)}
426+ * except that this also tries to unescape String values .
302427 *
303- * @deprecated Use JSONObject#stringToValue(String) instead.
304428 * @param string String to convert
305429 * @return JSON value of this string or the string
306430 */
307- @ Deprecated
308431 public static Object stringToValue (String string ) {
309- return JSONObject .stringToValue (string );
432+ Object ret = JSONObject .stringToValue (string );
433+ if (ret instanceof String ){
434+ return unescape ((String )ret );
435+ }
436+ return ret ;
310437 }
311438
312439 /**
0 commit comments