Skip to content

Commit 413bb53

Browse files
authored
Merge pull request stleary#288 from johnjaylward/XmlEscape
Bug fixes for XML Encoding and Decoding
2 parents 237376e + 93ffca3 commit 413bb53

File tree

2 files changed

+141
-14
lines changed

2 files changed

+141
-14
lines changed

JSONML.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ private static Object parse(
175175
if (!(token instanceof String)) {
176176
throw x.syntaxError("Missing value");
177177
}
178-
newjo.accumulate(attribute, keepStrings ? token :JSONObject.stringToValue((String)token));
178+
newjo.accumulate(attribute, keepStrings ? XML.unescape((String)token) :XML.stringToValue((String)token));
179179
token = null;
180180
} else {
181181
newjo.accumulate(attribute, "");
@@ -226,7 +226,7 @@ private static Object parse(
226226
} else {
227227
if (ja != null) {
228228
ja.put(token instanceof String
229-
? keepStrings ? token :JSONObject.stringToValue((String)token)
229+
? keepStrings ? XML.unescape((String)token) :XML.stringToValue((String)token)
230230
: token);
231231
}
232232
}

XML.java

Lines changed: 139 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ of this software and associated documentation files (the "Software"), to deal
3535
*/
3636
@SuppressWarnings("boxing")
3737
public class XML {
38-
3938
/** The Character '&'. */
4039
public static final Character AMP = '&';
4140

@@ -62,6 +61,46 @@ public class XML {
6261

6362
/** The Character '/'. */
6463
public static final Character SLASH = '/';
64+
65+
/**
66+
* Creates an iterator for navigating Code Points in a string instead of
67+
* characters. Once Java7 support is dropped, this can be replaced with
68+
* <code>
69+
* string.codePoints()
70+
* </code>
71+
* which is available in Java8 and above.
72+
*
73+
* @see <a href=
74+
* "http://stackoverflow.com/a/21791059/6030888">http://stackoverflow.com/a/21791059/6030888</a>
75+
*/
76+
private static Iterable<Integer> codePointIterator(final String string) {
77+
return new Iterable<Integer>() {
78+
@Override
79+
public Iterator<Integer> iterator() {
80+
return new Iterator<Integer>() {
81+
private int nextIndex = 0;
82+
private int length = string.length();
83+
84+
@Override
85+
public boolean hasNext() {
86+
return this.nextIndex < this.length;
87+
}
88+
89+
@Override
90+
public Integer next() {
91+
int result = string.codePointAt(this.nextIndex);
92+
this.nextIndex += Character.charCount(result);
93+
return result;
94+
}
95+
96+
@Override
97+
public void remove() {
98+
throw new UnsupportedOperationException();
99+
}
100+
};
101+
}
102+
};
103+
}
65104

66105
/**
67106
* Replace special characters with XML escapes:
@@ -71,6 +110,7 @@ public class XML {
71110
* &lt; <small>(less than)</small> is replaced by &amp;lt;
72111
* &gt; <small>(greater than)</small> is replaced by &amp;gt;
73112
* &quot; <small>(double quote)</small> is replaced by &amp;quot;
113+
* &apos; <small>(single quote / apostrophe)</small> is replaced by &amp;apos;
74114
* </pre>
75115
*
76116
* @param string
@@ -79,9 +119,8 @@ public class XML {
79119
*/
80120
public static String escape(String string) {
81121
StringBuilder sb = new StringBuilder(string.length());
82-
for (int i = 0, length = string.length(); i < length; i++) {
83-
char c = string.charAt(i);
84-
switch (c) {
122+
for (final int cp : codePointIterator(string)) {
123+
switch (cp) {
85124
case '&':
86125
sb.append("&amp;");
87126
break;
@@ -98,6 +137,93 @@ public static String escape(String string) {
98137
sb.append("&apos;");
99138
break;
100139
default:
140+
if (mustEscape(cp)) {
141+
sb.append("&#x");
142+
sb.append(Integer.toHexString(cp));
143+
sb.append(";");
144+
} else {
145+
sb.appendCodePoint(cp);
146+
}
147+
}
148+
}
149+
return sb.toString();
150+
}
151+
152+
/**
153+
* @param cp code point to test
154+
* @return true if the code point is not valid for an XML
155+
*/
156+
private static boolean mustEscape(int cp) {
157+
/* Valid range from https://www.w3.org/TR/REC-xml/#charsets
158+
*
159+
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
160+
*
161+
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
162+
*/
163+
// isISOControl is true when (cp >= 0 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F)
164+
// all ISO control characters are out of range except tabs and new lines
165+
return (Character.isISOControl(cp)
166+
&& cp != 0x9
167+
&& cp != 0xA
168+
&& cp != 0xD
169+
) || !(
170+
// valid the range of acceptable characters that aren't control
171+
(cp >= 0x20 && cp <= 0xD7FF)
172+
|| (cp >= 0xE000 && cp <= 0xFFFD)
173+
|| (cp >= 0x10000 && cp <= 0x10FFFF)
174+
)
175+
;
176+
}
177+
178+
/**
179+
* Removes XML escapes from the string.
180+
*
181+
* @param string
182+
* string to remove escapes from
183+
* @return string with converted entities
184+
*/
185+
public static String unescape(String string) {
186+
StringBuilder sb = new StringBuilder(string.length());
187+
for (int i = 0, length = string.length(); i < length; i++) {
188+
char c = string.charAt(i);
189+
if (c == '&') {
190+
final int semic = string.indexOf(';', i);
191+
if (semic > i) {
192+
final String entity = string.substring(i + 1, semic);
193+
if (entity.charAt(0) == '#') {
194+
int cp;
195+
if (entity.charAt(1) == 'x') {
196+
// hex encoded unicode
197+
cp = Integer.parseInt(entity.substring(2), 16);
198+
} else {
199+
// decimal encoded unicode
200+
cp = Integer.parseInt(entity.substring(1));
201+
}
202+
sb.appendCodePoint(cp);
203+
} else {
204+
if ("quot".equalsIgnoreCase(entity)) {
205+
sb.append('"');
206+
} else if ("amp".equalsIgnoreCase(entity)) {
207+
sb.append('&');
208+
} else if ("apos".equalsIgnoreCase(entity)) {
209+
sb.append('\'');
210+
} else if ("lt".equalsIgnoreCase(entity)) {
211+
sb.append('<');
212+
} else if ("gt".equalsIgnoreCase(entity)) {
213+
sb.append('>');
214+
} else {
215+
sb.append('&').append(entity).append(';');
216+
}
217+
}
218+
// skip past the entity we just parsed.
219+
i += entity.length() + 1;
220+
} else {
221+
// this shouldn't happen in most cases since the parser
222+
// errors on unclosed entities.
223+
sb.append(c);
224+
}
225+
} else {
226+
// not part of an entity
101227
sb.append(c);
102228
}
103229
}
@@ -227,7 +353,6 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
227353
if (token == null) {
228354
token = x.nextToken();
229355
}
230-
231356
// attribute = value
232357
if (token instanceof String) {
233358
string = (String) token;
@@ -238,7 +363,7 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
238363
throw x.syntaxError("Missing value");
239364
}
240365
jsonobject.accumulate(string,
241-
keepStrings ? token : JSONObject.stringToValue((String) token));
366+
keepStrings ? unescape((String)token) : stringToValue((String) token));
242367
token = null;
243368
} else {
244369
jsonobject.accumulate(string, "");
@@ -270,7 +395,7 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
270395
string = (String) token;
271396
if (string.length() > 0) {
272397
jsonobject.accumulate("content",
273-
keepStrings ? token : JSONObject.stringToValue(string));
398+
keepStrings ? unescape(string) : stringToValue(string));
274399
}
275400

276401
} else if (token == LT) {
@@ -297,16 +422,18 @@ private static boolean parse(XMLTokener x, JSONObject context, String name, bool
297422
}
298423

299424
/**
300-
* This method has been deprecated in favor of the
301-
* {@link JSONObject.stringToValue(String)} method. Use it instead.
425+
* This method is the same as {@link JSONObject#stringToValue(String)}
426+
* except that this also tries to unescape String values.
302427
*
303-
* @deprecated Use JSONObject#stringToValue(String) instead.
304428
* @param string String to convert
305429
* @return JSON value of this string or the string
306430
*/
307-
@Deprecated
308431
public static Object stringToValue(String string) {
309-
return JSONObject.stringToValue(string);
432+
Object ret = JSONObject.stringToValue(string);
433+
if(ret instanceof String){
434+
return unescape((String)ret);
435+
}
436+
return ret;
310437
}
311438

312439
/**

0 commit comments

Comments
 (0)