Skip to content

Commit fb1db93

Browse files
author
John J. Aylward
committed
Changes encoding to better match the XML spec section 2.2
1 parent adb0478 commit fb1db93

File tree

1 file changed

+27
-1
lines changed

1 file changed

+27
-1
lines changed

XML.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ public static String escape(String string) {
137137
sb.append("'");
138138
break;
139139
default:
140-
if (Character.isISOControl(cp)) {
140+
if (mustEscape(cp)) {
141141
sb.append("&#x");
142142
sb.append(Integer.toHexString(cp));
143143
sb.append(";");
@@ -149,6 +149,32 @@ public static String escape(String string) {
149149
return sb.toString();
150150
}
151151

152+
/**
153+
* @param cp code point to test
154+
* @return true if the code point is not valid for an XML
155+
*/
156+
private static boolean mustEscape(int cp) {
157+
/* Valid range from https://www.w3.org/TR/REC-xml/#charsets
158+
*
159+
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
160+
*
161+
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
162+
*/
163+
// isISOControl is true when (cp >= 0 && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x9F)
164+
// all ISO control characters are out of range except tabs and new lines
165+
return (Character.isISOControl(cp)
166+
&& cp != 0x9
167+
&& cp != 0xA
168+
&& cp != 0xD
169+
) || !(
170+
// valid the range of acceptable characters that aren't control
171+
(cp >= 0x20 && cp <= 0xD7FF)
172+
|| (cp >= 0xE000 && cp <= 0xFFFD)
173+
|| (cp >= 0x10000 && cp <= 0x10FFFF)
174+
)
175+
;
176+
}
177+
152178
/**
153179
* Removes XML escapes from the string.
154180
*

0 commit comments

Comments
 (0)