@@ -173,8 +173,17 @@ def consumeNumberEntity(self, isHex):
173173
174174 charAsInt = entitiesWindows1252 [charAsInt - 128 ]
175175
176- # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
177- if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343 ):
176+ # Certain characters get replaced with U+FFFD
177+ if ((charAsInt <= 0x0008 ) or (charAsInt == 0x000B ) or (0x000E <= charAsInt <= 0x001F )
178+ or (0x007F <= charAsInt <= 0x009F )
179+ or (0xD800 <= charAsInt <= 0xDFFF ) or (0xFDD0 <= charAsInt <= 0xFDDF )
180+ or (charAsInt & 0xFFFE == 0xFFFE ) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
181+ or (0x10FFFF < charAsInt )):
182+ char = u"\uFFFD "
183+ self .tokenQueue .append ({"type" : "ParseError" , "data" :
184+ "illegal-codepoint-for-numeric-entity" ,
185+ "datavars" : {"charAsInt" : charAsInt }})
186+ else :
178187 try :
179188 # XXX We should have a separate function that does "int" to
180189 # "unicodestring" conversion since this doesn't always work
@@ -187,11 +196,6 @@ def consumeNumberEntity(self, isHex):
187196 self .tokenQueue .append ({"type" : "ParseError" , "data" :
188197 "cant-convert-numeric-entity" ,
189198 "datavars" : {"charAsInt" : charAsInt }})
190- else :
191- char = u"\uFFFD "
192- self .tokenQueue .append ({"type" : "ParseError" , "data" :
193- "illegal-codepoint-for-numeric-entity" ,
194- "datavars" : {"charAsInt" : charAsInt }})
195199
196200 # Discard the ; if present. Otherwise, put it back on the queue and
197201 # invoke parseError on parser.
0 commit comments