@@ -38,7 +38,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3838 # List of where new lines occur
3939 self .newLines = [0 ]
4040
41- self .charEncoding = encoding
41+ self .charEncoding = ( encoding , "certian" )
4242
4343 # Raw Stream - for unicode objects this will encode to utf-8 and set
4444 # self.charEncoding as appropriate
@@ -54,11 +54,11 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5454 self .defaultEncoding = "windows-1252"
5555
5656 #Detect encoding iff no explicit "transport level" encoding is supplied
57- if self .charEncoding is None or not isValidEncoding (self .charEncoding ):
57+ if self .charEncoding [ 0 ] is None or not isValidEncoding (self .charEncoding [ 0 ] ):
5858 self .charEncoding = self .detectEncoding (parseMeta , chardet )
5959
60- self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream ,
61- 'replace' )
60+ self .dataStream = codecs .getreader (self .charEncoding [ 0 ] )(self .rawStream ,
61+ ' replace' )
6262
6363 self .queue = deque ([])
6464 self .readChars = []
@@ -92,12 +92,15 @@ def detectEncoding(self, parseMeta=True, chardet=True):
9292 #First look for a BOM
9393 #This will also read past the BOM if present
9494 encoding = self .detectBOM ()
95+ confidence = "certain"
9596 #If there is no BOM need to look for meta elements with encoding
9697 #information
9798 if encoding is None and parseMeta :
9899 encoding = self .detectEncodingMeta ()
100+ confidence = "tentative"
99101 #Guess with chardet, if available
100102 if encoding is None and chardet :
103+ confidence = "tentative"
101104 try :
102105 from chardet .universaldetector import UniversalDetector
103106 buffers = []
@@ -115,6 +118,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
115118 pass
116119 # If all else fails use the default encoding
117120 if encoding is None :
121+ confidence = "tentative"
118122 encoding = self .defaultEncoding
119123
120124 #Substitute for equivalent encodings:
@@ -123,7 +127,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
123127 if encoding .lower () in encodingSub :
124128 encoding = encodingSub [encoding .lower ()]
125129
126- return encoding
130+ return encoding , confidence
127131
128132 def detectBOM (self ):
129133 """Attempts to detect a BOM at the start of the stream. If
@@ -200,7 +204,8 @@ def detectEncodingMeta(self):
200204 buffer = self .rawStream .read (self .numBytesMeta )
201205 parser = EncodingParser (buffer )
202206 self .seek (buffer , 0 )
203- return parser .getEncoding ()
207+ encoding = parser .getEncoding ()
208+ return encoding
204209
205210 def updatePosition (self ):
206211 #Remove EOF from readChars, if present
@@ -414,7 +419,12 @@ def getEncoding(self):
414419 if not keepParsing :
415420 break
416421 if self .encoding is not None :
417- self .encoding = self .encoding .strip ()
422+ self .encoding = self .encoding .strip ()
423+ #Spec violation that complies with hsivonen + mjs
424+ if self .encoding .upper () in ("UTF-16" , "UTF-16BE" , "UTF-16LE" ,
425+ "UTF-32" , "UTF-32BE" , "UTF-32LE" ):
426+ self .encoding = "utf-8"
427+
418428 return self .encoding
419429
420430 def handleComment (self ):
@@ -531,7 +541,7 @@ def getAttribute(self):
531541 #11.5
532542 else :
533543 attrValue .extend (self .data .currentByte )
534- elif self .data .currentByte in (">" , '<' ):
544+ elif self .data .currentByte in (">" , "<" ):
535545 return "" .join (attrName ), ""
536546 elif self .data .currentByte in asciiUppercase :
537547 attrValue .extend (self .data .currentByte .lower ())
@@ -540,7 +550,7 @@ def getAttribute(self):
540550 while True :
541551 self .data .position += 1
542552 if self .data .currentByte in (
543- list (spaceCharacters ) + [">" , '<' ]):
553+ list (spaceCharacters ) + [">" , "<" ]):
544554 return "" .join (attrName ), "" .join (attrValue )
545555 elif self .data .currentByte in asciiUppercase :
546556 attrValue .extend (self .data .currentByte .lower ())
0 commit comments