@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7272 self .chunk = u""
7373 self .chunkSize = 0
7474 self .chunkOffset = 0
75- self .ungetBuffer = [] # reversed list of chars from unget()
76- self .readChars = []
7775 self .errors = []
78-
79- self .lineLengths = []
76+ # Single-character buffer to handle 'unget'
77+ self .ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78+
79+ # Remember the current position in the document
80+ self .positionLine = 1
81+ self .positionCol = 0
82+ # Remember the length of the last line, so unget("\n") can restore
83+ # positionCol. (Only one character can be ungot at once, so we only
84+ # need to remember the single last line.)
85+ self .lastLineLength = None
8086
8187 #Flag to indicate we may have a CR LF broken across a data chunk
8288 self ._lastChunkEndsWithCR = False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219225 encoding = parser .getEncoding ()
220226 return encoding
221227
222- def updatePosition (self ):
223- #Remove EOF from readChars, if present
224- if not self .readChars :
225- return
226- if self .readChars and self .readChars [- 1 ] == EOF :
227- #There may be more than one EOF in readChars so we cannot assume
228- #readChars.index(EOF) == -1
229- self .readChars = self .readChars [:self .readChars .index (EOF )]
230- readChars = "" .join (self .readChars )
231- lines = readChars .split ("\n " )
232- if self .lineLengths :
233- self .lineLengths [- 1 ] += len (lines [0 ])
228+ def updatePosition (self , chars ):
229+ # Update the position attributes to correspond to some sequence of
230+ # read characters
231+
232+ # Find the last newline character
233+ idx = chars .rfind (u"\n " )
234+ if idx == - 1 :
235+ # No newlines in chars
236+ self .positionCol += len (chars )
234237 else :
235- self .lineLengths .append (len (lines [0 ]))
236- for line in lines [1 :]:
237- self .lineLengths .append (len (line ))
238- self .readChars = []
239- #print self.lineLengths
238+ # Find the last-but-one newline character
239+ idx2 = chars .rfind (u"\n " , 0 , idx )
240+ if idx2 == - 1 :
241+ # Only one newline in chars
242+ self .positionLine += 1
243+ self .lastLineLength = self .positionCol + idx
244+ self .positionCol = len (chars ) - (idx + 1 )
245+ else :
246+ # At least two newlines in chars
247+ newlines = chars .count (u"\n " )
248+ self .positionLine += newlines
249+ self .lastLineLength = idx - (idx2 + 1 )
250+ self .positionCol = len (chars ) - (idx + 1 )
240251
241252 def position (self ):
242253 """Returns (line, col) of the current position in the stream."""
243- self .updatePosition ()
244- if self .lineLengths :
245- line , col = len (self .lineLengths ), self .lineLengths [- 1 ]
246- else :
247- line , col = 1 ,0
248- return (line , col )
254+ return (self .positionLine , self .positionCol )
249255
250256 def char (self ):
251257 """ Read one character from the stream or queue if available. Return
252258 EOF when EOF is reached.
253259 """
254- if self .ungetBuffer :
255- char = self .ungetBuffer .pop ()
256- self .readChars .append (char )
257- return char
258-
259- if self .chunkOffset >= self .chunkSize :
260- if not self .readChunk ():
261- return EOF
262-
263- char = self .chunk [self .chunkOffset ]
264- self .chunkOffset += 1
260+ char = self .ungetChar
261+ if char != u"" :
262+ # Use the ungot character, and reset the buffer
263+ self .ungetChar = u""
264+ else :
265+ # Read a new chunk from the input stream if necessary
266+ if self .chunkOffset >= self .chunkSize :
267+ if not self .readChunk ():
268+ return EOF
269+
270+ char = self .chunk [self .chunkOffset ]
271+ self .chunkOffset += 1
272+
273+ # Update the position attributes
274+ if char == u"\n " :
275+ self .lastLineLength = self .positionCol
276+ self .positionCol = 0
277+ self .positionLine += 1
278+ elif char is not EOF :
279+ self .positionCol += 1
265280
266- self .readChars .append (char )
267281 return char
268282
269283 def readChunk (self , chunkSize = _defaultChunkSize ):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282296
283297 data = data .replace (u"\u0000 " , u"\ufffd " )
284298 #Check for CR LF broken across chunks
285- if (self ._lastChunkEndsWithCR and data [0 ] == "\n " ):
299+ if (self ._lastChunkEndsWithCR and data [0 ] == u "\n " ):
286300 data = data [1 :]
287301 # Stop if the chunk is now empty
288302 if not data :
289303 return False
290- self ._lastChunkEndsWithCR = data [- 1 ] == "\r "
291- data = data .replace ("\r \n " , "\n " )
292- data = data .replace ("\r " , "\n " )
304+ self ._lastChunkEndsWithCR = data [- 1 ] == u "\r "
305+ data = data .replace (u "\r \n " , u "\n " )
306+ data = data .replace (u "\r " , u "\n " )
293307
294- data = unicode (data )
295308 self .chunk = data
296309 self .chunkSize = len (data )
297310
298- self .updatePosition ()
299311 return True
300312
301313 def charsUntil (self , characters , opposite = False ):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307319
308320 rv = []
309321
310- # The unget buffer is typically small and rarely used, so
311- # just check each character individually
312- while self .ungetBuffer :
313- if self .ungetBuffer [- 1 ] == EOF or (self .ungetBuffer [- 1 ] in characters ) != opposite :
314- r = u"" .join (rv )
315- self .readChars .extend (list (r ))
316- return r
322+ # Check the ungot character, if any.
323+ # (Since it's only a single character, don't use the regex here)
324+ char = self .ungetChar
325+ if char != u"" :
326+ if char is EOF or (char in characters ) != opposite :
327+ return u""
317328 else :
318- rv .append (self .ungetBuffer .pop ())
329+ rv .append (char )
330+ self .ungetChar = u""
319331
320332 # Use a cache of regexps to find the required characters
321333 try :
322334 chars = charsUntilRegEx [(characters , opposite )]
323335 except KeyError :
324336 for c in characters : assert (ord (c ) < 128 )
325- regex = u"" .join (["\\ x%02x" % ord (c ) for c in characters ])
337+ regex = u"" .join ([u "\\ x%02x" % ord (c ) for c in characters ])
326338 if not opposite :
327339 regex = u"^%s" % regex
328340 chars = charsUntilRegEx [(characters , opposite )] = re .compile (u"[%s]*" % regex )
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343355 break
344356
345357 r = u"" .join (rv )
346- self .readChars . extend ( list ( r ) )
358+ self .updatePosition ( r )
347359 return r
348360
349- def unget (self , chars ):
350- self .updatePosition ()
351- if chars :
352- l = list (chars )
353- l .reverse ()
354- self .ungetBuffer .extend (l )
355- #Alter the current line, col position
356- for c in chars [::- 1 ]:
357- if c is None :
358- continue
359- elif c == '\n ' :
360- assert self .lineLengths [- 1 ] == 0
361- self .lineLengths .pop ()
362- else :
363- self .lineLengths [- 1 ] -= 1
361+ def unget (self , char ):
362+ # Only one character is allowed to be ungotten at once - it must
363+ # be consumed again before any further call to unget
364+ assert self .ungetChar == u""
365+
366+ self .ungetChar = char
367+
368+ # Update the position attributes
369+ if char is None :
370+ pass
371+ elif char == u"\n " :
372+ assert self .positionLine >= 1
373+ assert self .lastLineLength is not None
374+ self .positionLine -= 1
375+ self .positionCol = self .lastLineLength
376+ self .lastLineLength = None
377+ else :
378+ self .positionCol -= 1
364379
365380class EncodingBytes (str ):
366381 """String-like object with an assosiated position and various extra methods
0 commit comments