comparison roundup/cgi/TAL/HTMLParser.py @ 2348:8c2402a78bb0

beginning getting ZPT up to date: TAL first
author Richard Jones <richard@users.sourceforge.net>
date Fri, 21 May 2004 05:36:30 +0000
parents fc52d57c6c3e
children 63868084b8bb
comparison
equal deleted inserted replaced
2347:fbbda3b1816d 2348:8c2402a78bb0
1 """A parser for HTML and XHTML.""" 1 """A parser for HTML and XHTML."""
2 __docformat__ = 'restructuredtext'
3 2
4 # This file is based on sgmllib.py, but the API is slightly different. 3 # This file is based on sgmllib.py, but the API is slightly different.
5 4
6 # XXX There should be a way to distinguish between PCDATA (parsed 5 # XXX There should be a way to distinguish between PCDATA (parsed
7 # character data -- the normal case), RCDATA (replaceable character 6 # character data -- the normal case), RCDATA (replaceable character
9 # and CDATA (character data -- only end tags are special). 8 # and CDATA (character data -- only end tags are special).
10 9
11 10
12 import markupbase 11 import markupbase
13 import re 12 import re
14 import string
15 13
16 # Regular expressions used for parsing 14 # Regular expressions used for parsing
17 15
18 interesting_normal = re.compile('[&<]') 16 interesting_normal = re.compile('[&<]')
19 interesting_cdata = re.compile(r'<(/|\Z)') 17 interesting_cdata = re.compile(r'<(/|\Z)')
259 # Now parse the data between i+1 and j into a tag and attrs 257 # Now parse the data between i+1 and j into a tag and attrs
260 attrs = [] 258 attrs = []
261 match = tagfind.match(rawdata, i+1) 259 match = tagfind.match(rawdata, i+1)
262 assert match, 'unexpected call to parse_starttag()' 260 assert match, 'unexpected call to parse_starttag()'
263 k = match.end() 261 k = match.end()
264 self.lasttag = tag = string.lower(rawdata[i+1:k]) 262 self.lasttag = tag = rawdata[i+1:k].lower()
265 263
266 while k < endpos: 264 while k < endpos:
267 m = attrfind.match(rawdata, k) 265 m = attrfind.match(rawdata, k)
268 if not m: 266 if not m:
269 break 267 break
272 attrvalue = None 270 attrvalue = None
273 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 271 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
274 attrvalue[:1] == '"' == attrvalue[-1:]: 272 attrvalue[:1] == '"' == attrvalue[-1:]:
275 attrvalue = attrvalue[1:-1] 273 attrvalue = attrvalue[1:-1]
276 attrvalue = self.unescape(attrvalue) 274 attrvalue = self.unescape(attrvalue)
277 attrs.append((string.lower(attrname), attrvalue)) 275 attrs.append((attrname.lower(), attrvalue))
278 k = m.end() 276 k = m.end()
279 277
280 end = string.strip(rawdata[k:endpos]) 278 end = rawdata[k:endpos].strip()
281 if end not in (">", "/>"): 279 if end not in (">", "/>"):
282 lineno, offset = self.getpos() 280 lineno, offset = self.getpos()
283 if "\n" in self.__starttag_text: 281 if "\n" in self.__starttag_text:
284 lineno = lineno + string.count(self.__starttag_text, "\n") 282 lineno = lineno + self.__starttag_text.count("\n")
285 offset = len(self.__starttag_text) \ 283 offset = len(self.__starttag_text) \
286 - string.rfind(self.__starttag_text, "\n") 284 - self.__starttag_text.rfind("\n")
287 else: 285 else:
288 offset = offset + len(self.__starttag_text) 286 offset = offset + len(self.__starttag_text)
289 self.error("junk characters in start tag: %s" 287 self.error("junk characters in start tag: %s"
290 % `rawdata[k:endpos][:20]`) 288 % `rawdata[k:endpos][:20]`)
291 if end[-2:] == '/>': 289 if end[-2:] == '/>':
338 return -1 336 return -1
339 j = match.end() 337 j = match.end()
340 match = endtagfind.match(rawdata, i) # </ + tag + > 338 match = endtagfind.match(rawdata, i) # </ + tag + >
341 if not match: 339 if not match:
342 self.error("bad end tag: %s" % `rawdata[i:j]`) 340 self.error("bad end tag: %s" % `rawdata[i:j]`)
343 tag = string.lower(match.group(1)) 341 tag = match.group(1).lower()
344 if ( self.cdata_endtag is not None 342 if ( self.cdata_endtag is not None
345 and tag != self.cdata_endtag): 343 and tag != self.cdata_endtag):
346 # Should be a mismatched end tag, but we'll treat it 344 # Should be a mismatched end tag, but we'll treat it
347 # as text anyway, since most HTML authors aren't 345 # as text anyway, since most HTML authors aren't
348 # interested in the finer points of syntax. 346 # interested in the finer points of syntax.
394 392
395 # Internal -- helper to remove special character quoting 393 # Internal -- helper to remove special character quoting
396 def unescape(self, s): 394 def unescape(self, s):
397 if '&' not in s: 395 if '&' not in s:
398 return s 396 return s
399 s = string.replace(s, "&lt;", "<") 397 s = s.replace("&lt;", "<")
400 s = string.replace(s, "&gt;", ">") 398 s = s.replace("&gt;", ">")
401 s = string.replace(s, "&apos;", "'") 399 s = s.replace("&apos;", "'")
402 s = string.replace(s, "&quot;", '"') 400 s = s.replace("&quot;", '"')
403 s = string.replace(s, "&amp;", "&") # Must be last 401 s = s.replace("&amp;", "&") # Must be last
404 return s 402 return s

Roundup Issue Tracker: http://roundup-tracker.org/