Mercurial > p > roundup > code
comparison roundup/cgi/TAL/HTMLParser.py @ 2348:8c2402a78bb0
beginning getting ZPT up to date: TAL first
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Fri, 21 May 2004 05:36:30 +0000 |
| parents | fc52d57c6c3e |
| children | 63868084b8bb |
comparison
equal
deleted
inserted
replaced
| 2347:fbbda3b1816d | 2348:8c2402a78bb0 |
|---|---|
| 1 """A parser for HTML and XHTML.""" | 1 """A parser for HTML and XHTML.""" |
| 2 __docformat__ = 'restructuredtext' | |
| 3 | 2 |
| 4 # This file is based on sgmllib.py, but the API is slightly different. | 3 # This file is based on sgmllib.py, but the API is slightly different. |
| 5 | 4 |
| 6 # XXX There should be a way to distinguish between PCDATA (parsed | 5 # XXX There should be a way to distinguish between PCDATA (parsed |
| 7 # character data -- the normal case), RCDATA (replaceable character | 6 # character data -- the normal case), RCDATA (replaceable character |
| 9 # and CDATA (character data -- only end tags are special). | 8 # and CDATA (character data -- only end tags are special). |
| 10 | 9 |
| 11 | 10 |
| 12 import markupbase | 11 import markupbase |
| 13 import re | 12 import re |
| 14 import string | |
| 15 | 13 |
| 16 # Regular expressions used for parsing | 14 # Regular expressions used for parsing |
| 17 | 15 |
| 18 interesting_normal = re.compile('[&<]') | 16 interesting_normal = re.compile('[&<]') |
| 19 interesting_cdata = re.compile(r'<(/|\Z)') | 17 interesting_cdata = re.compile(r'<(/|\Z)') |
| 259 # Now parse the data between i+1 and j into a tag and attrs | 257 # Now parse the data between i+1 and j into a tag and attrs |
| 260 attrs = [] | 258 attrs = [] |
| 261 match = tagfind.match(rawdata, i+1) | 259 match = tagfind.match(rawdata, i+1) |
| 262 assert match, 'unexpected call to parse_starttag()' | 260 assert match, 'unexpected call to parse_starttag()' |
| 263 k = match.end() | 261 k = match.end() |
| 264 self.lasttag = tag = string.lower(rawdata[i+1:k]) | 262 self.lasttag = tag = rawdata[i+1:k].lower() |
| 265 | 263 |
| 266 while k < endpos: | 264 while k < endpos: |
| 267 m = attrfind.match(rawdata, k) | 265 m = attrfind.match(rawdata, k) |
| 268 if not m: | 266 if not m: |
| 269 break | 267 break |
| 272 attrvalue = None | 270 attrvalue = None |
| 273 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ | 271 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ |
| 274 attrvalue[:1] == '"' == attrvalue[-1:]: | 272 attrvalue[:1] == '"' == attrvalue[-1:]: |
| 275 attrvalue = attrvalue[1:-1] | 273 attrvalue = attrvalue[1:-1] |
| 276 attrvalue = self.unescape(attrvalue) | 274 attrvalue = self.unescape(attrvalue) |
| 277 attrs.append((string.lower(attrname), attrvalue)) | 275 attrs.append((attrname.lower(), attrvalue)) |
| 278 k = m.end() | 276 k = m.end() |
| 279 | 277 |
| 280 end = string.strip(rawdata[k:endpos]) | 278 end = rawdata[k:endpos].strip() |
| 281 if end not in (">", "/>"): | 279 if end not in (">", "/>"): |
| 282 lineno, offset = self.getpos() | 280 lineno, offset = self.getpos() |
| 283 if "\n" in self.__starttag_text: | 281 if "\n" in self.__starttag_text: |
| 284 lineno = lineno + string.count(self.__starttag_text, "\n") | 282 lineno = lineno + self.__starttag_text.count("\n") |
| 285 offset = len(self.__starttag_text) \ | 283 offset = len(self.__starttag_text) \ |
| 286 - string.rfind(self.__starttag_text, "\n") | 284 - self.__starttag_text.rfind("\n") |
| 287 else: | 285 else: |
| 288 offset = offset + len(self.__starttag_text) | 286 offset = offset + len(self.__starttag_text) |
| 289 self.error("junk characters in start tag: %s" | 287 self.error("junk characters in start tag: %s" |
| 290 % `rawdata[k:endpos][:20]`) | 288 % `rawdata[k:endpos][:20]`) |
| 291 if end[-2:] == '/>': | 289 if end[-2:] == '/>': |
| 338 return -1 | 336 return -1 |
| 339 j = match.end() | 337 j = match.end() |
| 340 match = endtagfind.match(rawdata, i) # </ + tag + > | 338 match = endtagfind.match(rawdata, i) # </ + tag + > |
| 341 if not match: | 339 if not match: |
| 342 self.error("bad end tag: %s" % `rawdata[i:j]`) | 340 self.error("bad end tag: %s" % `rawdata[i:j]`) |
| 343 tag = string.lower(match.group(1)) | 341 tag = match.group(1).lower() |
| 344 if ( self.cdata_endtag is not None | 342 if ( self.cdata_endtag is not None |
| 345 and tag != self.cdata_endtag): | 343 and tag != self.cdata_endtag): |
| 346 # Should be a mismatched end tag, but we'll treat it | 344 # Should be a mismatched end tag, but we'll treat it |
| 347 # as text anyway, since most HTML authors aren't | 345 # as text anyway, since most HTML authors aren't |
| 348 # interested in the finer points of syntax. | 346 # interested in the finer points of syntax. |
| 394 | 392 |
| 395 # Internal -- helper to remove special character quoting | 393 # Internal -- helper to remove special character quoting |
| 396 def unescape(self, s): | 394 def unescape(self, s): |
| 397 if '&' not in s: | 395 if '&' not in s: |
| 398 return s | 396 return s |
| 399 s = string.replace(s, "&lt;", "<") | 397 s = s.replace("&lt;", "<") |
| 400 s = string.replace(s, "&gt;", ">") | 398 s = s.replace("&gt;", ">") |
| 401 s = string.replace(s, "&apos;", "'") | 399 s = s.replace("&apos;", "'") |
| 402 s = string.replace(s, "&quot;", '"') | 400 s = s.replace("&quot;", '"') |
| 403 s = string.replace(s, "&amp;", "&") # Must be last | 401 s = s.replace("&amp;", "&") # Must be last |
| 404 return s | 402 return s |
