Mercurial > p > roundup > code
comparison TAL/HTMLParser.py @ 982:bfd348432420
Adding TAL to the dist
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Fri, 30 Aug 2002 08:23:53 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 979:f36ffa50374f | 982:bfd348432420 |
|---|---|
| 1 """A parser for HTML and XHTML.""" | |
| 2 | |
| 3 # This file is based on sgmllib.py, but the API is slightly different. | |
| 4 | |
| 5 # XXX There should be a way to distinguish between PCDATA (parsed | |
| 6 # character data -- the normal case), RCDATA (replaceable character | |
| 7 # data -- only char and entity references and end tags are special) | |
| 8 # and CDATA (character data -- only end tags are special). | |
| 9 | |
| 10 | |
| 11 import markupbase | |
| 12 import re | |
| 13 import string | |
| 14 | |
# Regular expressions used for parsing

# What goahead() scans for: '&' or '<' in normal mode; in CDATA mode only
# '</' or a '<' sitting at the very end of the buffer.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that may become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows the
# reference (normally ';'); the caller backs up one position when that
# character is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Groups: 1 = attribute name, 2 = '=value' part (None when absent),
# 3 = the value, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string so that '\s' reaches the regex engine as written instead of
# relying on Python passing an unrecognised string escape through.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
| 49 | |
| 50 | |
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be non-empty)
        lineno -- first element of *position*, or None
        offset -- second element of *position*, or None
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Chain to the base class so that ``args`` carries the message.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line/column when they are known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is reported one higher, as a column number.
            result = result + ", column %d" % (self.offset + 1)
        return result
| 67 | |
| 68 | |
| 69 def _contains_at(s, sub, pos): | |
| 70 return s[pos:pos+len(sub)] == sub | |
| 71 | |
| 72 | |
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the parser switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    # Tag name recorded by set_cdata_mode(); None outside CDATA mode.
    cdata_endtag = None

    def set_cdata_mode(self, endtag=None):
        """Switch to CDATA scanning and record *endtag* as cdata_endtag."""
        self.cdata_endtag = endtag
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return to normal scanning and forget the recorded end tag."""
        self.cdata_endtag = None
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i): # </
                    k = self.parse_endtag(i)
                elif _contains_at(rawdata, "<!--", i): # <!--
                    k = self.parse_comment(i)
                elif _contains_at(rawdata, "<!", i): # <!
                    k = self.parse_declaration(i)
                elif _contains_at(rawdata, "<?", i): # <?
                    k = self.parse_pi(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more data.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern also consumed the character after the
                    # reference; give it back unless it was the ';'.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same give-back rule as for character references.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: strip the quotes and decode entities.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # NOTE(review): lineno/offset are computed here but never handed
            # to error(), which reports self.getpos() instead -- confirm
            # whether the adjusted position was meant to be used.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1).lower()
        if (self.cdata_endtag is not None
                and tag != self.cdata_endtag):
            # Should be a mismatched end tag, but we'll treat it
            # as text anyway, since most HTML authors aren't
            # interested in the finer points of syntax.
            self.handle_data(match.group(0))
        else:
            self.handle_endtag(tag)
            self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report *data* as an unknown declaration through error()."""
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&") # Must be last
        return s
