Mercurial > p > roundup > code
comparison TAL/HTMLTALParser.py @ 996:03cc9a57cb4c
missed one
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Fri, 30 Aug 2002 08:46:22 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 995:b59b60306914 | 996:03cc9a57cb4c |
|---|---|
| 1 ############################################################################## | |
| 2 # | |
| 3 # Copyright (c) 2001, 2002 Zope Corporation and Contributors. | |
| 4 # All Rights Reserved. | |
| 5 # | |
| 6 # This software is subject to the provisions of the Zope Public License, | |
| 7 # Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | |
| 8 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | |
| 9 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| 10 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | |
| 11 # FOR A PARTICULAR PURPOSE | |
| 12 # | |
| 13 ############################################################################## | |
| 14 """ | |
| 15 Parse HTML and compile to TALInterpreter intermediate code. | |
| 16 """ | |
| 17 | |
| 18 import sys | |
| 19 import string | |
| 20 | |
| 21 from TALGenerator import TALGenerator | |
| 22 from TALDefs import ZOPE_METAL_NS, ZOPE_TAL_NS, METALError, TALError | |
| 23 from HTMLParser import HTMLParser, HTMLParseError | |
| 24 | |
# Tag/attribute tables that drive the parser's HTML-specific behaviour.
# They are kept as plain lists/dicts so that existing code which
# concatenates or indexes them keeps working.

BOOLEAN_HTML_ATTRS = [
    # List of Boolean attributes in HTML that may be given in
    # minimized form (e.g. <img ismap> rather than <img ismap="">)
    # From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer"
    ]

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

# Maps a tag being opened to the still-open tags that it implicitly closes.
BLOCK_CLOSING_TAG_MAP = {
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    }

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    # NOTE(review): "noframe" may have been meant as "noframes" -- confirm
    # before changing, since it alters which tags act as block boundaries.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
    ]

# Tags whose implied close tag is emitted before any trailing whitespace.
# list() is required because dict.keys() is a view, not a list, on
# Python 3 and cannot be added to a list directly.
TIGHTEN_IMPLICIT_CLOSE_TAGS = (PARA_LEVEL_HTML_TAGS
                               + list(BLOCK_CLOSING_TAG_MAP.keys()))
| 65 | |
| 66 | |
class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested.

    The offending close tag is kept on the instance as ``endtag``.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        """Build the error message from the open tags and the close tag.

        tagstack -- sequence of currently open tag names (may be empty)
        endtag   -- name of the close tag that could not be matched
        position -- position value handed through to HTMLParseError
        """
        self.endtag = endtag
        if tagstack:
            if len(tagstack) == 1:
                msg = ('Open tag <%s> does not match close tag </%s>'
                       % (tagstack[0], endtag))
            else:
                # str.join instead of string.join(), which no longer
                # exists on Python 3; the resulting text is the same.
                msg = ('Open tags <%s> do not match close tag </%s>'
                       % ('>, <'.join(tagstack), endtag))
        else:
            msg = 'No tags are open to match </%s>' % endtag
        HTMLParseError.__init__(self, msg, position)
| 82 | |
class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    The tag name is stored on the instance as ``tag``.
    """

    def __init__(self, tag, position=(None, None)):
        # HTMLParseError.__init__ is called directly; NestingError.__init__
        # takes a tag stack and is not invoked here.
        self.tag = tag
        text = 'Close tag </%s> should be removed' % tag
        HTMLParseError.__init__(self, text, position)
| 90 | |
class OpenTagError(NestingError):
    """Exception raised when a tag is not allowed in another tag.

    The rejected tag name is stored on the instance as ``tag``.
    """

    def __init__(self, tagstack, tag, position=(None, None)):
        # The message names the innermost open tag, i.e. the last entry
        # of tagstack; HTMLParseError.__init__ is called directly.
        self.tag = tag
        enclosing = tagstack[-1]
        text = 'Tag <%s> is not allowed in <%s>' % (tag, enclosing)
        HTMLParseError.__init__(self, text, position)
| 98 | |
class HTMLTALParser(HTMLParser):
    """HTML parser that forwards parse events to a TAL code generator.

    Instance state:
      gen      -- generator object receiving the emit* calls
      tagstack -- names of the currently open tags, innermost last
      nsstack  -- saved namespace dicts, one entry per open element
      nsdict   -- namespace prefix -> URI mapping currently in effect
    """

    # External API

    def __init__(self, gen=None):
        """Create a parser; a TALGenerator(xml=0) is used if gen is None."""
        HTMLParser.__init__(self)
        if gen is None:
            gen = TALGenerator(xml=0)
        self.gen = gen
        self.tagstack = []
        self.nsstack = []
        self.nsdict = {'tal': ZOPE_TAL_NS, 'metal': ZOPE_METAL_NS}

    def parseFile(self, file):
        """Read the file named by ``file`` and parse its contents."""
        f = open(file)
        try:
            data = f.read()
        finally:
            # Close the handle even when read() raises.
            f.close()
        self.parseString(data)

    def parseString(self, data):
        """Parse ``data`` and close every tag still open at the end."""
        self.feed(data)
        self.close()
        # implied=2 is the value used for tags closed at end of input.
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)
        assert self.nsstack == [], self.nsstack

    def getCode(self):
        """Return whatever the generator's getCode() returns."""
        return self.gen.getCode()

    def getWarnings(self):
        """Return the warnings collected while parsing (always empty)."""
        return ()

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag: push it and emit a start element.

        Tags listed in EMPTY_HTML_TAGS are closed again immediately
        with implied=-1.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict = self.process_ns(tag, attrs)
        self.tagstack.append(tag)
        self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                  self.getpos())
        if tag in EMPTY_HTML_TAGS:
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as <br/>.

        When a non-empty tal "content" attribute is present, separate
        start and end elements are emitted; otherwise a single start
        element is emitted with isend=1.  The tag is never pushed on
        tagstack.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict = self.process_ns(tag, attrs)
        if taldict.get("content"):
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      self.getpos())
            self.gen.emitEndElement(tag, implied=-1)
        else:
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      self.getpos(), isend=1)
        self.pop_xmlns()

    def handle_endtag(self, tag):
        """Handle an explicit close tag.

        Raises EmptyTagError for tags in EMPTY_HTML_TAGS and, via
        close_enclosed_tags, NestingError when ``tag`` is not open.
        """
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.gen.emitEndElement(tag)
        self.pop_xmlns()
        self.tagstack.pop()

    def close_para_tags(self, tag):
        """Implicitly close open tags that the start of ``tag`` terminates.

        Raises OpenTagError when a paragraph- or block-level tag is
        opened while a paragraph-level tag other than <p> is still open
        inside the innermost block-level tag.
        """
        if tag in EMPTY_HTML_TAGS:
            return
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            # Walk the stack outermost-first; remember the first closable
            # tag, but forget it whenever another block-level tag follows.
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS:
            # Walk the stack innermost-first, stopping at the first
            # block-level tag.
            i = len(self.tagstack) - 1
            while i >= 0:
                closetag = self.tagstack[i]
                if closetag in BLOCK_LEVEL_HTML_TAGS:
                    break
                if closetag in PARA_LEVEL_HTML_TAGS:
                    if closetag != "p":
                        raise OpenTagError(self.tagstack, tag, self.getpos())
                    close_to = i
                i = i - 1
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        """Implicitly close every tag opened after the innermost ``tag``.

        Raises NestingError if ``tag`` is not on the tag stack.
        """
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        """Emit an end element for the innermost open tag and pop it.

        ``implied`` must be -1, 1 or 2 and is passed on to the
        generator; a negative value also sets isend.  Callers in this
        class pass -1 for empty HTML tags, 1 for tags closed by another
        tag and 2 for tags still open at end of input.
        """
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        if tag in TIGHTEN_IMPLICIT_CLOSE_TAGS:
            # Pick out trailing whitespace from the program, and
            # insert the close tag before the whitespace.
            white = self.gen.unEmitWhitespace()
        else:
            white = None
        self.gen.emitEndElement(tag, isend=isend, implied=implied)
        if white:
            self.gen.emitRawText(white)
        self.tagstack.pop()
        self.pop_xmlns()

    def handle_charref(self, name):
        """Pass a character reference through as raw text."""
        self.gen.emitRawText("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass an entity reference through as raw text."""
        self.gen.emitRawText("&%s;" % name)

    def handle_data(self, data):
        """Pass character data through as raw text."""
        self.gen.emitRawText(data)

    def handle_comment(self, data):
        """Pass a comment through as raw text, delimiters restored."""
        self.gen.emitRawText("<!--%s-->" % data)

    def handle_decl(self, data):
        """Pass a declaration through as raw text, delimiters restored."""
        self.gen.emitRawText("<!%s>" % data)

    def handle_pi(self, data):
        """Pass a processing instruction through as raw text."""
        self.gen.emitRawText("<?%s>" % data)

    # Internal thingies

    def scan_xmlns(self, attrs):
        """Save the current namespace dict and apply any xmlns:* attrs.

        The previous nsdict is always pushed on nsstack, so every call
        must be balanced by one pop_xmlns() call.
        """
        nsnew = {}
        for key, value in attrs:
            if key[:6] == "xmlns:":
                nsnew[key[6:]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            # Work on a copy so the saved dict stays unchanged.
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        """Restore the namespace dict saved by the matching scan_xmlns()."""
        self.nsdict = self.nsstack.pop()

    def fixname(self, name):
        """Classify a tag or attribute name by namespace.

        Returns a (name, basename, ns) tuple where ns is 'tal', 'metal',
        'xmlns' (for an xmlns:* declaration of the TAL or METAL
        namespace) or 0 for everything else.  basename has the prefix
        stripped only for 'tal' and 'metal' names.
        """
        if ':' in name:
            prefix, suffix = name.split(':', 1)
            if prefix == 'xmlns':
                nsuri = self.nsdict.get(suffix)
                if nsuri in (ZOPE_TAL_NS, ZOPE_METAL_NS):
                    return name, name, prefix
            else:
                nsuri = self.nsdict.get(prefix)
                if nsuri == ZOPE_TAL_NS:
                    return name, suffix, 'tal'
                elif nsuri == ZOPE_METAL_NS:
                    return name, suffix, 'metal'
        return name, name, 0

    def process_ns(self, name, attrs):
        """Split an element's attributes into plain, TAL and METAL sets.

        Returns (name, attrlist, taldict, metaldict).  Attributes with a
        namespace appear in attrlist as (key, value, ns) triples, all
        others as the original item.  Raises TALError or METALError when
        the same TAL or METAL attribute occurs twice.
        """
        attrlist = []
        taldict = {}
        metaldict = {}
        name, namebase, namens = self.fixname(name)
        for item in attrs:
            key, value = item
            key, keybase, keyns = self.fixname(key)
            ns = keyns or namens # default to tag namespace
            if ns and ns != 'unknown':
                item = (key, value, ns)
            if ns == 'tal':
                if keybase in taldict:
                    raise TALError("duplicate TAL attribute " +
                                   repr(keybase), self.getpos())
                taldict[keybase] = value
            elif ns == 'metal':
                if keybase in metaldict:
                    raise METALError("duplicate METAL attribute " +
                                     repr(keybase), self.getpos())
                metaldict[keybase] = value
            attrlist.append(item)
        if namens in ('metal', 'tal'):
            # Record under a reserved key that the tag itself is in the
            # TAL or METAL namespace.
            taldict['tal tag'] = namens
        return name, attrlist, taldict, metaldict
