Mercurial > p > roundup > code
comparison roundup/cgi/TAL/markupbase.py @ 1049:b9988e118055
moved
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Thu, 05 Sep 2002 00:37:09 +0000 |
| parents | |
| children | fc52d57c6c3e |
comparison
equal
deleted
inserted
replaced
| 1048:1250251f2793 | 1049:b9988e118055 |
|---|---|
| 1 """Shared support for scanning document type declarations in HTML and XHTML.""" | |
| 2 | |
| 3 import re | |
| 4 import string | |
| 5 | |
# Matcher for a name token: one ASCII letter followed by any run of
# letters, digits, '-', '_' or '.', plus any trailing whitespace.  Only the
# compiled pattern's bound ``match`` method is kept, so it is called as
# _declname_match(string, pos).
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matcher for one single- or double-quoted string literal (no escapes; the
# literal ends at the next matching quote), plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# Only the two bound matchers above are used below; drop the module name
# from this namespace.
del re
| 10 | |
| 11 | |
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read ``self.rawdata`` and call ``self.error()``,
    ``self.handle_decl()``, ``self.unknown_decl()`` and
    ``self.parse_comment()``.  None of these is defined in this class, so
    the concrete parser subclass has to supply them.

    The scanning methods return the index at which scanning should resume,
    or -1 when the buffer ends before the construct is complete.
    """

    def reset(self):
        """Reset position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over ``rawdata[i:j]`` and return ``j``.

        Nothing is updated when the range is empty (``i >= j``).
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: nlines > 0 guarantees a newline in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted (and skipped) inside a declaration; the
    # class default accepts none.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` declaration starting at ``rawdata[i]``.

        In "HTML as deployed" this should only be the document type
        declaration (``<!DOCTYPE html...>``).  In practice the body looks
        like ``((name|stringlit) S*)+ '>'``.

        On success the declaration text (without ``<!`` and ``>``) is
        passed to ``handle_decl()`` for a doctype or ``unknown_decl()``
        otherwise, and the index just past ``>`` is returned.  Returns -1
        when the declaration is incomplete, or when ``<!`` is followed by
        ``-`` (start of a comment) or by the end of the buffer.
        """
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                # The name itself is not needed, only the new position.
                j = self._scan_name(j, i)[1]
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %s char in declaration" % repr(c))
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the ``[...]`` internal subset whose contents start at ``i``.

        ``declstartpos`` is the start of the enclosing declaration and is
        only used to update the reported position before ``error()`` is
        called.  Returns the index of the ``>`` that follows the closing
        ``]`` (after optional whitespace), or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # end of buffer; incomplete (too short to tell a
                    # comment opener from a declaration name)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                j = self._scan_name(j + 1, declstartpos)[1]
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an ``<!ELEMENT`` declaration whose name starts at ``i``.

        The content model is not interpreted; everything up to the next
        ``>`` is skipped.  Returns the index just past that ``>``, or -1
        if the buffer ends first.
        """
        rawdata = self.rawdata
        j = self._scan_name(i, declstartpos)[1]
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an ``<!ATTLIST`` declaration whose element name starts at ``i``.

        Returns the index just past the closing ``>``, or -1 if the buffer
        ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = self._scan_name(i, declstartpos)[1]
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            j = self._scan_name(j, declstartpos)[1]
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Stop at the end of the buffer: an empty slice counts as
                # "in" any string, so an unbounded membership test here
                # would never terminate.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                j = self._scan_name(j, declstartpos)[1]
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                j = self._scan_name(j + 1, declstartpos)[1]
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a ``<!NOTATION`` declaration whose name starts at ``i``.

        Names and quoted literals are skipped until ``>``.  Returns the
        index just past it, or -1 if the buffer ends first.
        """
        j = self._scan_name(i, declstartpos)[1]
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                j = self._scan_name(j, declstartpos)[1]
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an ``<!ENTITY`` declaration whose body starts at ``i``.

        An optional leading ``%`` and the whitespace after it are skipped
        first.  Then names and quoted literals are skipped until ``>``.
        Returns the index just past it, or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        j = self._scan_name(j, declstartpos)[1]
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                j = self._scan_name(j, declstartpos)[1]
                if j < 0:
                    return j

    # Internal -- scan a name token.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at ``rawdata[i]``.

        Returns ``(name, newpos)``: the lower-cased name, and the index
        just past the name and any whitespace after it.  Returns
        ``(None, -1)`` when ``i`` is at the end of the buffer or the token
        runs up to the end of the buffer (it may continue in later data).

        If no name starts at ``i``, the position is updated and
        ``error()`` is called.  ``error()`` is not defined in this class
        and is presumably expected to raise: if it returns, this method
        falls through and returns None.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token", self.getpos())
