Mercurial > p > roundup > code
diff roundup/token.py @ 470:9f7320624bc2
Added better tokenising to roundup-admin - handles spaces and stuff.
Can use quoting or backslashes. See the roundup.token pydoc.
| author | Richard Jones <richard@users.sourceforge.net> |
|---|---|
| date | Mon, 31 Dec 2001 05:09:20 +0000 |
| parents | |
| children | a1a44636bace |
line wrap: on
line diff
#
# Copyright (c) 2001 Richard Jones.
# This module is free software, and you may redistribute it and/or modify
# under the same terms as Python, so long as this copyright message and
# disclaimer are retained in their original form.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# $Id: token.py,v 1.1 2001-12-31 05:09:20 richard Exp $
#

__doc__ = """
This module provides the tokeniser used by roundup-admin.
"""

def token_split(s, whitespace=' \r\n\t', quotes='\'"',
        escaped={'r':'\r', 'n':'\n', 't':'\t'}):
    r'''Split the string up into tokens.

    Tokens are separated by runs of whitespace. An occurrence of a ' or "
    in the input causes the splitter to ignore whitespace until a matching
    quote char is found; the quote chars themselves are not part of the
    token. The other (non-matching) quote char is an ordinary character
    inside a quoted section. A quoted section may start in the middle of
    a token.

    Outside a quoted section, a backslash escapes the following character:
    whitespace and quote chars lose their special meaning, \r, \n and \t
    are converted to carriage-return, newline and tab, and for any other
    character the backslash is dropped and the character kept.

    NOTE: inside a quoted section a backslash is an ordinary character,
    so a quoted section cannot contain its own quote char.

    Arguments:
        s          - the string to split
        whitespace - characters that separate tokens
        quotes     - characters that open (and close) a quoted section
        escaped    - mapping of "character after a backslash" to its
                     replacement; it is only read, never modified

    Returns a list of token strings (an empty quoted section such as ""
    produces an empty-string token).

    Raises ValueError if a quoted section is never closed, or if the
    input ends with a backslash that has nothing to escape.

    Valid:
        hello world      (2 tokens: hello, world)
        "hello world"    (1 token: hello world)
        "Roch'e" Compaan (2 tokens: Roch'e Compaan)
        Roch\'e Compaan  (2 tokens: Roch'e Compaan)
        address="1 2 3"  (1 token: address=1 2 3)
        \\               (1 token: \)
        \n               (1 token: a newline)
        \o               (1 token: o)
    Invalid:
        "hello world     (no matching quote)
        Roch'e Compaan   (no matching quote)
        hello\           (backslash with nothing to escape)
    '''
    NEWTOKEN = 'newtoken'   # between tokens, skipping whitespace
    TOKEN = 'token'         # inside an unquoted part of a token
    QUOTE = 'quote'         # inside a quoted section of a token
    ESCAPE = 'escape'       # the previous character was a backslash

    tokens = []
    chars = []              # characters of the token being built
    quotechar = ''
    state = NEWTOKEN

    for c in s:
        if state == ESCAPE:
            # escaped-char conversions (t, r, n); anything else is kept
            # as the bare character
            # TODO: octal, hexdigit
            chars.append(escaped.get(c, c))
            # an escape is only ever entered from outside a quoted
            # section, and always leaves us inside a token
            state = TOKEN
        elif state == QUOTE:
            if c == quotechar:
                # matching quote found: the token continues until
                # whitespace
                state = TOKEN
            else:
                chars.append(c)
        elif state == NEWTOKEN:
            if c in quotes:
                # quoted token
                state = QUOTE
                quotechar = c
            elif c in whitespace:
                # skip whitespace between tokens
                pass
            elif c == '\\':
                state = ESCAPE
            else:
                # first ordinary character of a new token
                state = TOKEN
                chars.append(c)
        else:
            # state == TOKEN
            if c in whitespace:
                # whitespace terminates the current token
                tokens.append(''.join(chars))
                chars = []
                state = NEWTOKEN
            elif c in quotes:
                # quoted section embedded in a token
                state = QUOTE
                quotechar = c
            elif c == '\\':
                state = ESCAPE
            else:
                chars.append(c)

    # end of string, finish off the current token
    if state == QUOTE:
        raise ValueError("unmatched quote")
    if state == ESCAPE:
        # previously the pending token was silently discarded here
        raise ValueError("trailing backslash with nothing to escape")
    if state == TOKEN:
        tokens.append(''.join(chars))
    return tokens

#
# $Log: not supported by cvs2svn $
#
#
# vim: set filetype=python ts=4 sw=4 et si
