22// See LICENSE in the project root for license information.
33
44import { TextRange } from './TextRange' ;
5+ import { ParseError } from './ParseError' ;
56
67export enum TokenKind {
78 // One or more spaces/tabs
@@ -10,24 +11,32 @@ export enum TokenKind {
1011 NewLine ,
1112 // An unrecognized character
1213 Other ,
14+ // A sequence of characters that doesn't contain any symbols with special meaning
15+ // Characters can be escaped, in which case the Token.text may differ from the
16+ // Token.range.toString()
17+ Text ,
1318 // The end of the input string
1419 EndOfInput
1520}
1621
/**
 * A single lexical token produced from the input text.
 */
export class Token {
  /** The category of this token. */
  public readonly kind: TokenKind;

  /** The span of the input that this token was read from. */
  public readonly range: TextRange;

  /**
   * The token's textual value. This may differ from `range.toString()` when a
   * caller supplies an explicit `text` (e.g. for a token whose characters were escaped).
   */
  public readonly text: string;

  /**
   * @param kind - the category of the token
   * @param range - the span of the input covered by the token
   * @param text - optional override for the token's text; when omitted,
   *   the text is taken from `range.toString()`
   */
  public constructor(kind: TokenKind, range: TextRange, text?: string) {
    this.kind = kind;
    this.range = range;
    // Compare against undefined explicitly so that an empty string is kept as a valid override.
    this.text = text === undefined ? this.range.toString() : text;
  }

  /**
   * @returns the token's text (not necessarily the raw input covered by `range`)
   */
  public toString(): string {
    return this.text;
  }
}
3037
// Matches a single character that can be part of a Text token: an ASCII letter
// (case-insensitive), a digit, an underscore, or a backslash (which introduces
// an escaped character).
const wordCharacterOrBackslashRegExp: RegExp = /[a-z0-9_\\]/i;
39+
3140export class Tokenizer {
3241 public readonly input : TextRange ;
3342 private _currentIndex : number ;
@@ -49,7 +58,7 @@ export class Tokenizer {
4958 const input : TextRange = this . input ;
5059
5160 const startIndex : number = this . _currentIndex ;
52- let c : string | undefined = this . _get ( ) ;
61+ let c : string | undefined = this . _peek ( ) ;
5362
5463 // Reached end of input yet?
5564 if ( c === undefined ) {
@@ -58,6 +67,7 @@ export class Tokenizer {
5867
5968 // Is it a sequence of whitespace?
6069 if ( / [ \t ] / . test ( c ) ) {
70+ this . _get ( ) ;
6171
6272 while ( Tokenizer . _isSpace ( this . _peek ( ) ) ) {
6373 this . _get ( ) ;
@@ -68,15 +78,39 @@ export class Tokenizer {
6878
6979 // Is it a newline?
7080 if ( c === '\r' ) {
81+ this . _get ( ) ;
7182 if ( this . _peek ( ) === '\n' ) {
7283 this . _get ( ) ;
7384 }
7485 return new Token ( TokenKind . NewLine , input . getNewRange ( startIndex , this . _currentIndex ) ) ;
7586 } else if ( c === '\n' ) {
87+ this . _get ( ) ;
7688 return new Token ( TokenKind . NewLine , input . getNewRange ( startIndex , this . _currentIndex ) ) ;
7789 }
7890
91+ // Is it a text token?
92+ if ( wordCharacterOrBackslashRegExp . test ( c ) ) {
93+ let text : string = '' ;
94+ while ( wordCharacterOrBackslashRegExp . test ( c ) ) {
95+ if ( c === '\\' ) {
96+ this . _get ( ) ; // discard the backslash
97+ if ( this . _peek ( ) === undefined ) {
98+ throw new ParseError ( 'Backslash encountered at end of stream' ,
99+ input . getNewRange ( this . _currentIndex , this . _currentIndex + 1 ) ) ;
100+ }
101+ text += this . _get ( ) ;
102+ } else {
103+ text += this . _get ( ) ;
104+ }
105+
106+ c = this . _peek ( ) ;
107+ }
108+
109+ return new Token ( TokenKind . Text , input . getNewRange ( startIndex , this . _currentIndex ) , text ) ;
110+ }
111+
79112 // Otherwise treat it as an "other" character
113+ this . _get ( ) ;
80114 return new Token ( TokenKind . Other , input . getNewRange ( startIndex , this . _currentIndex ) ) ;
81115 }
82116
0 commit comments