Skip to content

Commit e825500

Browse files
committed
Implement new "text" token type
1 parent 5999f65 commit e825500

2 files changed

Lines changed: 45 additions & 35 deletions

File tree

libraries/rushell/src/Tokenizer.ts

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// See LICENSE in the project root for license information.
33

44
import { TextRange } from './TextRange';
5+
import { ParseError } from './ParseError';
56

67
export enum TokenKind {
78
// One or more spaces/tabs
@@ -10,24 +11,32 @@ export enum TokenKind {
1011
NewLine,
1112
// An unrecognized character
1213
Other,
14+
// A sequence of characters that doesn't contain any symbols with special meaning
15+
// Characters can be escaped, in which case the Token.text may differ from the
16+
// Token.range.toString()
17+
Text,
1318
// The end of the input string
1419
EndOfInput
1520
}
1621

1722
export class Token {
1823
public readonly kind: TokenKind;
1924
public readonly range: TextRange;
25+
public readonly text: string;
2026

21-
public constructor(kind: TokenKind, range: TextRange) {
27+
public constructor(kind: TokenKind, range: TextRange, text?: string) {
2228
this.kind = kind;
2329
this.range = range;
30+
this.text = text === undefined ? this.range.toString() : text;
2431
}
2532

2633
public toString(): string {
27-
return this.range.toString();
34+
return this.text;
2835
}
2936
}
3037

38+
const wordCharacterOrBackslashRegExp: RegExp = /[a-z0-9_\\]/i;
39+
3140
export class Tokenizer {
3241
public readonly input: TextRange;
3342
private _currentIndex: number;
@@ -49,7 +58,7 @@ export class Tokenizer {
4958
const input: TextRange = this.input;
5059

5160
const startIndex: number = this._currentIndex;
52-
let c: string | undefined = this._get();
61+
let c: string | undefined = this._peek();
5362

5463
// Reached end of input yet?
5564
if (c === undefined) {
@@ -58,6 +67,7 @@ export class Tokenizer {
5867

5968
// Is it a sequence of whitespace?
6069
if (/[ \t]/.test(c)) {
70+
this._get();
6171

6272
while (Tokenizer._isSpace(this._peek())) {
6373
this._get();
@@ -68,15 +78,39 @@ export class Tokenizer {
6878

6979
// Is it a newline?
7080
if (c === '\r') {
81+
this._get();
7182
if (this._peek() === '\n') {
7283
this._get();
7384
}
7485
return new Token(TokenKind.NewLine, input.getNewRange(startIndex, this._currentIndex));
7586
} else if (c === '\n') {
87+
this._get();
7688
return new Token(TokenKind.NewLine, input.getNewRange(startIndex, this._currentIndex));
7789
}
7890

91+
// Is it a text token?
92+
if (wordCharacterOrBackslashRegExp.test(c)) {
93+
let text: string = '';
94+
while (wordCharacterOrBackslashRegExp.test(c)) {
95+
if (c === '\\') {
96+
this._get(); // discard the backslash
97+
if (this._peek() === undefined) {
98+
throw new ParseError('Backslash encountered at end of stream',
99+
input.getNewRange(this._currentIndex, this._currentIndex+1));
100+
}
101+
text += this._get();
102+
} else {
103+
text += this._get();
104+
}
105+
106+
c = this._peek();
107+
}
108+
109+
return new Token(TokenKind.Text, input.getNewRange(startIndex, this._currentIndex), text);
110+
}
111+
79112
// Otherwise treat it as an "other" character
113+
this._get();
80114
return new Token(TokenKind.Other, input.getNewRange(startIndex, this._currentIndex));
81115
}
82116

libraries/rushell/src/test/__snapshots__/Tokenizer.test.ts.snap

Lines changed: 8 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,15 @@ Object {
2121

2222
exports[`white space tokens 1`] = `
2323
Object {
24-
"input": " abc [r][n]def [n] ghi[n][r] ",
24+
"input": " [t] abc [r][n]def [n] ghi[n][r] ",
2525
"tokens": Array [
2626
Array [
2727
"Spaces",
28-
" ",
29-
],
30-
Array [
31-
"Other",
32-
"a",
28+
" [t] ",
3329
],
3430
Array [
35-
"Other",
36-
"b",
37-
],
38-
Array [
39-
"Other",
40-
"c",
31+
"Text",
32+
"abc",
4133
],
4234
Array [
4335
"Spaces",
@@ -48,16 +40,8 @@ Object {
4840
"[r][n]",
4941
],
5042
Array [
51-
"Other",
52-
"d",
53-
],
54-
Array [
55-
"Other",
56-
"e",
57-
],
58-
Array [
59-
"Other",
60-
"f",
43+
"Text",
44+
"def",
6145
],
6246
Array [
6347
"Spaces",
@@ -72,16 +56,8 @@ Object {
7256
" ",
7357
],
7458
Array [
75-
"Other",
76-
"g",
77-
],
78-
Array [
79-
"Other",
80-
"h",
81-
],
82-
Array [
83-
"Other",
84-
"i",
59+
"Text",
60+
"ghi",
8561
],
8662
Array [
8763
"NewLine",

0 commit comments

Comments
 (0)