Skip to content

Commit 5999f65

Browse files
committed
Initial prototype of a Tokenizer that can process whitespace
1 parent e6c0202 commit 5999f65

3 files changed

Lines changed: 237 additions & 0 deletions

File tree

libraries/rushell/src/Tokenizer.ts

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT license.
2+
// See LICENSE in the project root for license information.
3+
4+
import { TextRange } from './TextRange';
5+
6+
/**
 * The categories of token that can be attached to a `Token`.
 *
 * The members are numbered explicitly; the values are the same ones the compiler
 * would assign implicitly (0, 1, 2, 3 in declaration order).
 */
export enum TokenKind {
  /** One or more spaces/tabs */
  Spaces = 0,

  /** A single newline sequence such as CRLF or LF */
  NewLine = 1,

  /** An unrecognized character */
  Other = 2,

  /** The end of the input string */
  EndOfInput = 3
}
16+
17+
/**
 * An immutable pairing of a token classification with the range of text it covers.
 */
export class Token {
  /** What sort of token this is. */
  public readonly kind: TokenKind;

  /** The range of text that this token was built from. */
  public readonly range: TextRange;

  /**
   * @param kind - the classification to store on the token
   * @param range - the text range to store on the token
   */
  public constructor(kind: TokenKind, range: TextRange) {
    this.kind = kind;
    this.range = range;
  }

  /**
   * Returns whatever `range.toString()` returns.
   */
  public toString(): string {
    const { range } = this;
    return range.toString();
  }
}
30+
31+
/**
 * Prototype tokenizer that walks over an input and produces `Token` objects.
 *
 * Only three things are recognized so far: a run of spaces/tabs, a newline sequence
 * (CR, LF, or CRLF), and the end of the input.  Every other character is returned
 * on its own as a `TokenKind.Other` token.
 */
export class Tokenizer {
  /** The text being tokenized. */
  public readonly input: TextRange;

  // Index into input.buffer of the next character that _get() will consume
  private _currentIndex: number;

  /**
   * @param input - the text to tokenize; a plain string is converted using
   * `TextRange.fromString()`.  Scanning starts at `input.pos`.
   */
  public constructor(input: TextRange | string) {
    this.input = typeof input === 'string' ? TextRange.fromString(input) : input;
    this._currentIndex = this.input.pos;
  }

  /**
   * The index of the next character that will be read from the input buffer.
   */
  public get currentIndex(): number {
    return this._currentIndex;
  }

  /**
   * Consumes and returns the next token.
   *
   * @returns a token whose range spans the characters that were consumed.  Once the
   * input is exhausted, every call returns a `TokenKind.EndOfInput` token whose range
   * is `TextRange.empty` (not a range positioned at the end of the input).
   */
  public getToken(): Token {
    const input: TextRange = this.input;

    const startIndex: number = this._currentIndex;
    const c: string | undefined = this._get();

    // Reached end of input yet?
    if (c === undefined) {
      return new Token(TokenKind.EndOfInput, TextRange.empty);
    }

    // Anything that is not recognized below is reported as an "other" character
    let kind: TokenKind = TokenKind.Other;

    if (Tokenizer._isSpace(c)) {
      // A sequence of whitespace: swallow the rest of the run so that it becomes one token.
      // The same predicate classifies the first character and the following ones.
      while (Tokenizer._isSpace(this._peek())) {
        this._get();
      }
      kind = TokenKind.Spaces;
    } else if (c === '\r') {
      // CR optionally followed by LF is a single newline token
      if (this._peek() === '\n') {
        this._get();
      }
      kind = TokenKind.NewLine;
    } else if (c === '\n') {
      kind = TokenKind.NewLine;
    }

    return new Token(kind, input.getNewRange(startIndex, this._currentIndex));
  }

  /**
   * Reads tokens until the end of the input is reached.
   *
   * @returns the tokens in the order they were read; the terminating
   * `TokenKind.EndOfInput` token is not included.
   */
  public getTokens(): Token[] {
    const tokens: Token[] = [];
    let token: Token = this.getToken();
    while (token.kind !== TokenKind.EndOfInput) {
      tokens.push(token);
      token = this.getToken();
    }
    return tokens;
  }

  // Returns the character at the current index and advances past it,
  // or returns undefined (without advancing) when the index has reached input.end.
  private _get(): string | undefined {
    if (this._currentIndex >= this.input.end) {
      return undefined;
    }
    return this.input.buffer[this._currentIndex++];
  }

  // Returns the character at the current index without advancing,
  // or undefined when the index has reached input.end.
  private _peek(): string | undefined {
    if (this._currentIndex >= this.input.end) {
      return undefined;
    }
    return this.input.buffer[this._currentIndex];
  }

  // True only for a space or a tab; undefined (end of input) is not a space.
  private static _isSpace(c: string | undefined): boolean {
    return c === ' ' || c === '\t';
  }
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT license.
2+
// See LICENSE in the project root for license information.
3+
4+
import { Tokenizer, TokenKind } from '../Tokenizer';
5+
6+
// Makes newline, carriage return and tab characters visible in snapshot output
// by rewriting them as "[n]", "[r]" and "[t]" respectively.
function escape(s: string): string {
  // Applied in this order: LF first, then CR, then tab
  const substitutions: ReadonlyArray<[RegExp, string]> = [
    [/\n/g, '[n]'],
    [/\r/g, '[r]'],
    [/\t/g, '[t]']
  ];

  return substitutions.reduce(
    (text: string, [pattern, label]: [RegExp, string]) => text.replace(pattern, label),
    s
  );
}
11+
12+
// Tokenizes the given string and snapshot-tests the result.  The snapshot records the
// escaped input text plus, for each token, a [kind name, escaped token text] pair.
function matchSnapshot(input: string): void {
  const tokenizer = new Tokenizer(input);

  const escapedInput: string = escape(tokenizer.input.toString());
  const tokenSummaries: string[][] = tokenizer.getTokens().map((token) => {
    // TokenKind[...] maps the numeric enum value back to its member name
    return [TokenKind[token.kind], escape(token.toString())];
  });

  expect({
    input: escapedInput,
    tokens: tokenSummaries
  }).toMatchSnapshot();
}
19+
20+
// An empty string, and an input that consists of nothing but one CRLF sequence.
// The order matters because Jest numbers the snapshots by call order within the test.
test('empty inputs', () => {
  for (const input of ['', '\r\n']) {
    matchSnapshot(input);
  }
});
24+
25+
// Mixes spaces with CRLF, LF, and a lone CR (the "\n\r" pair near the end
// is an LF followed by a separate CR, not a CRLF).
test('white space tokens', () => {
  const input: string = ' abc \r\ndef \n ghi\n\r ';
  matchSnapshot(input);
});
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Jest Snapshot v1, https://goo.gl/fbAQLP
2+
3+
exports[`empty inputs 1`] = `
4+
Object {
5+
"input": "",
6+
"tokens": Array [],
7+
}
8+
`;
9+
10+
exports[`empty inputs 2`] = `
11+
Object {
12+
"input": "[r][n]",
13+
"tokens": Array [
14+
Array [
15+
"NewLine",
16+
"[r][n]",
17+
],
18+
],
19+
}
20+
`;
21+
22+
exports[`white space tokens 1`] = `
23+
Object {
24+
"input": " abc [r][n]def [n] ghi[n][r] ",
25+
"tokens": Array [
26+
Array [
27+
"Spaces",
28+
" ",
29+
],
30+
Array [
31+
"Other",
32+
"a",
33+
],
34+
Array [
35+
"Other",
36+
"b",
37+
],
38+
Array [
39+
"Other",
40+
"c",
41+
],
42+
Array [
43+
"Spaces",
44+
" ",
45+
],
46+
Array [
47+
"NewLine",
48+
"[r][n]",
49+
],
50+
Array [
51+
"Other",
52+
"d",
53+
],
54+
Array [
55+
"Other",
56+
"e",
57+
],
58+
Array [
59+
"Other",
60+
"f",
61+
],
62+
Array [
63+
"Spaces",
64+
" ",
65+
],
66+
Array [
67+
"NewLine",
68+
"[n]",
69+
],
70+
Array [
71+
"Spaces",
72+
" ",
73+
],
74+
Array [
75+
"Other",
76+
"g",
77+
],
78+
Array [
79+
"Other",
80+
"h",
81+
],
82+
Array [
83+
"Other",
84+
"i",
85+
],
86+
Array [
87+
"NewLine",
88+
"[n]",
89+
],
90+
Array [
91+
"NewLine",
92+
"[r]",
93+
],
94+
Array [
95+
"Spaces",
96+
" ",
97+
],
98+
],
99+
}
100+
`;

0 commit comments

Comments
 (0)