Skip to content

Commit 820573d

Browse files
committed
Completed tokenizer
1 parent 9c8064c commit 820573d

File tree

3 files changed

+180
-1
lines changed

3 files changed

+180
-1
lines changed

libraries/rushell/src/Tokenizer.ts

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ export enum TokenKind {
3333
export class Token {
3434
public readonly kind: TokenKind;
3535
public readonly range: TextRange;
36+
37+
/**
38+
* The extracted content, which depends on the type:
39+
*
40+
* Text: The unescaped content.
41+
* DoubleQuotedText: The unescaped content inside the quotes.
42+
* DollarVariable: The variable name without the "$".
43+
*/
3644
public readonly text: string;
3745

3846
public constructor(kind: TokenKind, range: TextRange, text?: string) {
@@ -105,6 +113,49 @@ export class Tokenizer {
105113
return new Token(TokenKind.NewLine, input.getNewRange(startIndex, this._currentIndex));
106114
}
107115

116+
// Is it a double-quoted string?
117+
if (firstChar === '"') {
118+
this._get(); // consume the opening quote
119+
120+
let text: string = '';
121+
let c: string | undefined = this._peek();
122+
while (c !== '"') {
123+
if (c === undefined) {
124+
throw new ParseError('The double-quoted string is missing the ending quote',
125+
input.getNewRange(startIndex, this._currentIndex));
126+
}
127+
if (c === '\r' || c === '\n') {
128+
throw new ParseError('Newlines are not supported inside strings',
129+
input.getNewRange(this._currentIndex, this._currentIndex + 1));
130+
}
131+
132+
// NOTE: POSIX says that backslash acts as an escape character inside a double-quoted string
133+
// ONLY if followed by certain other characters. For example, yes for "a\$" but no for "a\t".
134+
// Dash, by contrast, says yes for "a\t" but no for "a\q", while Bash says yes for "a\t".
135+
// This goes against Rushell's goal of being intuitive: Nobody should have to memorize a list
136+
// of alphabet letters that cannot be escaped. So we just say that backslash is *always* an
137+
// escape character inside a double-quoted string.
138+
//
139+
// NOTE: Dash interprets "\t" as a tab character, but Bash does not.
140+
if (c === '\\') {
141+
this._get(); // discard the backslash
142+
if (this._peek() === undefined) {
143+
throw new ParseError('A backslash must be followed by another character',
144+
input.getNewRange(this._currentIndex, this._currentIndex + 1));
145+
}
146+
// Add the escaped character
147+
text += this._get();
148+
} else {
149+
text += this._get();
150+
}
151+
152+
c = this._peek();
153+
}
154+
this._get(); // consume the closing quote
155+
156+
return new Token(TokenKind.DoubleQuotedText, input.getNewRange(startIndex, this._currentIndex), text);
157+
}
158+
108159
// Is it a text token?
109160
if (textCharacterRegExp.test(firstChar)) {
110161
let text: string = '';
@@ -113,9 +164,10 @@ export class Tokenizer {
113164
if (c === '\\') {
114165
this._get(); // discard the backslash
115166
if (this._peek() === undefined) {
116-
throw new ParseError('Backslash encountered at end of stream',
167+
throw new ParseError('A backslash must be followed by another character',
117168
input.getNewRange(this._currentIndex, this._currentIndex + 1));
118169
}
170+
// Add the escaped character
119171
text += this._get();
120172
} else {
121173
text += this._get();

libraries/rushell/src/test/Tokenizer.test.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ test('01: white space tokens', () => {
3434

3535
test('02: text with escapes', () => {
3636
matchSnapshot(' ab+56\\>qrst(abc\\))');
37+
expect(() => tokenize('Unterminated: \\')).toThrowError();
3738
});
3839

3940
test('03: The && operator', () => {
@@ -49,3 +50,13 @@ test('04: dollar variables', () => {
4950
expect(() => tokenize('$')).toThrowError();
5051
expect(() => tokenize('${abc}')).toThrowError();
5152
});
53+
54+
test('05: double-quoted strings', () => {
55+
matchSnapshot('what "is" is');
56+
matchSnapshot('what"is"is');
57+
matchSnapshot('what"is\\""is');
58+
matchSnapshot('no C-style escapes: "\\t\\r\\n"');
59+
expect(() => tokenize('Unterminated: "')).toThrowError();
60+
expect(() => tokenize('Unterminated: "abc')).toThrowError();
61+
expect(() => tokenize('Unterminated: "abc\\')).toThrowError();
62+
});

libraries/rushell/src/test/__snapshots__/Tokenizer.test.ts.snap

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,119 @@ Object {
218218
],
219219
}
220220
`;
221+
222+
exports[`05: double-quoted strings 1`] = `
223+
Object {
224+
"input": "what \\"is\\" is",
225+
"tokens": Array [
226+
Array [
227+
"Text",
228+
"what",
229+
],
230+
Array [
231+
"Spaces",
232+
" ",
233+
],
234+
Array [
235+
"DoubleQuotedText",
236+
"is",
237+
],
238+
Array [
239+
"Spaces",
240+
" ",
241+
],
242+
Array [
243+
"Text",
244+
"is",
245+
],
246+
],
247+
}
248+
`;
249+
250+
exports[`05: double-quoted strings 2`] = `
251+
Object {
252+
"input": "what\\"is\\"is",
253+
"tokens": Array [
254+
Array [
255+
"Text",
256+
"what",
257+
],
258+
Array [
259+
"DoubleQuotedText",
260+
"is",
261+
],
262+
Array [
263+
"Text",
264+
"is",
265+
],
266+
],
267+
}
268+
`;
269+
270+
exports[`05: double-quoted strings 3`] = `
271+
Object {
272+
"input": "what\\"is[b]\\"\\"is",
273+
"tokens": Array [
274+
Array [
275+
"Text",
276+
"what",
277+
],
278+
Array [
279+
"DoubleQuotedText",
280+
"is\\"",
281+
],
282+
Array [
283+
"Text",
284+
"is",
285+
],
286+
],
287+
}
288+
`;
289+
290+
exports[`05: double-quoted strings 4`] = `
291+
Object {
292+
"input": "no C-style escapes: \\"[b]t[b]r[b]n\\"",
293+
"tokens": Array [
294+
Array [
295+
"Text",
296+
"no",
297+
],
298+
Array [
299+
"Spaces",
300+
" ",
301+
],
302+
Array [
303+
"Text",
304+
"C",
305+
],
306+
Array [
307+
"Other",
308+
"-",
309+
],
310+
Array [
311+
"Text",
312+
"style",
313+
],
314+
Array [
315+
"Spaces",
316+
" ",
317+
],
318+
Array [
319+
"Text",
320+
"escapes",
321+
],
322+
Array [
323+
"Other",
324+
":",
325+
],
326+
Array [
327+
"Spaces",
328+
" ",
329+
],
330+
Array [
331+
"DoubleQuotedText",
332+
"trn",
333+
],
334+
],
335+
}
336+
`;

0 commit comments

Comments
 (0)