Initial prototype of a Tokenizer that can process whitespace

pgonzal · pgonzal · commit 5999f6519a66 · 2018-06-21T17:22:48.000-07:00
diff --git a/libraries/rushell/src/Tokenizer.ts b/libraries/rushell/src/Tokenizer.ts
@@ -0,0 +1,110 @@
+// Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT license.
+// See LICENSE in the project root for license information.
+
+import { TextRange } from './TextRange';
+
+export enum TokenKind {
+  /** A run of one or more consecutive space and/or tab characters */
+  Spaces,
+  /** A single newline sequence: CRLF, a lone CR, or a lone LF */
+  NewLine,
+  /** Any single character that is neither a space, a tab, nor part of a newline */
+  Other,
+  /** Signals that no input remains; getTokens() stops here and does not return this token */
+  EndOfInput
+}
+
+/** A classified span of input text, as produced by the Tokenizer. */
+export class Token {
+  /**
+   * @param kind - the classification of this token
+   * @param range - the span of input text that this token covers
+   */
+  public constructor(public readonly kind: TokenKind, public readonly range: TextRange) {}
+
+  /** Delegates to range.toString() to render this token as text. */
+  public toString(): string {
+    return this.range.toString();
+  }
+}
+
+/** Prototype tokenizer: walks a TextRange and emits Spaces, NewLine and Other tokens. */
+export class Tokenizer {
+  /** The text being tokenized; a string argument is wrapped via TextRange.fromString(). */
+  public readonly input: TextRange;
+  /** Index into input.buffer of the next character to be read. */
+  private _currentIndex: number;
+
+  public constructor(input: TextRange | string) {
+    this.input = typeof input === 'string' ? TextRange.fromString(input) : input;
+    this._currentIndex = this.input.pos;
+  }
+
+  /** The index of the next character that getToken() will read. */
+  public get currentIndex(): number {
+    return this._currentIndex;
+  }
+
+  /** Reads and returns the next token, advancing past it; returns EndOfInput once exhausted. */
+  public getToken(): Token {
+    const input: TextRange = this.input;
+    const startIndex: number = this._currentIndex;
+    const c: string | undefined = this._get();
+
+    // Reached end of input yet?
+    if (c === undefined) {
+      return new Token(TokenKind.EndOfInput, TextRange.empty);
+    }
+
+    // A run of spaces/tabs becomes a single Spaces token
+    if (Tokenizer._isSpace(c)) {
+      while (Tokenizer._isSpace(this._peek())) {
+        this._get();
+      }
+      return new Token(TokenKind.Spaces, input.getNewRange(startIndex, this._currentIndex));
+    }
+
+    // A newline is CRLF, a lone CR, or a lone LF; only CR can absorb a following LF
+    if (c === '\r' || c === '\n') {
+      if (c === '\r' && this._peek() === '\n') {
+        this._get();
+      }
+      return new Token(TokenKind.NewLine, input.getNewRange(startIndex, this._currentIndex));
+    }
+
+    // Otherwise treat it as an "other" character
+    return new Token(TokenKind.Other, input.getNewRange(startIndex, this._currentIndex));
+  }
+
+  /** Reads every remaining token; the terminating EndOfInput token is not included. */
+  public getTokens(): Token[] {
+    const tokens: Token[] = [];
+    let token: Token = this.getToken();
+    while (token.kind !== TokenKind.EndOfInput) {
+      tokens.push(token);
+      token = this.getToken();
+    }
+    return tokens;
+  }
+
+  /** Consumes and returns the next character, or undefined at the end of the input. */
+  private _get(): string | undefined {
+    if (this._currentIndex >= this.input.end) {
+      return undefined;
+    }
+    return this.input.buffer[this._currentIndex++];
+  }
+
+  /** Returns the next character without consuming it, or undefined at the end of the input. */
+  private _peek(): string | undefined {
+    if (this._currentIndex >= this.input.end) {
+      return undefined;
+    }
+    return this.input.buffer[this._currentIndex];
+  }
+
+  /** True when c is a space or a tab; undefined (end of input) is not a space. */
+  private static _isSpace(c: string | undefined): boolean {
+    return c === ' ' || c === '\t';
+  }
+}
diff --git a/libraries/rushell/src/test/Tokenizer.test.ts b/libraries/rushell/src/test/Tokenizer.test.ts
@@ -0,0 +1,27 @@
+// Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT license.
+// See LICENSE in the project root for license information.
+
+import { Tokenizer, TokenKind } from '../Tokenizer';
+
+// Makes LF, CR and tab characters visible as "[n]", "[r]" and "[t]" so snapshots are readable.
+function escape(s: string): string {
+  const withVisibleNewlines: string = s.replace(/\n/g, '[n]').replace(/\r/g, '[r]');
+  return withVisibleNewlines.replace(/\t/g, '[t]');
+}
+
+// Snapshots the escaped input text alongside a [kind name, escaped text] pair for every token.
+function matchSnapshot(input: string): void {
+  const tokenizer: Tokenizer = new Tokenizer(input);
+  const inputText: string = escape(tokenizer.input.toString());
+  const tokens: string[][] = tokenizer.getTokens().map(x => [TokenKind[x.kind], escape(x.toString())]);
+  expect({ input: inputText, tokens: tokens }).toMatchSnapshot();
+}
+
+test('empty inputs', () => { // inputs that contain no visible characters
+  matchSnapshot(''); // recorded snapshot: an empty token array
+  matchSnapshot('\r\n'); // recorded snapshot: one NewLine token spanning both characters
+});
+
+test('white space tokens', () => { // space runs and newlines interleaved with "Other" characters
+  matchSnapshot('  abc   \r\ndef  \n  ghi\n\r  '); // CRLF, LF, and LF+CR (two NewLine tokens)
+});
diff --git a/libraries/rushell/src/test/__snapshots__/Tokenizer.test.ts.snap b/libraries/rushell/src/test/__snapshots__/Tokenizer.test.ts.snap
@@ -0,0 +1,100 @@
+// Jest Snapshot v1, https://goo.gl/fbAQLP
+
+exports[`empty inputs 1`] = `
+Object {
+  "input": "",
+  "tokens": Array [],
+}
+`;
+
+exports[`empty inputs 2`] = `
+Object {
+  "input": "[r][n]",
+  "tokens": Array [
+    Array [
+      "NewLine",
+      "[r][n]",
+    ],
+  ],
+}
+`;
+
+exports[`white space tokens 1`] = `
+Object {
+  "input": "  abc   [r][n]def  [n]  ghi[n][r]  ",
+  "tokens": Array [
+    Array [
+      "Spaces",
+      "  ",
+    ],
+    Array [
+      "Other",
+      "a",
+    ],
+    Array [
+      "Other",
+      "b",
+    ],
+    Array [
+      "Other",
+      "c",
+    ],
+    Array [
+      "Spaces",
+      "   ",
+    ],
+    Array [
+      "NewLine",
+      "[r][n]",
+    ],
+    Array [
+      "Other",
+      "d",
+    ],
+    Array [
+      "Other",
+      "e",
+    ],
+    Array [
+      "Other",
+      "f",
+    ],
+    Array [
+      "Spaces",
+      "  ",
+    ],
+    Array [
+      "NewLine",
+      "[n]",
+    ],
+    Array [
+      "Spaces",
+      "  ",
+    ],
+    Array [
+      "Other",
+      "g",
+    ],
+    Array [
+      "Other",
+      "h",
+    ],
+    Array [
+      "Other",
+      "i",
+    ],
+    Array [
+      "NewLine",
+      "[n]",
+    ],
+    Array [
+      "NewLine",
+      "[r]",
+    ],
+    Array [
+      "Spaces",
+      "  ",
+    ],
+  ],
+}
+`;