Add StringSHA1

alexdima · alexdima · commit 0f8a68715893 · 2020-03-11T09:46:43.000+01:00
diff --git a/src/vs/base/common/hash.ts b/src/vs/base/common/hash.ts
@@ -3,6 +3,8 @@
  *  Licensed under the MIT License. See License.txt in the project root for license information.
  *--------------------------------------------------------------------------------------------*/
 
+import * as strings from 'vs/base/common/strings';
+
 /**
  * Return a hash value for an object.
  */
@@ -70,3 +72,235 @@ export class Hasher {
 		return this._value;
 	}
 }
+
+const enum SHA1Constant {
+	BLOCK_SIZE = 64, // 512 / 8
+	UNICODE_REPLACEMENT = 0xFFFD,
+}
+
+function leftRotate(value: number, bits: number, totalBits: number = 32): number {
+	// delta + bits = totalBits
+	const delta = totalBits - bits;
+
+	// All ones, except for `delta` zeros aligned to the right
+	const mask = ~((1 << delta) - 1);
+
+	// Join (value left-shifted `bits` bits) with (masked value right-shifted `delta` bits)
+	return ((value << bits) | ((mask & value) >>> delta)) >>> 0;
+}
+
+function fill(dest: Uint8Array, index: number = 0, count: number = dest.byteLength, value: number = 0): void {
+	for (let i = 0; i < count; i++) {
+		dest[index + i] = value;
+	}
+}
+
+function leftPad(value: string, length: number, char: string = '0'): string {
+	while (value.length < length) {
+		value = char + value;
+	}
+	return value;
+}
+
+function toHexString(value: number, bitsize: number = 32): string {
+	return leftPad((value >>> 0).toString(16), bitsize / 4);
+}
+
+/**
+ * A SHA1 implementation that works with strings (hashing their UTF-8 encoding) and does not allocate while updating.
+ */
+export class StringSHA1 {
+	private static _bigBlock32 = new DataView(new ArrayBuffer(320)); // 80 * 4 = 320
+
+	private _h0 = 0x67452301;
+	private _h1 = 0xEFCDAB89;
+	private _h2 = 0x98BADCFE;
+	private _h3 = 0x10325476;
+	private _h4 = 0xC3D2E1F0;
+
+	private readonly _buff: Uint8Array;
+	private readonly _buffDV: DataView;
+	private _buffLen: number;
+	private _totalLen: number;
+	private _leftoverHighSurrogate: number;
+	private _finished: boolean;
+
+	constructor() {
+		this._buff = new Uint8Array(SHA1Constant.BLOCK_SIZE + 3 /* to fit any utf-8 */);
+		this._buffDV = new DataView(this._buff.buffer);
+		this._buffLen = 0;
+		this._totalLen = 0;
+		this._leftoverHighSurrogate = 0;
+		this._finished = false;
+	}
+
+	public update(str: string): void {
+		const strLen = str.length;
+		if (strLen === 0) {
+			return;
+		}
+
+		const buff = this._buff;
+		let buffLen = this._buffLen;
+		let leftoverHighSurrogate = this._leftoverHighSurrogate;
+		let charCode: number;
+		let offset: number;
+
+		if (leftoverHighSurrogate !== 0) {
+			charCode = leftoverHighSurrogate;
+			offset = -1;
+			leftoverHighSurrogate = 0;
+		} else {
+			charCode = str.charCodeAt(0);
+			offset = 0;
+		}
+
+		while (true) {
+			let codePoint = charCode;
+			if (strings.isHighSurrogate(charCode)) {
+				if (offset + 1 < strLen) {
+					const nextCharCode = str.charCodeAt(offset + 1);
+					if (strings.isLowSurrogate(nextCharCode)) {
+						offset++;
+						codePoint = strings.computeCodePoint(charCode, nextCharCode);
+					} else {
+						// illegal => unicode replacement character
+						codePoint = SHA1Constant.UNICODE_REPLACEMENT;
+					}
+				} else {
+					// last character is a high surrogate: keep it until the next update() call supplies the following character
+					leftoverHighSurrogate = charCode;
+					break;
+				}
+			} else if (strings.isLowSurrogate(charCode)) {
+				// illegal => unicode replacement character
+				codePoint = SHA1Constant.UNICODE_REPLACEMENT;
+			}
+
+			buffLen = this._push(buff, buffLen, codePoint);
+			offset++;
+			if (offset < strLen) {
+				charCode = str.charCodeAt(offset);
+			} else {
+				break;
+			}
+		}
+
+		this._buffLen = buffLen;
+		this._leftoverHighSurrogate = leftoverHighSurrogate;
+	}
+
+	private _push(buff: Uint8Array, buffLen: number, codePoint: number): number {
+		if (codePoint < 0x0080) {
+			buff[buffLen++] = codePoint;
+		} else if (codePoint < 0x0800) {
+			buff[buffLen++] = 0b11000000 | ((codePoint & 0b00000000000000000000011111000000) >>> 6);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000000000000000111111) >>> 0);
+		} else if (codePoint < 0x10000) {
+			buff[buffLen++] = 0b11100000 | ((codePoint & 0b00000000000000001111000000000000) >>> 12);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000000000111111000000) >>> 6);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000000000000000111111) >>> 0);
+		} else {
+			buff[buffLen++] = 0b11110000 | ((codePoint & 0b00000000000111000000000000000000) >>> 18);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000111111000000000000) >>> 12);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000000000111111000000) >>> 6);
+			buff[buffLen++] = 0b10000000 | ((codePoint & 0b00000000000000000000000000111111) >>> 0);
+		}
+
+		if (buffLen >= SHA1Constant.BLOCK_SIZE) {
+			this._step();
+			buffLen -= SHA1Constant.BLOCK_SIZE;
+			this._totalLen += SHA1Constant.BLOCK_SIZE;
+			// move the (up to 3) bytes that overflowed past the block end to the front, since a UTF-8 sequence can straddle the boundary
+			buff[0] = buff[SHA1Constant.BLOCK_SIZE + 0];
+			buff[1] = buff[SHA1Constant.BLOCK_SIZE + 1];
+			buff[2] = buff[SHA1Constant.BLOCK_SIZE + 2];
+		}
+
+		return buffLen;
+	}
+
+	public digest(): string {
+		if (!this._finished) {
+			this._finished = true;
+			if (this._leftoverHighSurrogate) {
+				// illegal => unicode replacement character
+				this._leftoverHighSurrogate = 0;
+				this._buffLen = this._push(this._buff, this._buffLen, SHA1Constant.UNICODE_REPLACEMENT);
+			}
+			this._totalLen += this._buffLen;
+			this._wrapUp();
+		}
+
+		return toHexString(this._h0) + toHexString(this._h1) + toHexString(this._h2) + toHexString(this._h3) + toHexString(this._h4);
+	}
+
+	private _wrapUp(): void {
+		this._buff[this._buffLen++] = 0x80;
+		fill(this._buff, this._buffLen);
+
+		if (this._buffLen > 56) {
+			this._step();
+			fill(this._buff);
+		}
+
+		// this will fit because the mantissa can cover up to 52 bits
+		const ml = 8 * this._totalLen;
+
+		this._buffDV.setUint32(56, Math.floor(ml / 4294967296), false);
+		this._buffDV.setUint32(60, ml % 4294967296, false);
+
+		this._step();
+	}
+
+	private _step(): void {
+		const bigBlock32 = StringSHA1._bigBlock32;
+		const data = this._buffDV;
+
+		for (let j = 0; j < 64 /* 16*4 */; j += 4) {
+			bigBlock32.setUint32(j, data.getUint32(j, false), false);
+		}
+
+		for (let j = 64; j < 320 /* 80*4 */; j += 4) {
+			bigBlock32.setUint32(j, leftRotate((bigBlock32.getUint32(j - 12, false) ^ bigBlock32.getUint32(j - 32, false) ^ bigBlock32.getUint32(j - 56, false) ^ bigBlock32.getUint32(j - 64, false)), 1), false);
+		}
+
+		let a = this._h0;
+		let b = this._h1;
+		let c = this._h2;
+		let d = this._h3;
+		let e = this._h4;
+
+		let f: number, k: number;
+		let temp: number;
+
+		for (let j = 0; j < 80; j++) {
+			if (j < 20) {
+				f = (b & c) | ((~b) & d);
+				k = 0x5A827999;
+			} else if (j < 40) {
+				f = b ^ c ^ d;
+				k = 0x6ED9EBA1;
+			} else if (j < 60) {
+				f = (b & c) | (b & d) | (c & d);
+				k = 0x8F1BBCDC;
+			} else {
+				f = b ^ c ^ d;
+				k = 0xCA62C1D6;
+			}
+
+			temp = (leftRotate(a, 5) + f + e + k + bigBlock32.getUint32(j * 4, false)) & 0xffffffff;
+			e = d;
+			d = c;
+			c = leftRotate(b, 30);
+			b = a;
+			a = temp;
+		}
+
+		this._h0 = (this._h0 + a) & 0xffffffff;
+		this._h1 = (this._h1 + b) & 0xffffffff;
+		this._h2 = (this._h2 + c) & 0xffffffff;
+		this._h3 = (this._h3 + d) & 0xffffffff;
+		this._h4 = (this._h4 + e) & 0xffffffff;
+	}
+}
diff --git a/src/vs/base/common/strings.ts b/src/vs/base/common/strings.ts
@@ -428,29 +428,27 @@ export function commonSuffixLength(a: string, b: string): number {
 	return len;
 }
 
-// --- unicode
-// http://en.wikipedia.org/wiki/Surrogate_pair
-// Returns the code point starting at a specified index in a string
-// Code points U+0000 to U+D7FF and U+E000 to U+FFFF are represented on a single character
-// Code points U+10000 to U+10FFFF are represented on two consecutive characters
-//export function getUnicodePoint(str:string, index:number, len:number):number {
-//	const chrCode = str.charCodeAt(index);
-//	if (0xD800 <= chrCode && chrCode <= 0xDBFF && index + 1 < len) {
-//		const nextChrCode = str.charCodeAt(index + 1);
-//		if (0xDC00 <= nextChrCode && nextChrCode <= 0xDFFF) {
-//			return (chrCode - 0xD800) << 10 + (nextChrCode - 0xDC00) + 0x10000;
-//		}
-//	}
-//	return chrCode;
-//}
+/**
+ * See http://en.wikipedia.org/wiki/Surrogate_pair
+ */
 export function isHighSurrogate(charCode: number): boolean {
 	return (0xD800 <= charCode && charCode <= 0xDBFF);
 }
 
+/**
+ * See http://en.wikipedia.org/wiki/Surrogate_pair
+ */
 export function isLowSurrogate(charCode: number): boolean {
 	return (0xDC00 <= charCode && charCode <= 0xDFFF);
 }
 
+/**
+ * See http://en.wikipedia.org/wiki/Surrogate_pair
+ */
+export function computeCodePoint(highSurrogate: number, lowSurrogate: number): number {
+	return ((highSurrogate - 0xD800) << 10) + (lowSurrogate - 0xDC00) + 0x10000;
+}
+
 /**
  * get the code point that begins at offset `offset`
  */
@@ -459,7 +457,7 @@ export function getNextCodePoint(str: string, len: number, offset: number): numb
 	if (isHighSurrogate(charCode) && offset + 1 < len) {
 		const nextCharCode = str.charCodeAt(offset + 1);
 		if (isLowSurrogate(nextCharCode)) {
-			return ((charCode - 0xD800) << 10) + (nextCharCode - 0xDC00) + 0x10000;
+			return computeCodePoint(charCode, nextCharCode);
 		}
 	}
 	return charCode;
@@ -473,7 +471,7 @@ function getPrevCodePoint(str: string, offset: number): number {
 	if (isLowSurrogate(charCode) && offset > 1) {
 		const prevCharCode = str.charCodeAt(offset - 2);
 		if (isHighSurrogate(prevCharCode)) {
-			return ((prevCharCode - 0xD800) << 10) + (charCode - 0xDC00) + 0x10000;
+			return computeCodePoint(prevCharCode, charCode);
 		}
 	}
 	return charCode;
diff --git a/src/vs/base/test/common/hash.test.ts b/src/vs/base/test/common/hash.test.ts
@@ -3,7 +3,7 @@
  *  Licensed under the MIT License. See License.txt in the project root for license information.
  *--------------------------------------------------------------------------------------------*/
 import * as assert from 'assert';
-import { hash } from 'vs/base/common/hash';
+import { hash, StringSHA1 } from 'vs/base/common/hash';
 
 suite('Hash', () => {
 	test('string', () => {
@@ -53,4 +53,28 @@ suite('Hash', () => {
 		assert.notEqual(a, b);
 	});
 
+	function checkSHA1(strings: string[], expected: string) {
+		const hash = new StringSHA1();
+		for (const str of strings) {
+			hash.update(str);
+		}
+		const actual = hash.digest();
+		assert.equal(actual, expected);
+	}
+
+	test('sha1-1', () => {
+		checkSHA1(['\udd56'], '9bdb77276c1852e1fb067820472812fcf6084024');
+	});
+
+	test('sha1-2', () => {
+		checkSHA1(['\udb52'], '9bdb77276c1852e1fb067820472812fcf6084024');
+	});
+
+	test('sha1-3', () => {
+		checkSHA1(['\uda02ꑍ'], '9b483a471f22fe7e09d83f221871a987244bbd3f');
+	});
+
+	test('sha1-4', () => {
+		checkSHA1(['hello'], 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d');
+	});
 });