Skip to content

Commit c928a44

Browse files
committed
Fixes microsoft#15622: Better handling of Unicode combining marks
1 parent 7f5f231 commit c928a44

11 files changed

Lines changed: 456 additions & 195 deletions

File tree

src/vs/base/common/strings.ts

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*--------------------------------------------------------------------------------------------*/
55

66
import { CharCode } from 'vs/base/common/charCode';
7+
import { Constants } from 'vs/base/common/uint';
78

89
export function isFalsyOrWhitespace(str: string | undefined): boolean {
910
if (!str || typeof str !== 'string') {
@@ -487,6 +488,190 @@ export function isLowSurrogate(charCode: number): boolean {
487488
return (0xDC00 <= charCode && charCode <= 0xDFFF);
488489
}
489490

491+
/**
 * Get the code point that begins at offset `offset`.
 *
 * If the UTF-16 code unit at `offset` is a high surrogate and the next code unit (still
 * before `len`) is a low surrogate, both are combined into one supplementary-plane code
 * point. In every other case the code unit at `offset` is returned as-is, which includes
 * unpaired surrogates.
 *
 * @param str The string to read from.
 * @param len Exclusive upper bound for look-ahead; the callers visible in this file pass `str.length`.
 * @param offset UTF-16 offset at which the code point starts.
 * @returns The code point, or `NaN` when `offset` is outside of `str` (the `charCodeAt` result).
 */
export function getNextCodePoint(str: string, len: number, offset: number): number {
	const charCode = str.charCodeAt(offset);
	if (isHighSurrogate(charCode) && offset + 1 < len) {
		const nextCharCode = str.charCodeAt(offset + 1);
		if (isLowSurrogate(nextCharCode)) {
			// decode the surrogate pair
			return ((charCode - 0xD800) << 10) + (nextCharCode - 0xDC00) + 0x10000;
		}
	}
	return charCode;
}
504+
505+
/**
 * Get the code point that ends right before offset `offset`.
 *
 * If the UTF-16 code unit at `offset - 1` is a low surrogate and the code unit at
 * `offset - 2` is a high surrogate, both are combined into one supplementary-plane code
 * point. In every other case the code unit at `offset - 1` is returned as-is, which
 * includes unpaired surrogates.
 *
 * @param str The string to read from.
 * @param offset UTF-16 offset right after the code point of interest.
 * @returns The code point, or `NaN` when `offset - 1` is outside of `str` (the `charCodeAt` result).
 */
function getPrevCodePoint(str: string, offset: number): number {
	const charCode = str.charCodeAt(offset - 1);
	if (isLowSurrogate(charCode) && offset > 1) {
		const prevCharCode = str.charCodeAt(offset - 2);
		if (isHighSurrogate(prevCharCode)) {
			// decode the surrogate pair
			return ((prevCharCode - 0xD800) << 10) + (charCode - 0xDC00) + 0x10000;
		}
	}
	return charCode;
}
518+
519+
/**
 * Length, in UTF-16 code units, of the character that starts at `offset`.
 *
 * A character here is one code point (1 or 2 code units) followed by every code point
 * for which `isUnicodeMark` returns true.
 *
 * Note: when `offset` is at or past the end of `str` the result is 1, because
 * `charCodeAt` yields `NaN`, which is counted as a single code unit.
 *
 * @param str The string to inspect.
 * @param offset UTF-16 offset where the character begins.
 * @returns The number of UTF-16 code units covered by the character.
 */
export function nextCharLength(str: string, offset: number): number {
	const initialOffset = offset;
	const len = str.length;

	// consume the base code point
	let codePoint = getNextCodePoint(str, len, offset);
	offset += (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);

	// consume all the marks that follow it
	while (offset < len) {
		codePoint = getNextCodePoint(str, len, offset);
		if (!isUnicodeMark(codePoint)) {
			break;
		}
		offset += (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}

	return (offset - initialOffset);
}
536+
537+
/**
 * Length, in UTF-16 code units, of the character that ends right before `offset`.
 *
 * Walks backwards one code point at a time and keeps going for as long as the code point
 * just consumed is a Unicode mark and the beginning of the string has not been reached.
 *
 * Note: when `offset` is 0 the result is 1, because `charCodeAt(-1)` yields `NaN`, which
 * is counted as a single code unit.
 *
 * @param str The string to inspect.
 * @param offset UTF-16 offset right after the character of interest.
 * @returns The number of UTF-16 code units covered by the character.
 */
export function prevCharLength(str: string, offset: number): number {
	const initialOffset = offset;

	let codePoint = getPrevCodePoint(str, offset);
	offset -= (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);

	// a mark attaches to what precedes it, so keep consuming until a non-mark is consumed
	while (offset > 0 && isUnicodeMark(codePoint)) {
		codePoint = getPrevCodePoint(str, offset);
		offset -= (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}

	return (initialOffset - offset);
}
550+
551+
/**
 * Compute the `[startOffset, endOffset]` pair of the character around `offset`.
 *
 * The code point beginning at `offset` is taken as the starting point. The range is
 * extended to the right over all following Unicode marks and, if the starting code point
 * is itself a mark, to the left until a non-mark code point has been included or the
 * beginning of the string is reached.
 *
 * The exported wrapper `getCharContainingOffset` moves `offset` back by one when it
 * points at a low surrogate before delegating here.
 *
 * @param str The string to inspect.
 * @param offset UTF-16 offset at which a code point begins.
 * @returns A tuple of the start offset and the end offset of the character.
 */
function _getCharContainingOffset(str: string, offset: number): [number, number] {
	const len = str.length;
	const initialOffset = offset;
	const initialCodePoint = getNextCodePoint(str, len, offset);
	offset += (initialCodePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);

	// extend to the right
	while (offset < len) {
		const nextCodePoint = getNextCodePoint(str, len, offset);
		if (!isUnicodeMark(nextCodePoint)) {
			break;
		}
		offset += (nextCodePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}
	const endOffset = offset;

	// extend to the left
	offset = initialOffset;
	let codePoint = initialCodePoint;

	while (offset > 0 && isUnicodeMark(codePoint)) {
		codePoint = getPrevCodePoint(str, offset);
		offset -= (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}

	return [offset, endOffset];
}
578+
579+
/**
 * Compute the `[startOffset, endOffset]` pair of the character that contains `offset`.
 *
 * When `offset` falls in the middle of a surrogate pair (a low surrogate preceded by a
 * high surrogate), the search starts from the high surrogate so the pair is never split.
 * An unpaired low surrogate is treated as a code point of its own; stepping back in that
 * case would return the range of the previous character, which does not contain `offset`.
 *
 * @param str The string to inspect.
 * @param offset UTF-16 offset somewhere inside the character of interest.
 * @returns A tuple of the start offset and the end offset of the character.
 */
export function getCharContainingOffset(str: string, offset: number): [number, number] {
	if (offset > 0 && isLowSurrogate(str.charCodeAt(offset)) && isHighSurrogate(str.charCodeAt(offset - 1))) {
		// `offset` points at the second half of a surrogate pair
		return _getCharContainingOffset(str, offset - 1);
	}
	return _getCharContainingOffset(str, offset);
}
585+
586+
/**
 * Check whether `codePoint` is listed in the mark table held by `MarkClassifier`.
 *
 * The classifier is created lazily on first use and shared by all subsequent calls.
 *
 * @param codePoint The code point to test.
 * @returns `true` if the code point lies inside one of the classifier's ranges.
 */
export function isUnicodeMark(codePoint: number): boolean {
	return MarkClassifier.getInstance().isUnicodeMark(codePoint);
}
589+
590+
/**
 * Lazily created singleton that answers "is this code point a Unicode mark?" by looking
 * up one bit per code point in a bitset built from a table of inclusive ranges.
 */
class MarkClassifier {

	private static _INSTANCE: MarkClassifier | null = null;

	/**
	 * Return the shared instance, building it on first access.
	 */
	public static getInstance(): MarkClassifier {
		if (!MarkClassifier._INSTANCE) {
			MarkClassifier._INSTANCE = new MarkClassifier();
		}
		return MarkClassifier._INSTANCE;
	}

	/** Bitset: bit `cp & 7` of byte `cp >>> 3` is set when `cp` is inside one of the ranges. */
	private readonly arr: Uint8Array;

	constructor() {
		// generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-mark-test.js
		// Flat list of inclusive [from, to] pairs, sorted ascending.
		const ranges = [
			0x0300, 0x036F, 0x0483, 0x0489, 0x0591, 0x05BD, 0x05BF, 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x05C5,
			0x05C7, 0x05C7, 0x0610, 0x061A, 0x064B, 0x065F, 0x0670, 0x0670, 0x06D6, 0x06DC, 0x06DF, 0x06E4,
			0x06E7, 0x06E8, 0x06EA, 0x06ED, 0x0711, 0x0711, 0x0730, 0x074A, 0x07A6, 0x07B0, 0x07EB, 0x07F3,
			0x07FD, 0x07FD, 0x0816, 0x0819, 0x081B, 0x0823, 0x0825, 0x0827, 0x0829, 0x082D, 0x0859, 0x085B,
			0x08D3, 0x08E1, 0x08E3, 0x0903, 0x093A, 0x093C, 0x093E, 0x094F, 0x0951, 0x0957, 0x0962, 0x0963,
			0x0981, 0x0983, 0x09BC, 0x09BC, 0x09BE, 0x09CD, 0x09D7, 0x09D7, 0x09E2, 0x09E3, 0x09FE, 0x0A03,
			0x0A3C, 0x0A51, 0x0A70, 0x0A71, 0x0A75, 0x0A75, 0x0A81, 0x0A83, 0x0ABC, 0x0ABC, 0x0ABE, 0x0ACD,
			0x0AE2, 0x0AE3, 0x0AFA, 0x0B03, 0x0B3C, 0x0B3C, 0x0B3E, 0x0B57, 0x0B62, 0x0B63, 0x0B82, 0x0B82,
			0x0BBE, 0x0BCD, 0x0BD7, 0x0BD7, 0x0C00, 0x0C04, 0x0C3E, 0x0C56, 0x0C62, 0x0C63, 0x0C81, 0x0C83,
			0x0CBC, 0x0CBC, 0x0CBE, 0x0CD6, 0x0CE2, 0x0CE3, 0x0D00, 0x0D03, 0x0D3B, 0x0D3C, 0x0D3E, 0x0D4D,
			0x0D57, 0x0D57, 0x0D62, 0x0D63, 0x0D81, 0x0D83, 0x0DCA, 0x0DDF, 0x0DF2, 0x0DF3, 0x0E31, 0x0E31,
			0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB1, 0x0EB1, 0x0EB4, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19,
			0x0F35, 0x0F35, 0x0F37, 0x0F37, 0x0F39, 0x0F39, 0x0F3E, 0x0F3F, 0x0F71, 0x0F84, 0x0F86, 0x0F87,
			0x0F8D, 0x0FBC, 0x0FC6, 0x0FC6, 0x102B, 0x103E, 0x1056, 0x1059, 0x105E, 0x1060, 0x1062, 0x1064,
			0x1067, 0x106D, 0x1071, 0x1074, 0x1082, 0x108D, 0x108F, 0x108F, 0x109A, 0x109D, 0x135D, 0x135F,
			0x1712, 0x1714, 0x1732, 0x1734, 0x1752, 0x1753, 0x1772, 0x1773, 0x17B4, 0x17D3, 0x17DD, 0x17DD,
			0x180B, 0x180D, 0x1885, 0x1886, 0x18A9, 0x18A9, 0x1920, 0x193B, 0x1A17, 0x1A1B, 0x1A55, 0x1A7F,
			0x1AB0, 0x1B04, 0x1B34, 0x1B44, 0x1B6B, 0x1B73, 0x1B80, 0x1B82, 0x1BA1, 0x1BAD, 0x1BE6, 0x1BF3,
			0x1C24, 0x1C37, 0x1CD0, 0x1CD2, 0x1CD4, 0x1CE8, 0x1CED, 0x1CED, 0x1CF4, 0x1CF4, 0x1CF7, 0x1CF9,
			0x1DC0, 0x1DFF, 0x20D0, 0x20F0, 0x2CEF, 0x2CF1, 0x2D7F, 0x2D7F, 0x2DE0, 0x2DFF, 0x302A, 0x302F,
			0x3099, 0x309A, 0xA66F, 0xA672, 0xA674, 0xA67D, 0xA69E, 0xA69F, 0xA6F0, 0xA6F1, 0xA802, 0xA802,
			0xA806, 0xA806, 0xA80B, 0xA80B, 0xA823, 0xA827, 0xA82C, 0xA82C, 0xA880, 0xA881, 0xA8B4, 0xA8C5,
			0xA8E0, 0xA8F1, 0xA8FF, 0xA8FF, 0xA926, 0xA92D, 0xA947, 0xA953, 0xA980, 0xA983, 0xA9B3, 0xA9C0,
			0xA9E5, 0xA9E5, 0xAA29, 0xAA36, 0xAA43, 0xAA43, 0xAA4C, 0xAA4D, 0xAA7B, 0xAA7D, 0xAAB0, 0xAAB0,
			0xAAB2, 0xAAB4, 0xAAB7, 0xAAB8, 0xAABE, 0xAABF, 0xAAC1, 0xAAC1, 0xAAEB, 0xAAEF, 0xAAF5, 0xAAF6,
			0xABE3, 0xABEA, 0xABEC, 0xABED, 0xFB1E, 0xFB1E, 0xFE00, 0xFE0F, 0xFE20, 0xFE2F, 0x101FD, 0x101FD,
			0x102E0, 0x102E0, 0x10376, 0x1037A, 0x10A01, 0x10A0F, 0x10A38, 0x10A3F, 0x10AE5, 0x10AE6, 0x10D24, 0x10D27,
			0x10EAB, 0x10EAC, 0x10F46, 0x10F50, 0x11000, 0x11002, 0x11038, 0x11046, 0x1107F, 0x11082, 0x110B0, 0x110BA,
			0x11100, 0x11102, 0x11127, 0x11134, 0x11145, 0x11146, 0x11173, 0x11173, 0x11180, 0x11182, 0x111B3, 0x111C0,
			0x111C9, 0x111CC, 0x111CE, 0x111CF, 0x1122C, 0x11237, 0x1123E, 0x1123E, 0x112DF, 0x112EA, 0x11300, 0x11303,
			0x1133B, 0x1133C, 0x1133E, 0x1134D, 0x11357, 0x11357, 0x11362, 0x11374, 0x11435, 0x11446, 0x1145E, 0x1145E,
			0x114B0, 0x114C3, 0x115AF, 0x115C0, 0x115DC, 0x115DD, 0x11630, 0x11640, 0x116AB, 0x116B7, 0x1171D, 0x1172B,
			0x1182C, 0x1183A, 0x11930, 0x1193E, 0x11940, 0x11940, 0x11942, 0x11943, 0x119D1, 0x119E0, 0x119E4, 0x119E4,
			0x11A01, 0x11A0A, 0x11A33, 0x11A39, 0x11A3B, 0x11A3E, 0x11A47, 0x11A47, 0x11A51, 0x11A5B, 0x11A8A, 0x11A99,
			0x11C2F, 0x11C3F, 0x11C92, 0x11CB6, 0x11D31, 0x11D45, 0x11D47, 0x11D47, 0x11D8A, 0x11D97, 0x11EF3, 0x11EF6,
			0x16AF0, 0x16AF4, 0x16B30, 0x16B36, 0x16F4F, 0x16F4F, 0x16F51, 0x16F92, 0x16FE4, 0x16FF1, 0x1BC9D, 0x1BC9E,
			0x1D165, 0x1D169, 0x1D16D, 0x1D172, 0x1D17B, 0x1D182, 0x1D185, 0x1D18B, 0x1D1AA, 0x1D1AD, 0x1D242, 0x1D244,
			0x1DA00, 0x1DA36, 0x1DA3B, 0x1DA6C, 0x1DA75, 0x1DA75, 0x1DA84, 0x1DA84, 0x1DA9B, 0x1E02A, 0x1E130, 0x1E136,
			0x1E2EC, 0x1E2EF, 0x1E8D0, 0x1E8D6, 0x1E944, 0x1E94A, 0xE0100, 0xE01EF
		];

		// The last entry is the largest code point because the table is sorted.
		const maxCodePoint = ranges[ranges.length - 1];
		// One bit per code point in [0, maxCodePoint]. `(max >>> 3) + 1` bytes are needed to
		// address bit `max` itself; `Math.ceil(max / 8)` would be one byte short whenever
		// `max` is a multiple of 8 and the write would be silently dropped.
		const arr = new Uint8Array((maxCodePoint >>> 3) + 1);

		for (let i = 0; i + 1 < ranges.length; i += 2) {
			const from = ranges[i];
			const to = ranges[i + 1];

			for (let codePoint = from; codePoint <= to; codePoint++) {
				arr[codePoint >>> 3] |= (1 << (codePoint & 7));
			}
		}

		this.arr = arr;
	}

	/**
	 * Check whether `codePoint` lies inside one of the mark ranges.
	 *
	 * @param codePoint The code point to test.
	 * @returns `true` when the corresponding bit is set; `false` for anything beyond the table.
	 */
	public isUnicodeMark(codePoint: number): boolean {
		const byteIndex = codePoint >>> 3;
		if (byteIndex >= this.arr.length) {
			return false;
		}
		return (this.arr[byteIndex] & (1 << (codePoint & 7))) !== 0;
	}
}
674+
490675
/**
491676
* Generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-rtl-test.js
492677
*/
@@ -572,6 +757,18 @@ export function isFullWidthCharacter(charCode: number): boolean {
572757
);
573758
}
574759

760+
/**
 * A fast function (therefore imprecise) to check if code points are emojis.
 * Generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-emoji-test.js
 *
 * The check is a plain membership test against a handful of inclusive code point ranges.
 *
 * @param x The code point to test.
 * @returns `true` if `x` falls inside one of the ranges below.
 */
export function isEmojiImprecise(x: number): boolean {
	return (
		(x >= 0x1F1E6 && x <= 0x1F1FF) // regional indicator symbols
		|| (x >= 0x2600 && x <= 0x27BF) // miscellaneous symbols, dingbats
		|| (x >= 0x1F300 && x <= 0x1F64F)
		|| (x >= 0x1F680 && x <= 0x1F6FC)
		|| (x >= 0x1F7E0 && x <= 0x1F7EB)
		|| (x >= 0x1F900 && x <= 0x1F9FF)
		|| (x >= 0x1FA70 && x <= 0x1FA73)
		|| (x >= 0x1FA78 && x <= 0x1FA82)
		|| (x >= 0x1FA90 && x <= 0x1FA95)
	);
}
771+
575772
/**
576773
* Given a string and a max length returns a shorted version. Shorting
577774
* happens at favorable positions - such as whitespace or punctuation characters.

src/vs/base/common/uint.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ export const enum Constants {
3535
*/
3636
MAX_UINT_32 = 4294967295, // 2^32 - 1
3737

38-
38+
UNICODE_SUPPLEMENTARY_PLANE_BEGIN = 0x010000
3939
}
4040

4141
export function toUint8(v: number): number {

src/vs/editor/browser/viewParts/viewCursors/viewCursor.ts

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,34 +118,39 @@ export class ViewCursor {
118118

119119
private _prepareRender(ctx: RenderingContext): ViewCursorRenderData | null {
120120
let textContent = '';
121-
let textContentClassName = '';
122121

123122
if (this._cursorStyle === TextEditorCursorStyle.Line || this._cursorStyle === TextEditorCursorStyle.LineThin) {
124123
const visibleRange = ctx.visibleRangeForPosition(this._position);
125124
if (!visibleRange) {
126125
// Outside viewport
127126
return null;
128127
}
128+
129129
let width: number;
130130
if (this._cursorStyle === TextEditorCursorStyle.Line) {
131131
width = dom.computeScreenAwareSize(this._lineCursorWidth > 0 ? this._lineCursorWidth : 2);
132132
if (width > 2) {
133133
const lineContent = this._context.model.getLineContent(this._position.lineNumber);
134-
textContent = lineContent.charAt(this._position.column - 1);
134+
const nextCharLength = strings.nextCharLength(lineContent, this._position.column - 1);
135+
textContent = lineContent.substr(this._position.column - 1, nextCharLength);
135136
}
136137
} else {
137138
width = dom.computeScreenAwareSize(1);
138139
}
140+
139141
let left = visibleRange.left;
140142
if (width >= 2 && left >= 1) {
141143
// try to center cursor
142144
left -= 1;
143145
}
146+
144147
const top = ctx.getVerticalOffsetForLineNumber(this._position.lineNumber) - ctx.bigNumbersDelta;
145-
return new ViewCursorRenderData(top, left, width, this._lineHeight, textContent, textContentClassName);
148+
return new ViewCursorRenderData(top, left, width, this._lineHeight, textContent, '');
146149
}
147150

148-
const visibleRangeForCharacter = ctx.linesVisibleRangesForRange(new Range(this._position.lineNumber, this._position.column, this._position.lineNumber, this._position.column + 1), false);
151+
const lineContent = this._context.model.getLineContent(this._position.lineNumber);
152+
const nextCharLength = strings.nextCharLength(lineContent, this._position.column - 1);
153+
const visibleRangeForCharacter = ctx.linesVisibleRangesForRange(new Range(this._position.lineNumber, this._position.column, this._position.lineNumber, this._position.column + nextCharLength), false);
149154

150155
if (!visibleRangeForCharacter || visibleRangeForCharacter.length === 0 || visibleRangeForCharacter[0].ranges.length === 0) {
151156
// Outside viewport
@@ -155,12 +160,10 @@ export class ViewCursor {
155160
const range = visibleRangeForCharacter[0].ranges[0];
156161
const width = range.width < 1 ? this._typicalHalfwidthCharacterWidth : range.width;
157162

163+
let textContentClassName = '';
158164
if (this._cursorStyle === TextEditorCursorStyle.Block) {
159165
const lineData = this._context.model.getViewLineData(this._position.lineNumber);
160-
textContent = lineData.content.charAt(this._position.column - 1);
161-
if (strings.isHighSurrogate(lineData.content.charCodeAt(this._position.column - 1))) {
162-
textContent += lineData.content.charAt(this._position.column);
163-
}
166+
textContent = lineContent.substr(this._position.column - 1, nextCharLength);
164167
const tokenIndex = lineData.tokens.findTokenIndexAtOffset(this._position.column - 1);
165168
textContentClassName = lineData.tokens.getClassName(tokenIndex);
166169
}

src/vs/editor/common/controller/cursor.ts

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -765,19 +765,17 @@ export class Cursor extends viewEvents.ViewEventEmitter implements ICursors {
765765
if (!this._isDoingComposition && source === 'keyboard') {
766766
// If this event is coming straight from the keyboard, look for electric characters and enter
767767

768-
for (let i = 0, len = text.length; i < len; i++) {
769-
let charCode = text.charCodeAt(i);
770-
let chr: string;
771-
if (strings.isHighSurrogate(charCode) && i + 1 < len) {
772-
chr = text.charAt(i) + text.charAt(i + 1);
773-
i++;
774-
} else {
775-
chr = text.charAt(i);
776-
}
768+
const len = text.length;
769+
let offset = 0;
770+
while (offset < len) {
771+
const charLength = strings.nextCharLength(text, offset);
772+
const chr = text.substr(offset, charLength);
777773

778774
// Here we must interpret each typed character individually
779775
const autoClosedCharacters = AutoClosedAction.getAllAutoClosedCharacters(this._autoClosedActions);
780776
this._executeEditOperation(TypeOperations.typeWithInterceptors(this._prevEditOperationType, this.context.config, this.context.model, this.getSelections(), autoClosedCharacters, chr));
777+
778+
offset += charLength;
781779
}
782780

783781
} else {

0 commit comments

Comments
 (0)