|
4 | 4 | *--------------------------------------------------------------------------------------------*/ |
5 | 5 |
|
6 | 6 | import { CharCode } from 'vs/base/common/charCode'; |
| 7 | +import { Constants } from 'vs/base/common/uint'; |
7 | 8 |
|
8 | 9 | export function isFalsyOrWhitespace(str: string | undefined): boolean { |
9 | 10 | if (!str || typeof str !== 'string') { |
@@ -487,6 +488,190 @@ export function isLowSurrogate(charCode: number): boolean { |
487 | 488 | return (0xDC00 <= charCode && charCode <= 0xDFFF); |
488 | 489 | } |
489 | 490 |
|
/**
 * Decodes the code point whose first UTF-16 code unit is at `offset`.
 *
 * If that unit is a high surrogate and the unit right after it (still inside
 * `len`) is a low surrogate, the two are combined into one code point.
 * Otherwise the code unit at `offset` is returned as-is.
 *
 * @param str The string to read from.
 * @param len The length bound used when looking for the second half of a pair.
 * @param offset Index of the code unit where the code point begins.
 * @returns The decoded code point, or the lone code unit when no pair is formed.
 */
export function getNextCodePoint(str: string, len: number, offset: number): number {
	const lead = str.charCodeAt(offset);
	const hasRoomForTrail = offset + 1 < len;

	// Not the start of a pair (or nothing follows): the unit stands on its own.
	if (!isHighSurrogate(lead) || !hasRoomForTrail) {
		return lead;
	}

	const trail = str.charCodeAt(offset + 1);
	return isLowSurrogate(trail)
		? ((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x10000
		: lead;
}
| 504 | + |
/**
 * Decodes the code point whose last UTF-16 code unit is at `offset - 1`.
 *
 * If that unit is a low surrogate and the unit before it is a high surrogate,
 * the two are combined into one code point. Otherwise the code unit at
 * `offset - 1` is returned as-is.
 *
 * @param str The string to read from.
 * @param offset Index just past the code point to decode.
 * @returns The decoded code point, or the lone code unit when no pair is formed.
 */
function getPrevCodePoint(str: string, offset: number): number {
	const trail = str.charCodeAt(offset - 1);
	const hasRoomForLead = offset > 1;

	// Not the end of a pair (or nothing precedes): the unit stands on its own.
	if (!isLowSurrogate(trail) || !hasRoomForLead) {
		return trail;
	}

	const lead = str.charCodeAt(offset - 2);
	return isHighSurrogate(lead)
		? ((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x10000
		: trail;
}
| 518 | + |
/**
 * Counts the UTF-16 code units taken by the character that starts at `offset`:
 * one code point followed by every code point after it for which
 * `isUnicodeMark` returns true.
 *
 * A code point at or above `Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN` is
 * counted as two code units, any other as one.
 *
 * @param str The string to scan.
 * @param offset Index of the first code unit of the character.
 * @returns The distance, in code units, from `offset` to the end of the character.
 */
export function nextCharLength(str: string, offset: number): number {
	const total = str.length;

	let end = offset;
	let codePoint = getNextCodePoint(str, total, end);
	for (; ;) {
		// Consume the code point that was just accepted.
		end += (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
		if (!(end < total)) {
			break;
		}
		// Peek at the following code point; only marks extend the character.
		codePoint = getNextCodePoint(str, total, end);
		if (!isUnicodeMark(codePoint)) {
			break;
		}
	}

	return end - offset;
}
| 536 | + |
/**
 * Counts the UTF-16 code units taken by the character that ends right before
 * `offset`. It walks backwards one code point at a time and keeps going while
 * the code point just consumed is a mark (per `isUnicodeMark`) and the start
 * of the string has not been reached.
 *
 * A code point at or above `Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN` is
 * counted as two code units, any other as one.
 *
 * @param str The string to scan.
 * @param offset Index just past the last code unit of the character.
 * @returns The distance, in code units, from the start of the character to `offset`.
 */
export function prevCharLength(str: string, offset: number): number {
	let start = offset;
	let codePoint: number;

	do {
		codePoint = getPrevCodePoint(str, start);
		start -= (codePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
		// A mark belongs to whatever precedes it, so keep absorbing code points.
	} while (start > 0 && isUnicodeMark(codePoint));

	return offset - start;
}
| 550 | + |
/**
 * Computes the `[start, end]` code-unit offsets of the character around the
 * code point that begins at `offset`; `end` is the offset just past the
 * character.
 *
 * The end is found by skipping the code point at `offset` and every following
 * code point that `isUnicodeMark` accepts. The start is found by walking
 * backwards for as long as the code point last examined is a mark.
 *
 * @param str The string to scan.
 * @param offset Index of the first code unit of a code point inside the character.
 * @returns A tuple of the start offset and the end offset.
 */
function _getCharContainingOffset(str: string, offset: number): [number, number] {
	const total = str.length;
	const seedCodePoint = getNextCodePoint(str, total, offset);

	// Right edge: step over the seed, then over any trailing marks.
	let end = offset + (seedCodePoint >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	while (end < total) {
		const following = getNextCodePoint(str, total, end);
		if (!isUnicodeMark(following)) {
			break;
		}
		end += (following >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}

	// Left edge: if the seed itself is a mark, absorb preceding code points
	// until one that is not a mark has been consumed (or the string starts).
	let start = offset;
	for (let current = seedCodePoint; start > 0 && isUnicodeMark(current);) {
		current = getPrevCodePoint(str, start);
		start -= (current >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
	}

	return [start, end];
}
| 578 | + |
/**
 * Returns the `[start, end]` code-unit offsets of the character that contains
 * `offset`, where `end` is the offset just past the character.
 *
 * If `offset` points at the second half of a surrogate pair, the search is
 * started from the first half so the pair is treated as one code point.
 *
 * @param str The string to scan.
 * @param offset Index of any code unit inside the character of interest.
 * @returns A tuple of the start offset and the end offset.
 */
export function getCharContainingOffset(str: string, offset: number): [number, number] {
	// Step back only for a well-formed pair. For an unpaired low surrogate,
	// stepping back would start the scan on the previous, unrelated code unit
	// and yield a range that ends at `offset` instead of containing it.
	const isSecondHalfOfPair = (
		offset > 0
		&& isLowSurrogate(str.charCodeAt(offset))
		&& isHighSurrogate(str.charCodeAt(offset - 1))
	);
	if (isSecondHalfOfPair) {
		return _getCharContainingOffset(str, offset - 1);
	}
	return _getCharContainingOffset(str, offset);
}
| 585 | + |
/**
 * Tells whether `codePoint` is classified as a mark by the shared
 * `MarkClassifier` instance.
 *
 * @param codePoint The code point to test.
 * @returns `true` when the classifier reports the code point as a mark.
 */
export function isUnicodeMark(codePoint: number): boolean {
	const classifier = MarkClassifier.getInstance();
	return classifier.isUnicodeMark(codePoint);
}
| 589 | + |
/**
 * Bit set over code points that answers whether a code point is a mark.
 *
 * The set is built once, on first use, from a table of inclusive
 * `[from, to]` ranges and is shared through `getInstance()`.
 */
class MarkClassifier {

	private static _INSTANCE: MarkClassifier | null = null;

	/**
	 * Returns the shared classifier, creating it on the first call.
	 */
	public static getInstance(): MarkClassifier {
		if (!MarkClassifier._INSTANCE) {
			MarkClassifier._INSTANCE = new MarkClassifier();
		}
		return MarkClassifier._INSTANCE;
	}

	/** One bit per code point: bit `cp & 7` of byte `cp >>> 3`. */
	private readonly arr: Uint8Array;

	constructor() {
		// generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-mark-test.js
		// Flat list of inclusive pairs: ranges[2k] is the first and ranges[2k + 1] the last code point.
		const ranges = [
			0x0300, 0x036F, 0x0483, 0x0489, 0x0591, 0x05BD, 0x05BF, 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x05C5,
			0x05C7, 0x05C7, 0x0610, 0x061A, 0x064B, 0x065F, 0x0670, 0x0670, 0x06D6, 0x06DC, 0x06DF, 0x06E4,
			0x06E7, 0x06E8, 0x06EA, 0x06ED, 0x0711, 0x0711, 0x0730, 0x074A, 0x07A6, 0x07B0, 0x07EB, 0x07F3,
			0x07FD, 0x07FD, 0x0816, 0x0819, 0x081B, 0x0823, 0x0825, 0x0827, 0x0829, 0x082D, 0x0859, 0x085B,
			0x08D3, 0x08E1, 0x08E3, 0x0903, 0x093A, 0x093C, 0x093E, 0x094F, 0x0951, 0x0957, 0x0962, 0x0963,
			0x0981, 0x0983, 0x09BC, 0x09BC, 0x09BE, 0x09CD, 0x09D7, 0x09D7, 0x09E2, 0x09E3, 0x09FE, 0x0A03,
			0x0A3C, 0x0A51, 0x0A70, 0x0A71, 0x0A75, 0x0A75, 0x0A81, 0x0A83, 0x0ABC, 0x0ABC, 0x0ABE, 0x0ACD,
			0x0AE2, 0x0AE3, 0x0AFA, 0x0B03, 0x0B3C, 0x0B3C, 0x0B3E, 0x0B57, 0x0B62, 0x0B63, 0x0B82, 0x0B82,
			0x0BBE, 0x0BCD, 0x0BD7, 0x0BD7, 0x0C00, 0x0C04, 0x0C3E, 0x0C56, 0x0C62, 0x0C63, 0x0C81, 0x0C83,
			0x0CBC, 0x0CBC, 0x0CBE, 0x0CD6, 0x0CE2, 0x0CE3, 0x0D00, 0x0D03, 0x0D3B, 0x0D3C, 0x0D3E, 0x0D4D,
			0x0D57, 0x0D57, 0x0D62, 0x0D63, 0x0D81, 0x0D83, 0x0DCA, 0x0DDF, 0x0DF2, 0x0DF3, 0x0E31, 0x0E31,
			0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB1, 0x0EB1, 0x0EB4, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19,
			0x0F35, 0x0F35, 0x0F37, 0x0F37, 0x0F39, 0x0F39, 0x0F3E, 0x0F3F, 0x0F71, 0x0F84, 0x0F86, 0x0F87,
			0x0F8D, 0x0FBC, 0x0FC6, 0x0FC6, 0x102B, 0x103E, 0x1056, 0x1059, 0x105E, 0x1060, 0x1062, 0x1064,
			0x1067, 0x106D, 0x1071, 0x1074, 0x1082, 0x108D, 0x108F, 0x108F, 0x109A, 0x109D, 0x135D, 0x135F,
			0x1712, 0x1714, 0x1732, 0x1734, 0x1752, 0x1753, 0x1772, 0x1773, 0x17B4, 0x17D3, 0x17DD, 0x17DD,
			0x180B, 0x180D, 0x1885, 0x1886, 0x18A9, 0x18A9, 0x1920, 0x193B, 0x1A17, 0x1A1B, 0x1A55, 0x1A7F,
			0x1AB0, 0x1B04, 0x1B34, 0x1B44, 0x1B6B, 0x1B73, 0x1B80, 0x1B82, 0x1BA1, 0x1BAD, 0x1BE6, 0x1BF3,
			0x1C24, 0x1C37, 0x1CD0, 0x1CD2, 0x1CD4, 0x1CE8, 0x1CED, 0x1CED, 0x1CF4, 0x1CF4, 0x1CF7, 0x1CF9,
			0x1DC0, 0x1DFF, 0x20D0, 0x20F0, 0x2CEF, 0x2CF1, 0x2D7F, 0x2D7F, 0x2DE0, 0x2DFF, 0x302A, 0x302F,
			0x3099, 0x309A, 0xA66F, 0xA672, 0xA674, 0xA67D, 0xA69E, 0xA69F, 0xA6F0, 0xA6F1, 0xA802, 0xA802,
			0xA806, 0xA806, 0xA80B, 0xA80B, 0xA823, 0xA827, 0xA82C, 0xA82C, 0xA880, 0xA881, 0xA8B4, 0xA8C5,
			0xA8E0, 0xA8F1, 0xA8FF, 0xA8FF, 0xA926, 0xA92D, 0xA947, 0xA953, 0xA980, 0xA983, 0xA9B3, 0xA9C0,
			0xA9E5, 0xA9E5, 0xAA29, 0xAA36, 0xAA43, 0xAA43, 0xAA4C, 0xAA4D, 0xAA7B, 0xAA7D, 0xAAB0, 0xAAB0,
			0xAAB2, 0xAAB4, 0xAAB7, 0xAAB8, 0xAABE, 0xAABF, 0xAAC1, 0xAAC1, 0xAAEB, 0xAAEF, 0xAAF5, 0xAAF6,
			0xABE3, 0xABEA, 0xABEC, 0xABED, 0xFB1E, 0xFB1E, 0xFE00, 0xFE0F, 0xFE20, 0xFE2F, 0x101FD, 0x101FD,
			0x102E0, 0x102E0, 0x10376, 0x1037A, 0x10A01, 0x10A0F, 0x10A38, 0x10A3F, 0x10AE5, 0x10AE6, 0x10D24, 0x10D27,
			0x10EAB, 0x10EAC, 0x10F46, 0x10F50, 0x11000, 0x11002, 0x11038, 0x11046, 0x1107F, 0x11082, 0x110B0, 0x110BA,
			0x11100, 0x11102, 0x11127, 0x11134, 0x11145, 0x11146, 0x11173, 0x11173, 0x11180, 0x11182, 0x111B3, 0x111C0,
			0x111C9, 0x111CC, 0x111CE, 0x111CF, 0x1122C, 0x11237, 0x1123E, 0x1123E, 0x112DF, 0x112EA, 0x11300, 0x11303,
			0x1133B, 0x1133C, 0x1133E, 0x1134D, 0x11357, 0x11357, 0x11362, 0x11374, 0x11435, 0x11446, 0x1145E, 0x1145E,
			0x114B0, 0x114C3, 0x115AF, 0x115C0, 0x115DC, 0x115DD, 0x11630, 0x11640, 0x116AB, 0x116B7, 0x1171D, 0x1172B,
			0x1182C, 0x1183A, 0x11930, 0x1193E, 0x11940, 0x11940, 0x11942, 0x11943, 0x119D1, 0x119E0, 0x119E4, 0x119E4,
			0x11A01, 0x11A0A, 0x11A33, 0x11A39, 0x11A3B, 0x11A3E, 0x11A47, 0x11A47, 0x11A51, 0x11A5B, 0x11A8A, 0x11A99,
			0x11C2F, 0x11C3F, 0x11C92, 0x11CB6, 0x11D31, 0x11D45, 0x11D47, 0x11D47, 0x11D8A, 0x11D97, 0x11EF3, 0x11EF6,
			0x16AF0, 0x16AF4, 0x16B30, 0x16B36, 0x16F4F, 0x16F4F, 0x16F51, 0x16F92, 0x16FE4, 0x16FF1, 0x1BC9D, 0x1BC9E,
			0x1D165, 0x1D169, 0x1D16D, 0x1D172, 0x1D17B, 0x1D182, 0x1D185, 0x1D18B, 0x1D1AA, 0x1D1AD, 0x1D242, 0x1D244,
			0x1DA00, 0x1DA36, 0x1DA3B, 0x1DA6C, 0x1DA75, 0x1DA75, 0x1DA84, 0x1DA84, 0x1DA9B, 0x1E02A, 0x1E130, 0x1E136,
			0x1E2EC, 0x1E2EF, 0x1E8D0, 0x1E8D6, 0x1E944, 0x1E94A, 0xE0100, 0xE01EF
		];

		// The table is ascending, so its last entry is the largest code point stored.
		const maxCodePoint = ranges[ranges.length - 1];
		// Size by the highest byte index actually written. `Math.ceil(max / 8)`
		// comes up one byte short whenever `max` is a multiple of 8, and a typed
		// array silently drops the out-of-range write.
		const arr = new Uint8Array((maxCodePoint >>> 3) + 1);

		for (let i = 0; i + 1 < ranges.length; i += 2) {
			const from = ranges[i];
			const to = ranges[i + 1];

			for (let codePoint = from; codePoint <= to; codePoint++) {
				arr[codePoint >>> 3] |= (1 << (codePoint & 7));
			}
		}

		this.arr = arr;
	}

	/**
	 * Tells whether `codePoint` falls inside one of the table's ranges.
	 * Code points beyond the end of the bit set are reported as not a mark.
	 */
	public isUnicodeMark(codePoint: number): boolean {
		const byteIndex = codePoint >>> 3;
		if (byteIndex >= this.arr.length) {
			return false;
		}
		return (this.arr[byteIndex] & (1 << (codePoint & 7))) !== 0;
	}
}
| 674 | + |
490 | 675 | /** |
491 | 676 | * Generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-rtl-test.js |
492 | 677 | */ |
@@ -572,6 +757,18 @@ export function isFullWidthCharacter(charCode: number): boolean { |
572 | 757 | ); |
573 | 758 | } |
574 | 759 |
|
/**
 * Quick, deliberately approximate emoji test: reports whether `x` falls in
 * one of a handful of hard-coded code point ranges.
 * Generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-emoji-test.js
 *
 * @param x The code point to test.
 * @returns `true` when `x` lies inside any of the listed ranges.
 */
export function isEmojiImprecise(x: number): boolean {
	if (x >= 0x1F1E6 && x <= 0x1F1FF) {
		return true;
	}
	if (x >= 0x2600 && x <= 0x27BF) {
		return true;
	}
	if (x >= 0x1F300 && x <= 0x1F64F) {
		return true;
	}
	if (x >= 0x1F680 && x <= 0x1F6FC) {
		return true;
	}
	if (x >= 0x1F7E0 && x <= 0x1F7EB) {
		return true;
	}
	if (x >= 0x1F900 && x <= 0x1F9FF) {
		return true;
	}
	if (x >= 0x1FA70 && x <= 0x1FA73) {
		return true;
	}
	if (x >= 0x1FA78 && x <= 0x1FA82) {
		return true;
	}
	return (x >= 0x1FA90 && x <= 0x1FA95);
}
| 771 | + |
575 | 772 | /** |
576 | 773 | * Given a string and a max length returns a shorted version. Shorting |
577 | 774 | * happens at favorable positions - such as whitespace or punctuation characters. |
|
0 commit comments