@@ -672,6 +672,119 @@ class MarkClassifier {
672672 }
673673}
674674
/**
 * Hand-rolled conversion of `str` into its UTF-8 byte sequence.
 * Intended only for environments that lack a native conversion facility!
 *
 * The string is walked twice: once to count the bytes the output needs,
 * and once to write them into an exactly sized `Uint8Array`.
 */
export function encodeUTF8(str: string): Uint8Array {
	// See https://en.wikipedia.org/wiki/UTF-8
	const strLen = str.length;

	// Pass 1: count how many bytes the encoded form occupies.
	let byteCount = 0;
	for (let readPos = 0; readPos < strLen;) {
		const cp = getNextCodePoint(str, strLen, readPos);
		// Code points at or above the supplementary plane start consume two UTF-16 units.
		readPos += (cp >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);
		byteCount += (cp < 0x0080 ? 1 : (cp < 0x0800 ? 2 : (cp < 0x10000 ? 3 : 4)));
	}

	// Pass 2: emit the bytes.
	const bytes = new Uint8Array(byteCount);
	let writePos = 0;
	for (let readPos = 0; readPos < strLen;) {
		const cp = getNextCodePoint(str, strLen, readPos);
		readPos += (cp >= Constants.UNICODE_SUPPLEMENTARY_PLANE_BEGIN ? 2 : 1);

		if (cp < 0x0080) {
			// 0xxxxxxx
			bytes[writePos++] = cp;
			continue;
		}

		if (cp < 0x0800) {
			// 110xxxxx 10xxxxxx
			bytes[writePos++] = 0xC0 | ((cp >>> 6) & 0x1F);
			bytes[writePos++] = 0x80 | (cp & 0x3F);
			continue;
		}

		if (cp < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			bytes[writePos++] = 0xE0 | ((cp >>> 12) & 0x0F);
			bytes[writePos++] = 0x80 | ((cp >>> 6) & 0x3F);
			bytes[writePos++] = 0x80 | (cp & 0x3F);
			continue;
		}

		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		bytes[writePos++] = 0xF0 | ((cp >>> 18) & 0x07);
		bytes[writePos++] = 0x80 | ((cp >>> 12) & 0x3F);
		bytes[writePos++] = 0x80 | ((cp >>> 6) & 0x3F);
		bytes[writePos++] = 0x80 | (cp & 0x3F);
	}

	return bytes;
}
729+
/**
 * A manual decoding of a UTF8 string.
 * Use only in environments which do not offer native conversion methods!
 *
 * Never throws on malformed input. A byte that cannot start a sequence
 * (a stray continuation byte `10xxxxxx` or a lead byte >= 0xF8), a sequence
 * that is cut off by the end of the buffer, or a sequence whose trailing
 * bytes are not of the form `10xxxxxx` yields one U+FFFD for the offending
 * lead byte; decoding then resumes at the very next byte.
 *
 * A structurally valid sequence that decodes to a surrogate (U+D800-U+DFFF)
 * or to a value above U+10FFFF yields a single U+FFFD. Overlong forms are
 * not rejected; they decode to the code point their payload bits spell out.
 *
 * @param buffer The UTF-8 encoded bytes.
 * @returns The decoded string (UTF-16 code units).
 */
export function decodeUTF8(buffer: Uint8Array): string {
	// https://en.wikipedia.org/wiki/UTF-8

	const len = buffer.byteLength;
	const result: string[] = [];
	let offset = 0;
	while (offset < len) {
		const v0 = buffer[offset];

		// Classify the lead byte: how many continuation bytes must follow,
		// and which payload bits the lead byte itself contributes.
		let continuationCount: number;
		let codePoint: number;
		if (v0 < 0b10000000) {
			// 1 byte
			continuationCount = 0;
			codePoint = v0;
		} else if (v0 >= 0b11000000 && v0 < 0b11100000) {
			// 2 bytes
			continuationCount = 1;
			codePoint = v0 & 0b00011111;
		} else if (v0 >= 0b11100000 && v0 < 0b11110000) {
			// 3 bytes
			continuationCount = 2;
			codePoint = v0 & 0b00001111;
		} else if (v0 >= 0b11110000 && v0 < 0b11111000) {
			// 4 bytes
			continuationCount = 3;
			codePoint = v0 & 0b00000111;
		} else {
			// stray continuation byte (10xxxxxx) or invalid lead byte (11111xxx)
			continuationCount = -1;
			codePoint = 0;
		}

		// The sequence must fit in the buffer and every trailing byte must be 10xxxxxx.
		let wellFormed = (continuationCount >= 0 && offset + continuationCount < len);
		for (let i = 1; wellFormed && i <= continuationCount; i++) {
			const vi = buffer[offset + i];
			if ((vi & 0b11000000) === 0b10000000) {
				codePoint = (codePoint << 6) | (vi & 0b00111111);
			} else {
				wellFormed = false;
			}
		}

		if (!wellFormed) {
			// Replace only the offending lead byte, so that the bytes after it
			// get a chance to be decoded on their own.
			result.push(String.fromCharCode(0xFFFD));
			offset++;
			continue;
		}

		offset += continuationCount + 1;

		if ((codePoint >= 0 && codePoint <= 0xD7FF) || (codePoint >= 0xE000 && codePoint <= 0xFFFF)) {
			// Basic Multilingual Plane
			result.push(String.fromCharCode(codePoint));
		} else if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
			// Supplementary Planes: split into a UTF-16 surrogate pair
			const uPrime = codePoint - 0x10000;
			const w1 = 0xD800 + ((uPrime & 0b11111111110000000000) >>> 10);
			const w2 = 0xDC00 + ((uPrime & 0b00000000001111111111) >>> 0);
			result.push(String.fromCharCode(w1));
			result.push(String.fromCharCode(w2));
		} else {
			// illegal code point (surrogate range or beyond U+10FFFF)
			result.push(String.fromCharCode(0xFFFD));
		}
	}

	return result.join('');
}
787+
675788/**
676789 * Generated using https://github.com/alexandrudima/unicode-utils/blob/master/generate-rtl-test.js
677790 */
0 commit comments