@@ -465,74 +465,110 @@ jsg::Ref<TextEncoder> TextEncoder::constructor(jsg::Lock& js) {
465465 return js.alloc <TextEncoder>();
466466}
467467
468- jsg::BufferSource TextEncoder::encode (jsg::Lock& js, jsg::Optional<jsg::JsString> input) {
468+ jsg::JsUint8Array TextEncoder::encode (jsg::Lock& js, jsg::Optional<jsg::JsString> input) {
469469 jsg::JsString str = input.orDefault (js.str ());
470470
471- if (str.length (js) == 0 ) {
472- return jsg::BufferSource (js, jsg::BackingStore::alloc<v8::Uint8Array>(js, 0 ));
473- }
474-
475- // Allocate the output buffer and perform the conversion while ValueView is alive, but defer
476- // creating the V8 BufferSource until after ValueView is destroyed. This approach uses
477- // BackingStore::wrap with a custom disposer to avoid the copy overhead that would occur with
478- // BackingStore::from in the v8 sandbox, since from() copies data when it's not already in the
479- // sandbox. By using new/delete with wrap(), we maintain ownership semantics compatible with V8's
480- // C-style BackingStore API while avoiding the extra allocation and copy.
481- jsg::BackingStore backing = [&]() {
482- v8::String::ValueView value_view (js.v8Isolate , str);
483- size_t length = static_cast <size_t >(value_view.length ());
484-
485- if (value_view.is_one_byte ()) {
486- // Fast path for Latin-1 encoded strings. V8 uses Latin-1 (ISO-8859-1) encoding internally
487- // for strings that contain only code points <= U+00FF. We need to convert to UTF-8.
488- auto data = reinterpret_cast <const char *>(value_view.data8 ());
489- size_t utf8_length = simdutf::utf8_length_from_latin1 (data, length);
490- auto * output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
491- [[maybe_unused]] auto written =
492- simdutf::convert_latin1_to_utf8 (data, length, output->asChars ().begin ());
493- KJ_DASSERT (written == output->size ());
494- return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin (), output->size (),
495- [](void *, size_t , void * ptr) { delete reinterpret_cast <kj::Array<kj::byte>*>(ptr); },
496- output);
471+ if (str.isOneByte (js)) {
472+ auto length = str.length (js);
473+ // Fast path for one-byte strings (Latin-1). writeOneByte() copies the raw bytes without
474+ // flattening the string, which is more efficient than using ValueView. Note that we
475+ // allocate `length * 2` bytes because Latin-1 characters 0x80-0xFF need 2 bytes in UTF-8.
476+ auto backing = jsg::BackingStore::alloc<v8::Uint8Array>(
477+ js, length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
478+ str.writeOneByte (
479+ js, backing.asArrayPtr <kj::byte>(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8);
480+ auto backingData = reinterpret_cast <const char *>(backing.asArrayPtr <kj::byte>().begin ());
481+
482+ size_t utf8_length = simdutf::utf8_length_from_latin1 (backingData, length);
483+
484+ if (utf8_length == length) {
485+ return jsg::JsUint8Array (backing.createHandle (js).As <v8::Uint8Array>());
497486 }
498487
488+ auto backing2 = jsg::BackingStore::alloc<v8::Uint8Array>(
489+ js, utf8_length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
490+ auto written = simdutf::convert_latin1_to_utf8 (
491+ backingData, length, reinterpret_cast <char *>(backing2.asArrayPtr <kj::byte>().begin ()));
492+ KJ_DASSERT (backing2.size () == written);
493+ return jsg::JsUint8Array (backing2.createHandle (js).As <v8::Uint8Array>());
494+ }
495+
496+ // First pass: Calculate the required UTF-8 output buffer size.
497+ // We need to do this in a separate ValueView because:
498+ // 1. ValueView holds the V8 heap lock, which prevents us from allocating new V8 objects
499+ // 2. We must determine the exact output size before allocating the BackingStore
500+ // 3. Once we know the size, we'll create a second ValueView to do the actual conversion
501+ size_t utf8_length = 0 ;
502+ bool isValidUtf16 = true ;
503+ // For invalid UTF-16 strings (with unpaired surrogates), we need to fix them to well-formed
504+ // UTF-16 before calculating the UTF-8 length. We store the fixed version here so it can be
505+ // reused in the second pass, avoiding the need to fix it twice.
506+ kj::Array<char16_t > wellFormed;
507+
508+ {
509+ v8::String::ValueView view (js.v8Isolate , str);
510+ // One-byte strings are handled by the fast path above
511+ KJ_DASSERT (!view.is_one_byte ());
512+
513+ auto data = reinterpret_cast <const char16_t *>(view.data16 ());
499514 // Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points
500515 // > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path.
501- auto data = reinterpret_cast <const char16_t *>(value_view.data16 ());
502- auto valid_utf16 = simdutf::validate_utf16le (data, length);
503-
504- if (valid_utf16) {
516+ isValidUtf16 = simdutf::validate_utf16le (data, view.length ());
517+
518+ if (isValidUtf16) {
519+ // Common case: valid UTF-16, calculate UTF-8 length directly
520+ utf8_length = simdutf::utf8_length_from_utf16le (data, view.length ());
521+ } else {
522+ // Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard,
523+ // unpaired surrogates must be replaced with U+FFFD (replacement character).
524+ // U+FFFD is 3 bytes in UTF-8, which means the UTF-8 length will differ from what
525+ // we'd calculate from the invalid UTF-16. We must fix the UTF-16 first, then
526+ // calculate the UTF-8 length from the well-formed version to get the correct size.
527+ wellFormed = kj::heapArray<char16_t >(view.length ());
528+ simdutf::to_well_formed_utf16le (data, view.length (), wellFormed.begin ());
529+ utf8_length = simdutf::utf8_length_from_utf16le (wellFormed.begin (), view.length ());
530+ }
531+ } // ValueView destroyed here, releasing the heap lock
532+
533+ // Pre-allocate the jsg::BackingStore to avoid the copy overhead that would occur with
534+ // BackingStore::from() in the v8 sandbox, since from() copies data when it's not already in the
535+ // sandbox. By pre-allocating with alloc(), the memory is already in the sandbox and we can
536+ // perform the conversion directly into it.
537+ auto backing = jsg::BackingStore::alloc<v8::Uint8Array>(
538+ js, utf8_length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
539+
540+ // Second pass: Perform the actual UTF-8 conversion.
541+ // We create a new ValueView here to access the string data again, now that we have a
542+ // pre-allocated output buffer. The closure ensures the ValueView is destroyed before we
543+ // return the result, which is important for proper V8 heap management.
544+ [&]() {
545+ v8::String::ValueView view (js.v8Isolate , str);
546+ // One-byte strings are handled by the fast path above
547+ KJ_DASSERT (!view.is_one_byte ());
548+
549+ size_t length = static_cast <size_t >(view.length ());
550+ auto * output = backing.asArrayPtr <char >().begin ();
551+ auto data = reinterpret_cast <const char16_t *>(view.data16 ());
552+
553+ if (isValidUtf16) {
505554 // Common case: valid UTF-16LE, convert directly to UTF-8
506- size_t utf8_length = simdutf::utf8_length_from_utf16le (data, length);
507- auto * output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
508- [[maybe_unused]] auto written =
509- simdutf::convert_utf16le_to_utf8 (data, length, output->asChars ().begin ());
510- KJ_DASSERT (written == output->size ());
511- return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin (), output->size (),
512- [](void *, size_t , void * ptr) { delete reinterpret_cast <kj::Array<kj::byte>*>(ptr); },
513- output);
555+ [[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8 (data, length, output);
556+ KJ_DASSERT (written == backing.size ());
557+ return ;
514558 }
515559
516- // Rare case: Invalid UTF-16LE with unpaired surrogates. Per the Encoding Standard, we must
517- // replace unpaired surrogates with U+FFFD replacement characters. We do this in two passes:
518- // first fix the UTF-16, then convert to UTF-8. This extra buffer allocation only happens
519- // for malformed strings, which should be uncommon in practice.
520- auto well_formed = kj::heapArray<char16_t >(length);
521- simdutf::to_well_formed_utf16le (data, length, well_formed.begin ());
522-
523- size_t utf8_length = simdutf::utf8_length_from_utf16le (well_formed.begin (), length);
524- auto * output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
560+ // Rare case: Invalid UTF-16LE with unpaired surrogates. We already fixed the UTF-16 to
561+ // well-formed in the first pass (stored in wellFormed array), so now we just convert that
562+ // fixed version to UTF-8. This reuses the wellFormed array created earlier, avoiding the
563+ // need to fix the UTF-16 a second time.
525564 [[maybe_unused]] auto written =
526- simdutf::convert_utf16le_to_utf8 (well_formed.begin (), length, output->asChars ().begin ());
527- KJ_DASSERT (written == output->size ());
528- return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin (), output->size (),
529- [](void *, size_t , void * ptr) { delete reinterpret_cast <kj::Array<kj::byte>*>(ptr); },
530- output);
565+ simdutf::convert_utf16le_to_utf8 (wellFormed.begin (), wellFormed.size (), output);
566+ KJ_DASSERT (written == backing.size ());
531567 }(); // ValueView destroyed here, releasing the heap lock
532568
533569 // Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects.
534- // Construct the BufferSource which will create the actual Uint8Array that gets returned to JS.
535- return jsg::BufferSource (js, kj::mv (backing ));
570+ // Create the Uint8Array from the BackingStore and return it to JS.
571+ return jsg::JsUint8Array (backing. createHandle (js). As <v8::Uint8Array>( ));
536572}
537573
538574TextEncoder::EncodeIntoResult TextEncoder::encodeInto (
0 commit comments