Skip to content

Commit 3a6ea76

Browse files
committed
get rid of multiple valueviews
1 parent f1bbfe6 commit 3a6ea76

File tree

1 file changed

+42
-73
lines changed

1 file changed

+42
-73
lines changed

src/workerd/api/encoding.c++

Lines changed: 42 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -467,108 +467,77 @@ jsg::Ref<TextEncoder> TextEncoder::constructor(jsg::Lock& js) {
467467

468468
jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional<jsg::JsString> input) {
469469
jsg::JsString str = input.orDefault(js.str());
470+
std::shared_ptr<v8::BackingStore> backingStore;
471+
size_t utf8_length = 0;
470472

473+
// Fast path: check if string is one-byte before creating ValueView
471474
if (str.isOneByte(js)) {
472475
auto length = str.length(js);
473-
// Fast path for one-byte strings (Latin-1). writeOneByte() copies the raw bytes without
474-
// flattening the string, which is more efficient than using ValueView. Note that we
475-
// allocate `length * 2` bytes because Latin-1 characters 0x80-0xFF need 2 bytes in UTF-8.
476-
auto backing =
477-
jsg::BackingStore::alloc<v8::Uint8Array>(js, length, jsg::Lock::AllocOption::UNINITIALIZED);
478-
str.writeOneByte(
479-
js, backing.asArrayPtr<kj::byte>(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8);
480-
auto backingData = reinterpret_cast<const char*>(backing.asArrayPtr<kj::byte>().begin());
476+
// Allocate buffer for Latin-1. Use v8::ArrayBuffer::NewBackingStore to avoid creating
477+
// JS objects during conversion.
478+
backingStore = v8::ArrayBuffer::NewBackingStore(js.v8Isolate, length);
479+
auto backingData = reinterpret_cast<kj::byte*>(backingStore->Data());
480+
481+
str.writeOneByte(js, kj::ArrayPtr<kj::byte>(backingData, length),
482+
jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8);
481483

482-
size_t utf8_length = simdutf::utf8_length_from_latin1(backingData, length);
484+
utf8_length =
485+
simdutf::utf8_length_from_latin1(reinterpret_cast<const char*>(backingData), length);
483486

484487
if (utf8_length == length) {
485-
return jsg::JsUint8Array(backing.createHandle(js).As<v8::Uint8Array>());
488+
// ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII
489+
auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, length);
490+
return jsg::JsUint8Array(array);
486491
}
487492

488-
auto backing2 = jsg::BackingStore::alloc<v8::Uint8Array>(
489-
js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED);
490-
auto written = simdutf::convert_latin1_to_utf8(
491-
backingData, length, reinterpret_cast<char*>(backing2.asArrayPtr<kj::byte>().begin()));
492-
KJ_DASSERT(backing2.size() == written);
493-
return jsg::JsUint8Array(backing2.createHandle(js).As<v8::Uint8Array>());
493+
// Need to convert Latin-1 to UTF-8
494+
std::shared_ptr<v8::BackingStore> backingStore2 =
495+
v8::ArrayBuffer::NewBackingStore(js.v8Isolate, utf8_length);
496+
auto written = simdutf::convert_latin1_to_utf8(reinterpret_cast<const char*>(backingData),
497+
length, reinterpret_cast<char*>(backingStore2->Data()));
498+
KJ_DASSERT(utf8_length == written);
499+
auto array =
500+
v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore2), 0, utf8_length);
501+
return jsg::JsUint8Array(array);
494502
}
495503

496-
// First pass: Calculate the required UTF-8 output buffer size.
497-
// We need to do this in a separate ValueView because:
498-
// 1. ValueView holds the V8 heap lock, which prevents us from allocating new V8 objects
499-
// 2. We must determine the exact output size before allocating the BackingStore
500-
// 3. Once we know the size, we'll create a second ValueView to do the actual conversion
501-
size_t utf8_length = 0;
502-
bool isValidUtf16 = true;
503-
// For invalid UTF-16 strings (with unpaired surrogates), we need to fix them to well-formed
504-
// UTF-16 before calculating the UTF-8 length. We store the fixed version here so it can be
505-
// reused in the second pass, avoiding the need to fix it twice.
506-
kj::Array<char16_t> wellFormed;
507-
504+
// Two-byte string path
508505
{
506+
// Note that ValueView flattens the string, if it's not already flattened
509507
v8::String::ValueView view(js.v8Isolate, str);
510-
// One-byte strings are handled by the fast path above
511-
KJ_DASSERT(!view.is_one_byte());
512-
513-
auto data = reinterpret_cast<const char16_t*>(view.data16());
514508
// Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points
515509
// > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path.
516-
isValidUtf16 = simdutf::validate_utf16le(data, view.length());
510+
auto data = reinterpret_cast<const char16_t*>(view.data16());
511+
bool isValidUtf16 = simdutf::validate_utf16le(data, view.length());
517512

518513
if (isValidUtf16) {
519-
// Common case: valid UTF-16, calculate UTF-8 length directly
514+
// Common case: valid UTF-16, convert directly to UTF-8
520515
utf8_length = simdutf::utf8_length_from_utf16le(data, view.length());
516+
backingStore = v8::ArrayBuffer::NewBackingStore(js.v8Isolate, utf8_length);
517+
[[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(
518+
data, view.length(), reinterpret_cast<char*>(backingStore->Data()));
519+
KJ_DASSERT(written == utf8_length);
521520
} else {
522521
// Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard,
523522
// unpaired surrogates must be replaced with U+FFFD (replacement character).
524523
// U+FFFD is 3 bytes in UTF-8, which means the UTF-8 length will differ from what
525524
// we'd calculate from the invalid UTF-16. We must fix the UTF-16 first, then
526525
// calculate the UTF-8 length from the well-formed version to get the correct size.
527-
wellFormed = kj::heapArray<char16_t>(view.length());
526+
auto wellFormed = kj::heapArray<char16_t>(view.length());
528527
simdutf::to_well_formed_utf16le(data, view.length(), wellFormed.begin());
529528
utf8_length = simdutf::utf8_length_from_utf16le(wellFormed.begin(), view.length());
529+
backingStore = v8::ArrayBuffer::NewBackingStore(js.v8Isolate, utf8_length);
530+
[[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(
531+
wellFormed.begin(), wellFormed.size(), reinterpret_cast<char*>(backingStore->Data()));
532+
KJ_DASSERT(written == utf8_length);
530533
}
531534
} // ValueView destroyed here, releasing the heap lock
532535

533-
// Pre-allocate the jsg::BackingStore to avoid the copy overhead that would occur with
534-
// BackingStore::from() in the v8 sandbox, since from() copies data when it's not already in the
535-
// sandbox. By pre-allocating with alloc(), the memory is already in the sandbox and we can
536-
// perform the conversion directly into it.
537-
auto backing = jsg::BackingStore::alloc<v8::Uint8Array>(
538-
js, utf8_length, jsg::Lock::AllocOption::UNINITIALIZED);
539-
540-
// Second pass: Perform the actual UTF-8 conversion.
541-
// We create a new ValueView here to access the string data again, now that we have a
542-
// pre-allocated output buffer. The closure ensures the ValueView is destroyed before we
543-
// return the result, which is important for proper V8 heap management.
544-
[&]() {
545-
v8::String::ValueView view(js.v8Isolate, str);
546-
// One-byte strings are handled by the fast path above
547-
KJ_DASSERT(!view.is_one_byte());
548-
549-
size_t length = static_cast<size_t>(view.length());
550-
auto* output = backing.asArrayPtr<char>().begin();
551-
auto data = reinterpret_cast<const char16_t*>(view.data16());
552-
553-
if (isValidUtf16) {
554-
// Common case: valid UTF-16LE, convert directly to UTF-8
555-
[[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(data, length, output);
556-
KJ_DASSERT(written == backing.size());
557-
return;
558-
}
559-
560-
// Rare case: Invalid UTF-16LE with unpaired surrogates. We already fixed the UTF-16 to
561-
// well-formed in the first pass (stored in wellFormed array), so now we just convert that
562-
// fixed version to UTF-8. This reuses the wellFormed array created earlier, avoiding the
563-
// need to fix the UTF-16 a second time.
564-
[[maybe_unused]] auto written =
565-
simdutf::convert_utf16le_to_utf8(wellFormed.begin(), wellFormed.size(), output);
566-
KJ_DASSERT(written == backing.size());
567-
}(); // ValueView destroyed here, releasing the heap lock
568-
569536
// Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects.
570-
// Create the Uint8Array from the BackingStore and return it to JS.
571-
return jsg::JsUint8Array(backing.createHandle(js).As<v8::Uint8Array>());
537+
// Create the Uint8Array from the raw v8::BackingStore.
538+
auto array =
539+
v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length);
540+
return jsg::JsUint8Array(array);
572541
}
573542

574543
TextEncoder::EncodeIntoResult TextEncoder::encodeInto(

0 commit comments

Comments (0)