Skip to content

Commit 98afb46

Browse files
committed
address pr reviews
1 parent a118760 commit 98afb46

File tree

4 files changed

+103
-60
lines changed

4 files changed

+103
-60
lines changed

src/workerd/api/encoding.c++

Lines changed: 91 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -465,74 +465,110 @@ jsg::Ref<TextEncoder> TextEncoder::constructor(jsg::Lock& js) {
465465
return js.alloc<TextEncoder>();
466466
}
467467

468-
jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional<jsg::JsString> input) {
468+
jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional<jsg::JsString> input) {
469469
jsg::JsString str = input.orDefault(js.str());
470470

471-
if (str.length(js) == 0) {
472-
return jsg::BufferSource(js, jsg::BackingStore::alloc<v8::Uint8Array>(js, 0));
473-
}
474-
475-
// Allocate the output buffer and perform the conversion while ValueView is alive, but defer
476-
// creating the V8 BufferSource until after ValueView is destroyed. This approach uses
477-
// BackingStore::wrap with a custom disposer to avoid the copy overhead that would occur with
478-
// BackingStore::from in the v8 sandbox, since from() copies data when it's not already in the
479-
// sandbox. By using new/delete with wrap(), we maintain ownership semantics compatible with V8's
480-
// C-style BackingStore API while avoiding the extra allocation and copy.
481-
jsg::BackingStore backing = [&]() {
482-
v8::String::ValueView value_view(js.v8Isolate, str);
483-
size_t length = static_cast<size_t>(value_view.length());
484-
485-
if (value_view.is_one_byte()) {
486-
// Fast path for Latin-1 encoded strings. V8 uses Latin-1 (ISO-8859-1) encoding internally
487-
// for strings that contain only code points <= U+00FF. We need to convert to UTF-8.
488-
auto data = reinterpret_cast<const char*>(value_view.data8());
489-
size_t utf8_length = simdutf::utf8_length_from_latin1(data, length);
490-
auto* output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
491-
[[maybe_unused]] auto written =
492-
simdutf::convert_latin1_to_utf8(data, length, output->asChars().begin());
493-
KJ_DASSERT(written == output->size());
494-
return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin(), output->size(),
495-
[](void*, size_t, void* ptr) { delete reinterpret_cast<kj::Array<kj::byte>*>(ptr); },
496-
output);
471+
if (str.isOneByte(js)) {
472+
auto length = str.length(js);
473+
// Fast path for one-byte strings (Latin-1). writeOneByte() copies the raw bytes without
474+
// flattening the string, which is more efficient than using ValueView. Note that we
475+
// allocate `length * 2` bytes because Latin-1 characters 0x80-0xFF need 2 bytes in UTF-8.
476+
auto backing = jsg::BackingStore::alloc<v8::Uint8Array>(
477+
js, length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
478+
str.writeOneByte(
479+
js, backing.asArrayPtr<kj::byte>(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8);
480+
auto backingData = reinterpret_cast<const char*>(backing.asArrayPtr<kj::byte>().begin());
481+
482+
size_t utf8_length = simdutf::utf8_length_from_latin1(backingData, length);
483+
484+
if (utf8_length == length) {
485+
return jsg::JsUint8Array(backing.createHandle(js).As<v8::Uint8Array>());
497486
}
498487

488+
auto backing2 = jsg::BackingStore::alloc<v8::Uint8Array>(
489+
js, utf8_length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
490+
auto written = simdutf::convert_latin1_to_utf8(
491+
backingData, length, reinterpret_cast<char*>(backing2.asArrayPtr<kj::byte>().begin()));
492+
KJ_DASSERT(backing2.size() == written);
493+
return jsg::JsUint8Array(backing2.createHandle(js).As<v8::Uint8Array>());
494+
}
495+
496+
// First pass: Calculate the required UTF-8 output buffer size.
497+
// We need to do this in a separate ValueView because:
498+
// 1. ValueView holds the V8 heap lock, which prevents us from allocating new V8 objects
499+
// 2. We must determine the exact output size before allocating the BackingStore
500+
// 3. Once we know the size, we'll create a second ValueView to do the actual conversion
501+
size_t utf8_length = 0;
502+
bool isValidUtf16 = true;
503+
// For invalid UTF-16 strings (with unpaired surrogates), we need to fix them to well-formed
504+
// UTF-16 before calculating the UTF-8 length. We store the fixed version here so it can be
505+
// reused in the second pass, avoiding the need to fix it twice.
506+
kj::Array<char16_t> wellFormed;
507+
508+
{
509+
v8::String::ValueView view(js.v8Isolate, str);
510+
// One-byte strings are handled by the fast path above
511+
KJ_DASSERT(!view.is_one_byte());
512+
513+
auto data = reinterpret_cast<const char16_t*>(view.data16());
499514
// Two-byte string path. V8 uses UTF-16LE encoding internally for strings with code points
500515
// > U+00FF. Check if the UTF-16 is valid (no unpaired surrogates) to determine the path.
501-
auto data = reinterpret_cast<const char16_t*>(value_view.data16());
502-
auto valid_utf16 = simdutf::validate_utf16le(data, length);
503-
504-
if (valid_utf16) {
516+
isValidUtf16 = simdutf::validate_utf16le(data, view.length());
517+
518+
if (isValidUtf16) {
519+
// Common case: valid UTF-16, calculate UTF-8 length directly
520+
utf8_length = simdutf::utf8_length_from_utf16le(data, view.length());
521+
} else {
522+
// Rare case: Invalid UTF-16 with unpaired surrogates. Per the Encoding Standard,
523+
// unpaired surrogates must be replaced with U+FFFD (replacement character).
524+
// U+FFFD is 3 bytes in UTF-8, which means the UTF-8 length will differ from what
525+
// we'd calculate from the invalid UTF-16. We must fix the UTF-16 first, then
526+
// calculate the UTF-8 length from the well-formed version to get the correct size.
527+
wellFormed = kj::heapArray<char16_t>(view.length());
528+
simdutf::to_well_formed_utf16le(data, view.length(), wellFormed.begin());
529+
utf8_length = simdutf::utf8_length_from_utf16le(wellFormed.begin(), view.length());
530+
}
531+
} // ValueView destroyed here, releasing the heap lock
532+
533+
// Pre-allocate the jsg::BackingStore to avoid the copy overhead that would occur with
534+
// BackingStore::from() in the v8 sandbox, since from() copies data when it's not already in the
535+
// sandbox. By pre-allocating with alloc(), the memory is already in the sandbox and we can
536+
// perform the conversion directly into it.
537+
auto backing = jsg::BackingStore::alloc<v8::Uint8Array>(
538+
js, utf8_length, jsg::Lock::AllocOption::ZERO_INITIALIZED);
539+
540+
// Second pass: Perform the actual UTF-8 conversion.
541+
// We create a new ValueView here to access the string data again, now that we have a
542+
// pre-allocated output buffer. The closure ensures the ValueView is destroyed before we
543+
// return the result, which is important for proper V8 heap management.
544+
[&]() {
545+
v8::String::ValueView view(js.v8Isolate, str);
546+
// One-byte strings are handled by the fast path above
547+
KJ_DASSERT(!view.is_one_byte());
548+
549+
size_t length = static_cast<size_t>(view.length());
550+
auto* output = backing.asArrayPtr<char>().begin();
551+
auto data = reinterpret_cast<const char16_t*>(view.data16());
552+
553+
if (isValidUtf16) {
505554
// Common case: valid UTF-16LE, convert directly to UTF-8
506-
size_t utf8_length = simdutf::utf8_length_from_utf16le(data, length);
507-
auto* output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
508-
[[maybe_unused]] auto written =
509-
simdutf::convert_utf16le_to_utf8(data, length, output->asChars().begin());
510-
KJ_DASSERT(written == output->size());
511-
return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin(), output->size(),
512-
[](void*, size_t, void* ptr) { delete reinterpret_cast<kj::Array<kj::byte>*>(ptr); },
513-
output);
555+
[[maybe_unused]] auto written = simdutf::convert_utf16le_to_utf8(data, length, output);
556+
KJ_DASSERT(written == backing.size());
557+
return;
514558
}
515559

516-
// Rare case: Invalid UTF-16LE with unpaired surrogates. Per the Encoding Standard, we must
517-
// replace unpaired surrogates with U+FFFD replacement characters. We do this in two passes:
518-
// first fix the UTF-16, then convert to UTF-8. This extra buffer allocation only happens
519-
// for malformed strings, which should be uncommon in practice.
520-
auto well_formed = kj::heapArray<char16_t>(length);
521-
simdutf::to_well_formed_utf16le(data, length, well_formed.begin());
522-
523-
size_t utf8_length = simdutf::utf8_length_from_utf16le(well_formed.begin(), length);
524-
auto* output = new kj::Array<kj::byte>(kj::heapArray<kj::byte>(utf8_length));
560+
// Rare case: Invalid UTF-16LE with unpaired surrogates. We already fixed the UTF-16 to
561+
// well-formed in the first pass (stored in wellFormed array), so now we just convert that
562+
// fixed version to UTF-8. This reuses the wellFormed array created earlier, avoiding the
563+
// need to fix the UTF-16 a second time.
525564
[[maybe_unused]] auto written =
526-
simdutf::convert_utf16le_to_utf8(well_formed.begin(), length, output->asChars().begin());
527-
KJ_DASSERT(written == output->size());
528-
return jsg::BackingStore::wrap<v8::Uint8Array>(output->begin(), output->size(),
529-
[](void*, size_t, void* ptr) { delete reinterpret_cast<kj::Array<kj::byte>*>(ptr); },
530-
output);
565+
simdutf::convert_utf16le_to_utf8(wellFormed.begin(), wellFormed.size(), output);
566+
KJ_DASSERT(written == backing.size());
531567
}(); // ValueView destroyed here, releasing the heap lock
532568

533569
// Now that ValueView is destroyed and the heap lock is released, it's safe to create V8 objects.
534-
// Construct the BufferSource which will create the actual Uint8Array that gets returned to JS.
535-
return jsg::BufferSource(js, kj::mv(backing));
570+
// Create the Uint8Array from the BackingStore and return it to JS.
571+
return jsg::JsUint8Array(backing.createHandle(js).As<v8::Uint8Array>());
536572
}
537573

538574
TextEncoder::EncodeIntoResult TextEncoder::encodeInto(

src/workerd/api/encoding.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ class TextEncoder final: public jsg::Object {
216216

217217
static jsg::Ref<TextEncoder> constructor(jsg::Lock& js);
218218

219-
jsg::BufferSource encode(jsg::Lock& js, jsg::Optional<jsg::JsString> input);
219+
jsg::JsUint8Array encode(jsg::Lock& js, jsg::Optional<jsg::JsString> input);
220220

221221
EncodeIntoResult encodeInto(jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer);
222222

@@ -234,11 +234,7 @@ class TextEncoder final: public jsg::Object {
234234
JSG_READONLY_INSTANCE_PROPERTY(encoding, getEncoding);
235235
}
236236

237-
// `encode()` returns `jsg::BufferSource`, which may be an `ArrayBuffer` or `ArrayBufferView`,
238-
// but the implementation uses `jsg::BufferSource::tryAlloc()` which always tries to allocate a
239-
// `Uint8Array`. The spec defines that this function returns a `Uint8Array` too.
240237
JSG_TS_OVERRIDE({
241-
encode(input?: string): Uint8Array;
242238
encodeInto(input: string, buffer: Uint8Array): TextEncoderEncodeIntoResult;
243239
});
244240
}

src/workerd/jsg/jsvalue.c++

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,10 @@ JsString JsString::internalize(Lock& js) const {
397397
return JsString(inner->InternalizeString(js.v8Isolate));
398398
}
399399

400+
void JsString::writeOneByte(Lock& js, kj::ArrayPtr<kj::byte> buffer, WriteFlags flags) {
401+
inner->WriteOneByteV2(js.v8Isolate, 0, buffer.size(), buffer.begin(), flags);
402+
}
403+
400404
JsString::WriteIntoStatus JsString::writeInto(
401405
Lock& js, kj::ArrayPtr<char> buffer, WriteFlags options) const {
402406
WriteIntoStatus result = {0, 0};

src/workerd/jsg/jsvalue.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ class JsString final: public JsBase<v8::String, JsString> {
255255

256256
int hashCode() const;
257257

258+
bool isOneByte(Lock& js) const KJ_WARN_UNUSED_RESULT;
258259
bool containsOnlyOneByte() const;
259260

260261
bool operator==(const JsString& other) const;
@@ -289,6 +290,8 @@ class JsString final: public JsBase<v8::String, JsString> {
289290
WriteIntoStatus writeInto(
290291
Lock& js, kj::ArrayPtr<uint16_t> buffer, WriteFlags options = WriteFlags::NONE) const;
291292

293+
void writeOneByte(Lock& js, kj::ArrayPtr<kj::byte> buffer, WriteFlags flags = WriteFlags::NONE);
294+
292295
using JsBase<v8::String, JsString>::JsBase;
293296
};
294297

@@ -963,6 +966,10 @@ inline int JsString::length(jsg::Lock& js) const {
963966
return inner->Length();
964967
}
965968

969+
inline bool JsString::isOneByte(jsg::Lock& js) const {
970+
return inner->IsOneByte();
971+
}
972+
966973
inline size_t JsString::utf8Length(jsg::Lock& js) const {
967974
return inner->Utf8LengthV2(js.v8Isolate);
968975
}

0 commit comments

Comments
 (0)