diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index e47e0426e05..0a8bc2e4d1a 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -440,14 +440,15 @@ wd_cc_library( "encoding-shared.h", ], implementation_deps = [ - "//src/workerd/io:features", "//src/workerd/util:strings", + "@simdutf", ], visibility = ["//visibility:public"], deps = [ ":util", "//src/rust/encoding", "//src/workerd/io:compatibility-date_capnp", + "//src/workerd/io:features", "//src/workerd/jsg", "@capnp-cpp//src/kj", "@simdutf", @@ -624,6 +625,14 @@ kj_test( ], ) +kj_test( + src = "encoding-test.c++", + deps = [ + ":encoding", + "//src/workerd/io", + ], +) + kj_test( src = "base64-test.c++", deps = ["//src/workerd/tests:test-fixture"], diff --git a/src/workerd/api/encoding-test.c++ b/src/workerd/api/encoding-test.c++ new file mode 100644 index 00000000000..ed1b7fb6593 --- /dev/null +++ b/src/workerd/api/encoding-test.c++ @@ -0,0 +1,90 @@ +// Copyright (c) 2025 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +#include "encoding.h" + +#include + +namespace workerd::api { +namespace test { + +// These tests verify the findBestFit() function used by TextEncoder.encodeInto(). +// +// bestFit(input, bufferSize) returns the number of input code units that can be +// fully converted to UTF-8 and fit within the given output buffer size in bytes. +// +// Different characters expand to different UTF-8 byte lengths: +// - ASCII (U+0000-U+007F): 1 byte per code unit +// - Latin-1 extended (U+0080-U+00FF): 2 bytes per code unit +// - BMP characters (U+0100-U+FFFF): 2-3 bytes per code unit +// - Supplementary characters (U+10000+): 4 bytes, encoded as surrogate pairs in UTF-16 +// +// The function must never split a surrogate pair, so if there's only room for part of +// a multi-byte character, it stops before that character. 
+KJ_TEST("BestFitASCII") { + // If there's zero input or output space, the answer is zero. + KJ_ASSERT(bestFit("", 0) == 0); + KJ_ASSERT(bestFit("a", 0) == 0); + KJ_ASSERT(bestFit("aa", 0) == 0); + KJ_ASSERT(bestFit("aaa", 0) == 0); + KJ_ASSERT(bestFit("aaaa", 0) == 0); + KJ_ASSERT(bestFit("aaaaa", 0) == 0); + KJ_ASSERT(bestFit("", 0) == 0); + KJ_ASSERT(bestFit("", 1) == 0); + KJ_ASSERT(bestFit("", 2) == 0); + KJ_ASSERT(bestFit("", 3) == 0); + KJ_ASSERT(bestFit("", 4) == 0); + KJ_ASSERT(bestFit("", 5) == 0); + // Zero cases with two-byte strings. + KJ_ASSERT(bestFit(u"", 0) == 0); + KJ_ASSERT(bestFit(u"€", 0) == 0); + KJ_ASSERT(bestFit(u"€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€€", 0) == 0); + KJ_ASSERT(bestFit(u"€€€€€", 0) == 0); + KJ_ASSERT(bestFit(u"", 0) == 0); + KJ_ASSERT(bestFit(u"", 1) == 0); + KJ_ASSERT(bestFit(u"", 2) == 0); + KJ_ASSERT(bestFit(u"", 3) == 0); + KJ_ASSERT(bestFit(u"", 4) == 0); + KJ_ASSERT(bestFit(u"", 5) == 0); + // Small buffers that only just fit. + KJ_ASSERT(bestFit(u"a", 1) == 1); + KJ_ASSERT(bestFit(u"å", 2) == 1); + KJ_ASSERT(bestFit(u"€", 3) == 1); + KJ_ASSERT(bestFit(u"😹", 4) == 2); + // Small buffers that don't fit. + KJ_ASSERT(bestFit(u"å", 1) == 0); + KJ_ASSERT(bestFit(u"€", 2) == 0); + KJ_ASSERT(bestFit(u"😹", 3) == 0); + // Don't chop a surrogate pair. + KJ_ASSERT(bestFit(u"1😹", 4) == 1); + KJ_ASSERT(bestFit(u"12😹", 5) == 2); + KJ_ASSERT(bestFit(u"123😹", 6) == 3); + KJ_ASSERT(bestFit(u"1234😹", 7) == 4); + KJ_ASSERT(bestFit(u"12345😹", 8) == 5); + // Some bigger ones just for fun. 
+ KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 0) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 1) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 2) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 3) == 0); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 4) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 5) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 6) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 7) == 2); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 8) == 4); + KJ_ASSERT(bestFit(u"😹😹😹😹😹😹", 9) == 4); + KJ_ASSERT(bestFit(u"0😹😹😹😹😹😹", 9) == 5); // 0😹😹 is 5 and takes 9. + KJ_ASSERT(bestFit(u"01😹😹😹😹😹😹", 9) == 4); // 01😹 is 4 and takes 6. + KJ_ASSERT(bestFit(u"012😹😹😹😹😹😹", 9) == 5); // 012😹 is 5 and takes 7. + KJ_ASSERT(bestFit(u"0123😹😹😹😹😹😹", 9) == 6); // 0123😹 is 6 and takes 8. + KJ_ASSERT(bestFit(u"01234😹😹😹😹😹😹", 9) == 7); // 01234😹 is 7 and takes 9. + KJ_ASSERT(bestFit(u"012345😹😹😹😹😹😹", 9) == 6); // 012345 is 6 and takes 6. + KJ_ASSERT(bestFit(u"0123456😹😹😹😹😹😹", 9) == 7); // 0123456 is 7 and takes 7. + KJ_ASSERT(bestFit(u"01234567😹😹😹😹😹😹", 9) == 8); // 01234567 is 8 and takes 8. + KJ_ASSERT(bestFit(u"012345678😹😹😹😹😹😹", 9) == 9); // 012345678 is 9 and takes 9.
+} + +} // namespace test +} // namespace workerd::api diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 958f928f7e4..19f7712e52a 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -9,12 +9,15 @@ #include #include +#include #include #include #include +#include #include +#include namespace workerd::api { @@ -274,6 +277,9 @@ Encoding getEncodingForLabel(kj::StringPtr label) { #undef V return Encoding::INVALID; } + +constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096; + } // namespace const kj::Array TextDecoder::EMPTY = @@ -516,34 +522,261 @@ jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } +jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { + if (!workerd::util::Autogate::isEnabled(workerd::util::AutogateKey::ENABLE_FAST_TEXTENCODER)) { + auto str = input.orDefault(js.str()); + auto view = JSG_REQUIRE_NONNULL(jsg::BufferSource::tryAlloc(js, str.utf8Length(js)), RangeError, + "Cannot allocate space for TextEncoder.encode"); + [[maybe_unused]] auto result = str.writeInto( + js, view.asArrayPtr().asChars(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + KJ_DASSERT(result.written == view.size()); + return jsg::JsUint8Array(view.getHandle(js).As()); + } + + jsg::JsString str = input.orDefault(js.str()); + + size_t utf8_length = 0; + auto length = str.length(js); + +#ifdef KJ_DEBUG + bool wasAlreadyFlat = str.isFlat(); + KJ_DEFER({ KJ_ASSERT(wasAlreadyFlat || !str.isFlat()); }); +#endif + + // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through + // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat. + // This means we may read from multiple string segments, but that's fine for our use case. + + if (str.isOneByte(js)) { + // Use off-heap allocation for intermediate Latin-1 buffer to avoid wasting V8 heap space + // and potentially triggering GC. Stack allocation for small strings, heap for large. 
+ kj::SmallArray latin1Buffer(length); + + [[maybe_unused]] auto writeResult = str.writeInto(js, latin1Buffer.asPtr()); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); + + utf8_length = simdutf::utf8_length_from_latin1( + reinterpret_cast(latin1Buffer.begin()), length); + + auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + if (utf8_length == length) { + // ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII + kj::arrayPtr(static_cast(backingStore->Data()), length).copyFrom(latin1Buffer); + } else { + [[maybe_unused]] auto written = + simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), + length, reinterpret_cast(backingStore->Data())); + KJ_DASSERT(utf8_length == written); + } + return jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, utf8_length); + } + + // Use off-heap allocation for intermediate UTF-16 buffer to avoid wasting V8 heap space + // and potentially triggering GC. Stack allocation for small strings, heap for large. + // The copy is writable, so unpaired surrogates can be repaired in place before conversion. + kj::SmallArray utf16Buffer(length); + + [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); + + auto data = reinterpret_cast(utf16Buffer.begin()); + auto lengthResult = simdutf::utf8_length_from_utf16_with_replacement(data, length); + utf8_length = lengthResult.count; + + if (lengthResult.error == simdutf::SURROGATE) { + // If there are surrogates there may be unpaired surrogates. Fix them.
+ simdutf::to_well_formed_utf16(data, length, data); + } else { + KJ_DASSERT(lengthResult.error == simdutf::SUCCESS); + } + + auto backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = + simdutf::convert_utf16_to_utf8(data, length, reinterpret_cast(backingStore->Data())); + KJ_DASSERT(written == utf8_length, "Conversion yielded wrong number of UTF-8 bytes"); + + return jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, utf8_length); +} + namespace { -TextEncoder::EncodeIntoResult encodeIntoImpl( - jsg::Lock& js, jsg::JsString input, jsg::BufferSource& buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr().asChars(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), - }; + +constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) { + // We would like to use simdutf::trim_partial_utf16, but it's not guaranteed + // to work right on invalid UTF-16. Hence, we need this method to check for + // surrogate pairs and correctly trim utf16 chunks. + return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00; +} + +// Ignores surrogates conservatively. +constexpr size_t simpleUtfEncodingLength(uint16_t c) { + return 1 + (c >= 0x80) + (c >= 0x400); } + +// Find how many UTF-16 or Latin1 code units fit when converted to UTF-8. +// May conservatively underestimate the largest number of code units we can fit +// because of undetected surrogate pairs on boundaries. +// Works even on malformed UTF-16. +template +size_t findBestFit(const Char* data, size_t length, size_t bufferSize) { + size_t pos = 0; + size_t utf8Accumulated = 0; + // The SIMD is more efficient with a size that's a little over a multiple of 16. + constexpr size_t CHUNK = 257; + // The max number of UTF-8 output bytes per input code unit. 
+ constexpr bool UTF16 = sizeof(Char) == 2; + constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2; + + // Our initial guess at how much the number of elements expands in the + // conversion to UTF-8. + double expansion = 1.15; + + while (pos < length && utf8Accumulated < bufferSize) { + size_t remainingInput = length - pos; + size_t spaceRemaining = bufferSize - utf8Accumulated; + KJ_DASSERT(expansion >= 1.15); + + // We estimate how many characters are likely to fit in the buffer, but + // only try for CHUNK characters at a time to minimize the worst case + // waste of time if we guessed too high. + size_t guaranteedToFit = spaceRemaining / MAX_FACTOR; + if (guaranteedToFit >= remainingInput) { + // Don't even bother checking any more, it's all going to fit. Hitting + // this halfway through is also a good reason to limit the CHUNK size. + return length; + } + size_t likelyToFit = kj::min(static_cast(spaceRemaining / expansion), CHUNK); + size_t fitEstimate = kj::max(1, kj::max(guaranteedToFit, likelyToFit)); + size_t chunkSize = kj::min(remainingInput, fitEstimate); + if (chunkSize == 1) break; // Not worth running this complicated stuff one char at a time. + // No div-by-zero because remainingInput and fitEstimate are at least 1. + KJ_DASSERT(chunkSize >= 1); + + size_t chunkUtf8Len; + if constexpr (UTF16) { + chunkUtf8Len = simdutf::utf8_length_from_utf16_with_replacement(data + pos, chunkSize).count; + } else { + chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize); + } + + if (utf8Accumulated + chunkUtf8Len > bufferSize) { + // Our chosen chunk didn't fit in the rest of the output buffer. + KJ_DASSERT(chunkSize > guaranteedToFit); + // Since it didn't fit we adjust our expansion guess upwards. + expansion = kj::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize); + } else { + // Use successful length calculation to adjust our expansion estimate. 
+ expansion = kj::max(1.15, (chunkUtf8Len * 1.1) / chunkSize); + pos += chunkSize; + utf8Accumulated += chunkUtf8Len; + } + } + // Do the last few code units in a simpler way; the mask stops signed chars sign-extending. + while (pos < length && utf8Accumulated < bufferSize) { + size_t extra = simpleUtfEncodingLength(data[pos] & (UTF16 ? 0xffff : 0xff)); + if (utf8Accumulated + extra > bufferSize) break; + pos++; + utf8Accumulated += extra; + } + if (UTF16 && pos != 0 && pos != length && isSurrogatePair(data[pos - 1], data[pos])) { + // We ended on a leading surrogate which has a matching trailing surrogate in the next + // position. In order to make progress when the bufferSize is tiny we try to include it. + if (utf8Accumulated < bufferSize) { + pos++; // We had one more byte, so we can include the pair, UTF-8 encoding 3->4. + } else { + pos--; // Don't chop the pair in half. + } + } + return pos; +} + } // namespace -jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { - auto str = input.orDefault(js.str()); - auto view = JSG_REQUIRE_NONNULL(jsg::BufferSource::tryAlloc(js, str.utf8Length(js)), RangeError, - "Cannot allocate space for TextEncoder.encode"); - [[maybe_unused]] auto result = encodeIntoImpl(js, str, view); - KJ_DASSERT(result.written == view.size()); - return kj::mv(view); +// Test helpers used by encoding-test.c++ to verify findBestFit behavior.
+namespace test { + +size_t bestFit(const char* str, size_t bufferSize) { + return findBestFit(str, strlen(str), bufferSize); } +size_t bestFit(const char16_t* str, size_t bufferSize) { + size_t length = 0; + while (str[length] != 0) length++; + return findBestFit(str, length, bufferSize); +} + +} // namespace test + TextEncoder::EncodeIntoResult TextEncoder::encodeInto( jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + if (!workerd::util::Autogate::isEnabled(workerd::util::AutogateKey::ENABLE_FAST_TEXTENCODER)) { + auto result = input.writeInto( + js, buffer.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(result.read), + .written = static_cast(result.written), + }; + } + + auto outputBuf = buffer.asArrayPtr(); + size_t bufferSize = outputBuf.size(); + + size_t read = 0; + size_t written = 0; + { + // Scope for the view - we can't do anything that might cause a V8 GC! 
+ v8::String::ValueView view(js.v8Isolate, input); + size_t length = view.length(); + + if (view.is_one_byte()) { + auto data = reinterpret_cast(view.data8()); + simdutf::result result = + simdutf::validate_ascii_with_errors(data, kj::min(length, bufferSize)); + written = read = result.count; + auto outAddr = outputBuf.begin(); + kj::arrayPtr(outAddr, read).copyFrom(kj::arrayPtr(data, read)); + outAddr += read; + data += read; + length -= read; + bufferSize -= read; + if (length != 0 && bufferSize != 0) { + size_t rest = findBestFit(data, length, bufferSize); + if (rest != 0) { + KJ_DASSERT(simdutf::utf8_length_from_latin1(data, rest) <= bufferSize); + written += simdutf::convert_latin1_to_utf8(data, rest, outAddr); + read += rest; + } + } + } else { + auto data = reinterpret_cast(view.data16()); + read = findBestFit(data, length, bufferSize); + if (read != 0) { + KJ_DASSERT( + simdutf::utf8_length_from_utf16_with_replacement(data, read).count <= bufferSize); + simdutf::result result = + simdutf::convert_utf16_to_utf8_with_errors(data, read, outputBuf.begin()); + if (result.error == simdutf::SUCCESS) { + written = result.count; + } else { + // Oh, no, there are unpaired surrogates. This is hopefully rare. + kj::SmallArray conversionBuffer(read); + simdutf::to_well_formed_utf16(data, read, conversionBuffer.begin()); + written = + simdutf::convert_utf16_to_utf8(conversionBuffer.begin(), read, outputBuf.begin()); + } + } + } + } + KJ_DASSERT(written <= outputBuf.size()); + // V8's String::kMaxLength is a lot less than a maximal int so this is fine.
+ using RInt = decltype(TextEncoder::EncodeIntoResult::read); + using WInt = decltype(TextEncoder::EncodeIntoResult::written); + KJ_DASSERT(0 <= read && read <= std::numeric_limits::max()); + KJ_DASSERT(0 <= written && written <= std::numeric_limits::max()); return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), + .read = static_cast(read), + .written = static_cast(written), }; } diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index adaae247e89..5ffc69573e6 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -142,7 +142,7 @@ class TextEncoder final: public jsg::Object { static jsg::Ref constructor(jsg::Lock& js); - jsg::BufferSource encode(jsg::Lock& js, jsg::Optional input); + jsg::JsUint8Array encode(jsg::Lock& js, jsg::Optional input); EncodeIntoResult encodeInto(jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer); @@ -160,11 +160,7 @@ class TextEncoder final: public jsg::Object { JSG_READONLY_INSTANCE_PROPERTY(encoding, getEncoding); } - // `encode()` returns `jsg::BufferSource`, which may be an `ArrayBuffer` or `ArrayBufferView`, - // but the implementation uses `jsg::BufferSource::tryAlloc()` which always tries to allocate a - // `Uint8Array`. The spec defines that this function returns a `Uint8Array` too. 
JSG_TS_OVERRIDE({ - encode(input?: string): Uint8Array; encodeInto(input: string, buffer: Uint8Array): TextEncoderEncodeIntoResult; }); } @@ -173,4 +169,11 @@ class TextEncoder final: public jsg::Object { #define EW_ENCODING_ISOLATE_TYPES \ api::TextDecoder, api::TextEncoder, api::TextDecoder::ConstructorOptions, \ api::TextDecoder::DecodeOptions, api::TextEncoder::EncodeIntoResult + +namespace test { + +size_t bestFit(const char* str, size_t bufferSize); +size_t bestFit(const char16_t* str, size_t bufferSize); + +} // namespace test } // namespace workerd::api diff --git a/src/workerd/api/streams/encoding.c++ b/src/workerd/api/streams/encoding.c++ index 7fe67ce5e68..ede16250b83 100644 --- a/src/workerd/api/streams/encoding.c++ +++ b/src/workerd/api/streams/encoding.c++ @@ -93,9 +93,7 @@ jsg::Ref TextEncoderStream::constructor(jsg::Lock& js) { if (holder->pending != kj::none) { auto backingStore = js.allocBackingStore(3, jsg::Lock::AllocOption::UNINITIALIZED); memcpy(backingStore->Data(), REPLACEMENT_UTF8, 3); - auto array = - v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), 0, 3); - controller->enqueue(js, jsg::JsUint8Array(array)); + controller->enqueue(js, jsg::JsUint8Array::create(js, kj::mv(backingStore), 0, 3)); } return js.resolvedPromise(); }; diff --git a/src/workerd/jsg/buffersource.h b/src/workerd/jsg/buffersource.h index cb86e82ca67..9d711d4ec82 100644 --- a/src/workerd/jsg/buffersource.h +++ b/src/workerd/jsg/buffersource.h @@ -102,9 +102,10 @@ class BackingStore { // Creates a new BackingStore of the given size. 
template - static BackingStore alloc(Lock& js, size_t size) { - return BackingStore(js.allocBackingStore(size), size, 0, getBufferSourceElementSize(), - construct, checkIsIntegerType()); + static BackingStore alloc( + Lock& js, size_t size, Lock::AllocOption init_mode = Lock::AllocOption::ZERO_INITIALIZED) { + return BackingStore(js.allocBackingStore(size, init_mode), size, 0, + getBufferSourceElementSize(), construct, checkIsIntegerType()); } using Disposer = void(void*, size_t, void*); diff --git a/src/workerd/jsg/jsg.h b/src/workerd/jsg/jsg.h index faeb4944fa8..44a09cba60f 100644 --- a/src/workerd/jsg/jsg.h +++ b/src/workerd/jsg/jsg.h @@ -2758,6 +2758,14 @@ class Lock { // Utility method to safely allocate a v8::BackingStore with allocation failure handling. // Throws a javascript error if allocation fails. + // + // IMPORTANT: This method can trigger garbage collection, which may move or invalidate V8 + // objects. Do NOT call this method while: + // - A v8::String::ValueView is alive (it holds internal V8 heap locks) + // - You have raw pointers to V8 heap data (e.g., from view.data8(), view.data16()) + // + // Safe pattern: Copy V8 string data to off-heap memory FIRST (e.g., via JsString::writeInto() + // into kj::SmallArray), THEN call allocBackingStore(). See TextEncoder::encode() for example. 
std::unique_ptr allocBackingStore( size_t size, AllocOption init_mode = AllocOption::ZERO_INITIALIZED) KJ_WARN_UNUSED_RESULT; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index 61481f4521d..25ee99fe228 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -254,6 +254,12 @@ class JsArrayBufferView final: public JsBase { public: + static JsUint8Array create( + Lock& js, std::unique_ptr backingStore, size_t byteOffset, size_t length) { + return JsUint8Array(v8::Uint8Array::New( + v8::ArrayBuffer::New(js.v8Isolate, kj::mv(backingStore)), byteOffset, length)); + } + template kj::ArrayPtr asArrayPtr() { v8::Local inner = *this; @@ -277,6 +283,7 @@ class JsString final: public JsBase { int hashCode() const; bool isFlat() const; + bool isOneByte(Lock& js) const KJ_WARN_UNUSED_RESULT; bool containsOnlyOneByte() const; bool operator==(const JsString& other) const; @@ -304,6 +311,12 @@ class JsString final: public JsBase { // The number of elements (e.g. char, byte, uint16_t) written to the buffer. size_t written; }; + + // Copy string contents into a provided buffer (off-heap memory). + // + // IMPORTANT: This method does NOT flatten the V8 string or hold V8 heap locks. It safely + // copies data out of V8's heap into your buffer. This makes it safe to use before calling + // GC-triggering operations like Lock::allocBackingStore(). 
WriteIntoStatus writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options = WriteFlags::NONE) const; WriteIntoStatus writeInto( @@ -986,6 +999,10 @@ inline int JsString::length(jsg::Lock& js) const { return inner->Length(); } +inline bool JsString::isOneByte(jsg::Lock& js) const { + return inner->IsOneByte(); +} + inline size_t JsString::utf8Length(jsg::Lock& js) const { return inner->Utf8LengthV2(js.v8Isolate); } diff --git a/src/workerd/util/autogate.c++ b/src/workerd/util/autogate.c++ index 4ff178c9170..211e0689374 100644 --- a/src/workerd/util/autogate.c++ +++ b/src/workerd/util/autogate.c++ @@ -33,6 +33,8 @@ kj::StringPtr KJ_STRINGIFY(AutogateKey key) { return "rpc-use-external-pusher"_kj; case AutogateKey::BLOB_USE_STREAMS_NEW_MEMORY_SOURCE: return "blob-use-streams-new-memory-source"_kj; + case AutogateKey::ENABLE_FAST_TEXTENCODER: + return "enable-fast-textencoder"_kj; case AutogateKey::NumOfKeys: KJ_FAIL_ASSERT("NumOfKeys should not be used in getName"); } diff --git a/src/workerd/util/autogate.h b/src/workerd/util/autogate.h index 853a028c517..37b86ccfd19 100644 --- a/src/workerd/util/autogate.h +++ b/src/workerd/util/autogate.h @@ -29,6 +29,8 @@ enum class AutogateKey { RPC_USE_EXTERNAL_PUSHER, // Switch Blob stream() to use streams::newMemorySource instead of Blob::BlobInputStream BLOB_USE_STREAMS_NEW_MEMORY_SOURCE, + // Enable fast TextEncoder implementation using simdutf + ENABLE_FAST_TEXTENCODER, NumOfKeys // Reserved for iteration. };