Skip to content

Commit 38f0b41

Browse files
authored
perf(web): optimize single pass utf8 decoding (#16593)
- [x] Avoid copying buffers. From https://encoding.spec.whatwg.org/#dom-textdecoder-decode:
  > Implementations are strongly encouraged to use an implementation strategy that avoids this copy. When doing so they will have to make sure that changes to input do not affect future calls to [decode()](https://encoding.spec.whatwg.org/#dom-textdecoder-decode).
- [x] Special op to avoid string label deserialization and parsing. (Ideally we should map labels to integers in JS.)
- [x] Avoid webidl `Object.assign` when options is undefined.
1 parent 5b9620d commit 38f0b41

3 files changed

Lines changed: 74 additions & 28 deletions

File tree

ext/web/08_text_encoding.js

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@
1616
const ops = core.ops;
1717
const webidl = window.__bootstrap.webidl;
1818
const {
19-
ArrayBufferIsView,
20-
ObjectPrototypeIsPrototypeOf,
2119
PromiseReject,
2220
PromiseResolve,
2321
StringPrototypeCharCodeAt,
2422
StringPrototypeSlice,
2523
TypedArrayPrototypeSubarray,
2624
Uint8Array,
25+
ObjectPrototypeIsPrototypeOf,
26+
ArrayBufferIsView,
2727
Uint32Array,
2828
} = window.__bootstrap.primordials;
2929

@@ -34,6 +34,8 @@
3434
#fatal;
3535
/** @type {boolean} */
3636
#ignoreBOM;
37+
/** @type {boolean} */
38+
#utf8SinglePass;
3739

3840
/** @type {number | null} */
3941
#rid = null;
@@ -56,6 +58,7 @@
5658
this.#encoding = encoding;
5759
this.#fatal = options.fatal;
5860
this.#ignoreBOM = options.ignoreBOM;
61+
this.#utf8SinglePass = encoding === "utf-8" && !options.fatal;
5962
this[webidl.brand] = webidl.brand;
6063
}
6164

@@ -81,7 +84,7 @@
8184
* @param {BufferSource} [input]
8285
* @param {TextDecodeOptions} options
8386
*/
84-
decode(input = new Uint8Array(), options = {}) {
87+
decode(input = new Uint8Array(), options = undefined) {
8588
webidl.assertBranded(this, TextDecoderPrototype);
8689
const prefix = "Failed to execute 'decode' on 'TextDecoder'";
8790
if (input !== undefined) {
@@ -91,40 +94,46 @@
9194
allowShared: true,
9295
});
9396
}
94-
options = webidl.converters.TextDecodeOptions(options, {
95-
prefix,
96-
context: "Argument 2",
97-
});
97+
let stream = false;
98+
if (options !== undefined) {
99+
options = webidl.converters.TextDecodeOptions(options, {
100+
prefix,
101+
context: "Argument 2",
102+
});
103+
stream = options.stream;
104+
}
98105

99106
try {
100-
try {
101-
if (ArrayBufferIsView(input)) {
102-
input = new Uint8Array(
103-
input.buffer,
104-
input.byteOffset,
105-
input.byteLength,
106-
);
107-
} else {
108-
input = new Uint8Array(input);
109-
}
110-
} catch {
111-
// If the buffer is detached, just create a new empty Uint8Array.
112-
input = new Uint8Array();
113-
}
107+
// Note from spec: implementations are strongly encouraged to use an implementation strategy that avoids this copy.
108+
// When doing so they will have to make sure that changes to input do not affect future calls to decode().
114109
if (
115110
ObjectPrototypeIsPrototypeOf(
116111
SharedArrayBuffer.prototype,
117-
input.buffer,
112+
input || input.buffer,
118113
)
119114
) {
120115
// We clone the data into a non-shared ArrayBuffer so we can pass it
121116
// to Rust.
122117
// `input` is now a Uint8Array, and calling the TypedArray constructor
123118
// with a TypedArray argument copies the data.
124-
input = new Uint8Array(input);
119+
if (ArrayBufferIsView(input)) {
120+
input = new Uint8Array(
121+
input.buffer,
122+
input.byteOffset,
123+
input.byteLength,
124+
);
125+
} else {
126+
input = new Uint8Array(input);
127+
}
125128
}
126129

127-
if (!options.stream && this.#rid === null) {
130+
// Fast path for single pass encoding.
131+
if (!stream && this.#rid === null) {
132+
// Fast path for utf8 single pass encoding.
133+
if (this.#utf8SinglePass) {
134+
return ops.op_encoding_decode_utf8(input, this.#ignoreBOM);
135+
}
136+
128137
return ops.op_encoding_decode_single(
129138
input,
130139
this.#encoding,
@@ -140,9 +149,9 @@
140149
this.#ignoreBOM,
141150
);
142151
}
143-
return ops.op_encoding_decode(input, this.#rid, options.stream);
152+
return ops.op_encoding_decode(input, this.#rid, stream);
144153
} finally {
145-
if (!options.stream && this.#rid !== null) {
154+
if (!stream && this.#rid !== null) {
146155
core.close(this.#rid);
147156
this.#rid = null;
148157
}

ext/web/lib.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ pub fn init<P: TimersPermission + 'static>(
9191
op_base64_btoa::decl(),
9292
op_encoding_normalize_label::decl(),
9393
op_encoding_decode_single::decl(),
94+
op_encoding_decode_utf8::decl(),
9495
op_encoding_new_decoder::decl(),
9596
op_encoding_decode::decl(),
9697
op_encoding_encode_into::decl(),
@@ -179,6 +180,39 @@ fn op_encoding_normalize_label(label: String) -> Result<String, AnyError> {
179180
Ok(encoding.name().to_lowercase())
180181
}
181182

183+
#[op(v8)]
184+
fn op_encoding_decode_utf8<'a>(
185+
scope: &mut v8::HandleScope<'a>,
186+
zero_copy: &[u8],
187+
ignore_bom: bool,
188+
) -> Result<serde_v8::Value<'a>, AnyError> {
189+
let buf = &zero_copy;
190+
191+
let buf = if !ignore_bom
192+
&& buf.len() >= 3
193+
&& buf[0] == 0xef
194+
&& buf[1] == 0xbb
195+
&& buf[2] == 0xbf
196+
{
197+
&buf[3..]
198+
} else {
199+
buf
200+
};
201+
202+
// If `String::new_from_utf8()` returns `None`, this means that the
203+
// length of the decoded string would be longer than what V8 can
204+
// handle. In this case we return `RangeError`.
205+
//
206+
// For more details see:
207+
// - https://encoding.spec.whatwg.org/#dom-textdecoder-decode
208+
// - https://github.com/denoland/deno/issues/6649
209+
// - https://github.com/v8/v8/blob/d68fb4733e39525f9ff0a9222107c02c28096e2a/include/v8.h#L3277-L3278
210+
match v8::String::new_from_utf8(scope, buf, v8::NewStringType::Normal) {
211+
Some(text) => Ok(serde_v8::from_v8(scope, text.into())?),
212+
None => Err(type_error("buffer exceeds maximum length")),
213+
}
214+
}
215+
182216
#[op]
183217
fn op_encoding_decode_single(
184218
data: &[u8],

ops/lib.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -449,13 +449,16 @@ fn codegen_u8_slice(core: &TokenStream2, idx: usize) -> TokenStream2 {
449449
let value = args.get(#idx as i32);
450450
match #core::v8::Local::<#core::v8::ArrayBuffer>::try_from(value) {
451451
Ok(b) => {
452+
// Handles detached buffers.
453+
let byte_length = b.byte_length();
452454
let store = b.data() as *mut u8;
453455
// SAFETY: rust guarantees that lifetime of slice is no longer than the call.
454-
unsafe { ::std::slice::from_raw_parts_mut(store, b.byte_length()) }
456+
unsafe { ::std::slice::from_raw_parts_mut(store, byte_length) }
455457
},
456458
Err(_) => {
457459
if let Ok(view) = #core::v8::Local::<#core::v8::ArrayBufferView>::try_from(value) {
458-
let (offset, len) = (view.byte_offset(), view.byte_length());
460+
let len = view.byte_length();
461+
let offset = view.byte_offset();
459462
let buffer = match view.buffer(scope) {
460463
Some(v) => v,
461464
None => {

0 commit comments

Comments
 (0)