Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,7 @@ debug = false # FIXME(#1813)
[[bench]]
name = "instantiation"
harness = false

[[bench]]
name = "thread_eager_init"
harness = false
114 changes: 114 additions & 0 deletions benches/thread_eager_init.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
use criterion::{criterion_group, criterion_main, Criterion};
use std::thread;
use std::time::{Duration, Instant};
use wasmtime::*;

/// Registers the three thread-initialization benchmarks with Criterion.
///
/// Every benchmark builds its own engine and module via `test_setup` and uses
/// `iter_custom`, so the reported time is the sum of the durations measured
/// inside the spawned threads rather than the wall-clock time of the loop
/// itself (thread spawn/join are therefore not part of the measurement).
fn measure_execution_time(c: &mut Criterion) {
    // Baseline performance: a single measurement covers both initializing
    // thread local resources and executing the first call.
    //
    // The other two bench functions should sum to this duration.
    c.bench_function("lazy initialization at call", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            // `0..iters` is already an iterator; no `.into_iter()` required.
            (0..iters)
                .map(|_| lazy_thread_instantiate(engine.clone(), module.clone()))
                .sum()
        })
    });

    // Using Engine::tls_eager_initialize: measure how long eager
    // initialization takes on a new thread.
    c.bench_function("eager initialization", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            (0..iters)
                .map(|_| {
                    // Only the initialization half of the pair is reported here.
                    let (init, _call) = eager_thread_instantiate(engine.clone(), module.clone());
                    init
                })
                .sum()
        })
    });

    // Measure how long the first call takes on a thread after it has been
    // eagerly initialized.
    c.bench_function("call after eager initialization", move |b| {
        let (engine, module) = test_setup();
        b.iter_custom(move |iters| {
            (0..iters)
                .map(|_| {
                    // Only the call half of the pair is reported here.
                    let (_init, call) = eager_thread_instantiate(engine.clone(), module.clone());
                    call
                })
                .sum()
        })
    });
}

/// Times a single invocation of the module's exported function `f`.
///
/// Store creation, instantiation, and the typed-function lookup all happen
/// before the clock starts, so only the call itself is measured. Both the lazy
/// and the eager benchmark share this routine.
fn duration_of_call(engine: &Engine, module: &Module) -> Duration {
    let mut store = Store::new(engine, ());
    let instance = Instance::new(&mut store, module, &[]).expect("instantiate");
    let typed_f = instance
        .get_func(&mut store, "f")
        .expect("get f")
        .typed::<(), (), _>(&store)
        .expect("type f");

    // Start timing only once everything needed for the call is in place.
    let started = Instant::now();
    typed_f.call(&mut store, ()).expect("call f");
    started.elapsed()
}

/// Measures the first call on a brand-new thread with no eager setup.
///
/// The first time wasmtime runs a function on a thread it has to initialize
/// some thread-local resources and install signal handlers. This spawns a
/// fresh thread, performs one call there, and returns how long that call took.
fn lazy_thread_instantiate(engine: Engine, module: Module) -> Duration {
    let worker = thread::spawn(move || duration_of_call(&engine, &module));
    worker.join().expect("thread joins")
}
/// Measures eager initialization and the following first call separately.
///
/// Spawns a new thread, records how long it takes to eagerly initialize the
/// thread-local resources there, then creates a store and instance and records
/// how long the first function call takes. Returns `(init, call)` durations.
fn eager_thread_instantiate(engine: Engine, module: Module) -> (Duration, Duration) {
    let worker = move || {
        let timer = Instant::now();
        Engine::tls_eager_initialize().expect("eager init");
        let init_duration = timer.elapsed();

        // The call is timed independently, after initialization has finished.
        let call_duration = duration_of_call(&engine, &module);
        (init_duration, call_duration)
    };

    let handle = thread::spawn(worker);
    handle.join().expect("thread joins")
}

/// Builds the engine and module shared by every iteration of a benchmark.
///
/// The engine uses the pooling instance allocator; the module exports a
/// single no-op function named `f`.
fn test_setup() -> (Engine, Module) {
    // Each benchmark thread creates a single instance with a single memory;
    // the pool is configured with 10 instance slots.
    let pool_count = 10;
    let instance_limits = InstanceLimits {
        count: pool_count,
        memory_reservation_size: 1,
    };
    let module_limits = ModuleLimits {
        memory_pages: 1,
        ..Default::default()
    };
    let pooling = InstanceAllocationStrategy::Pooling {
        strategy: PoolingAllocationStrategy::NextAvailable,
        module_limits,
        instance_limits,
    };

    let mut config = Config::new();
    config.allocation_strategy(pooling);
    let engine = Engine::new(&config).unwrap();

    // The module has a memory (shouldn't matter) and a single function which is a no-op.
    let wat = r#"(module (memory 1) (func (export "f")))"#;
    let module = Module::new(&engine, wat).unwrap();
    (engine, module)
}

// Criterion entry points: `measure_execution_time` is collected into the
// `benches` group, which is then handed to `criterion_main!`. The bench target
// is declared with `harness = false` in Cargo.toml.
criterion_group!(benches, measure_execution_time);
criterion_main!(benches);
4 changes: 2 additions & 2 deletions crates/runtime/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ pub use crate::memory::{Memory, RuntimeLinearMemory, RuntimeMemoryCreator};
pub use crate::mmap::Mmap;
pub use crate::table::{Table, TableElement};
pub use crate::traphandlers::{
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, SignalHandler,
TlsRestore, Trap,
catch_traps, init_traps, raise_lib_trap, raise_user_trap, resume_panic, tls_eager_initialize,
SignalHandler, TlsRestore, Trap,
};
pub use crate::vmcontext::{
VMCallerCheckedAnyfunc, VMContext, VMFunctionBody, VMFunctionImport, VMGlobalDefinition,
Expand Down
19 changes: 18 additions & 1 deletion crates/runtime/src/traphandlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::sync::atomic::Ordering::SeqCst;
use std::sync::Once;
use wasmtime_environ::ir;

pub use self::tls::TlsRestore;
pub use self::tls::{tls_eager_initialize, TlsRestore};

extern "C" {
#[allow(improper_ctypes)]
Expand Down Expand Up @@ -386,12 +386,29 @@ mod tls {
})
}

#[inline(never)]
/// Eagerly initialize thread-local runtime functionality. This will be performed
/// lazily by the runtime if users do not perform it eagerly.
///
/// Returns `Ok(())` immediately when the `initialized` flag stored in `PTR` is
/// already set. Otherwise `sys::lazy_per_thread_init` is run; if it fails the
/// error is propagated with `?` before the flag is written, so the flag stays
/// unset in that case.
pub fn initialize() -> Result<(), Trap> {
PTR.with(|p| {
let (state, initialized) = p.get();
if initialized {
return Ok(());
}
super::super::sys::lazy_per_thread_init()?;
// Record success; the first tuple element is written back unchanged.
p.set((state, true));
Ok(())
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could probably be a bit simpler with if initialized { return } and then p.set((state, true)) afterwards

})
}

#[inline(never)] // see module docs for why this is here
/// Returns the `Ptr` half of the `(Ptr, initialized)` pair stored in `PTR`.
pub fn get() -> Ptr {
    PTR.with(|cell| {
        let (ptr, _initialized) = cell.get();
        ptr
    })
}
}

pub use raw::initialize as tls_eager_initialize;

/// Opaque state used to help control TLS state across stack switches for
/// async support.
pub struct TlsRestore(raw::Ptr);
Expand Down
23 changes: 22 additions & 1 deletion crates/wasmtime/src/engine.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::signatures::SignatureRegistry;
use crate::Config;
use crate::{Config, Trap};
use anyhow::Result;
use std::sync::Arc;
#[cfg(feature = "cache")]
Expand Down Expand Up @@ -63,6 +63,27 @@ impl Engine {
})
}

/// Performs, ahead of time, the per-thread setup shared by all [`Engine`]s.
///
/// On some platforms Wasmtime needs to do per-thread setup before
/// WebAssembly can be invoked on that thread. That setup can take on the
/// order of a few hundred microseconds, while the overhead of calling
/// WebAssembly is otherwise on the order of a few nanoseconds. The cost is
/// paid once per OS thread. Applications that are sensitive to the latency
/// of every WebAssembly call, including the first one made on a thread,
/// can call this function to frontload that one-time cost and make call
/// latencies more consistent.
///
/// Calling this function is never required. Wasmtime automatically
/// initializes thread-local state as necessary on calls into WebAssembly.
/// It exists for use cases where the latency of WebAssembly calls is
/// extra-important, which is not necessarily true of all embeddings.
pub fn tls_eager_initialize() -> Result<(), Trap> {
    let outcome = wasmtime_runtime::tls_eager_initialize();
    // Convert the runtime-level trap into this crate's `Trap` type.
    outcome.map_err(Trap::from_runtime)
}

/// Returns the configuration settings that this engine is using.
#[inline]
pub fn config(&self) -> &Config {
Expand Down