From 9f1aa64c9e18477674438c9d58d7977dd3111427 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 8 Apr 2021 10:26:59 -0700 Subject: [PATCH] Speed up registration of stack maps This commit fixes a perf issue I was seeing in some local benchmarking where registration of stack maps took a nontrivial amount of time for a module that didn't even use stack maps! The fix here is to largely just do the same thing as #2811 which is to use the in-memory data structures of a `CompiledModule` rather than rebuilding different data structures when a module is registered with a `Store`. The `StackMapRegistry` type now takes a lookup object for a range of pcs, simplifying registration. Registration additionally doesn't even happen if a module is pre-determined to not have any stack maps inside of it. This trait object encapsulates all the logic that was previously used in the rebuilt data structures and also leverages conveniences like `CompiledModule::func_by_pc` added recently as well. With this all combined it means that modules which don't have stack maps now skip this registration step entirely and modules with stack maps should do much less work during registration as well. --- crates/jit/src/instantiate.rs | 90 +++++++++++++++-- crates/runtime/src/externref.rs | 168 ++++++-------------------------- crates/wasmtime/src/store.rs | 18 +--- 3 files changed, 115 insertions(+), 161 deletions(-) diff --git a/crates/jit/src/instantiate.rs b/crates/jit/src/instantiate.rs index 2d155e3d7b6f..c3248cebd783 100644 --- a/crates/jit/src/instantiate.rs +++ b/crates/jit/src/instantiate.rs @@ -16,6 +16,7 @@ use std::sync::Arc; use thiserror::Error; use wasmtime_debug::create_gdbjit_image; use wasmtime_environ::entity::PrimaryMap; +use wasmtime_environ::ir::StackMap; use wasmtime_environ::isa::TargetIsa; use wasmtime_environ::wasm::{ DefinedFuncIndex, InstanceTypeIndex, ModuleTypeIndex, SignatureIndex, WasmFuncType, @@ -25,6 +26,7 @@ use wasmtime_environ::{ ModuleSignature, ModuleTranslation, StackMapInformation, TrapInformation, }; use wasmtime_profiling::ProfilingAgent; +use wasmtime_runtime::StackMapLookup; use wasmtime_runtime::{GdbJitImageRegistration, InstantiationError, VMFunctionBody, VMTrampoline}; /// An error condition while setting up a wasm instance, be it validation, @@ -214,6 +216,7 @@ pub struct CompiledModule { code: Arc, finished_functions: FinishedFunctions, trampolines: Vec<(SignatureIndex, VMTrampoline)>, + has_stack_maps: bool, } impl CompiledModule { @@ -270,6 +273,8 @@ impl CompiledModule { let start = code_range.0 as usize; let end = start + code_range.1; + let has_stack_maps = artifacts.funcs.values().any(|f| f.stack_maps.len() > 0); + Ok(Arc::new(Self { artifacts, code: Arc::new(ModuleCode { @@ -279,6 +284,7 @@ impl CompiledModule { }), finished_functions, trampolines, + has_stack_maps, })) } @@ -313,15 +319,12 @@ impl CompiledModule { /// /// The iterator returned iterates over the span of the compiled function in /// memory with the stack maps associated with those bytes. - pub fn stack_maps( - &self, - ) -> impl Iterator { - self.finished_functions().values().copied().zip( - self.artifacts - .funcs - .values() - .map(|f| f.stack_maps.as_slice()), - ) + pub fn stack_maps(self: &Arc) -> Option> { + if self.has_stack_maps { + Some(Box::new(StackMapLookupImpl(self.clone()))) + } else { + None + } } /// Lookups a defined function by a program counter value. @@ -588,3 +591,72 @@ mod arc_serde { Ok(Arc::new(T::deserialize(de)?)) } } + +struct StackMapLookupImpl(Arc); + +impl StackMapLookup for StackMapLookupImpl { + fn lookup(&self, pc: usize) -> Option<&StackMap> { + let (index, start, _end) = self.0.func_by_pc(pc)?; + let func = self.0.artifacts.funcs.get(index)?; + let offset = (pc - start) as u32; + + // Do a binary search to find the stack map for the given PC. + // + // Because GC safepoints are technically only associated with a single + // PC, we should ideally only care about `Ok(index)` values returned + // from the binary search. However, safepoints are inserted right before + // calls, and there are two things that can disturb the PC/offset + // associated with the safepoint versus the PC we actually use to query + // for the stack map: + // + // 1. The `backtrace` crate gives us the PC in a frame that will be + // *returned to*, and where execution will continue from, rather than + // the PC of the call we are currently at. So we would need to + // disassemble one instruction backwards to query the actual PC for + // the stack map. + // + // TODO: One thing we *could* do to make this a little less error + // prone, would be to assert/check that the nearest GC safepoint + // found is within `max_encoded_size(any kind of call instruction)` + // our queried PC for the target architecture. + // + // 2. Cranelift's stack maps only handle the stack, not + // registers. However, some references that are arguments to a call + // may need to be in registers. In these cases, what Cranelift will + // do is: + // + // a. spill all the live references, + // b. insert a GC safepoint for those references, + // c. reload the references into registers, and finally + // d. make the call. + // + // Step (c) adds drift between the GC safepoint and the location of + // the call, which is where we actually walk the stack frame and + // collect its live references. + // + // Luckily, the spill stack slots for the live references are still + // up to date, so we can still find all the on-stack roots. + // Furthermore, we do not have a moving GC, so we don't need to worry + // whether the following code will reuse the references in registers + // (which would not have been updated to point to the moved objects) + // or reload from the stack slots (which would have been updated to + // point to the moved objects). + let index = match func + .stack_maps + .binary_search_by_key(&offset, |map| map.code_offset) + { + // Exact hit. + Ok(pos) => pos, + + // `Err(0)` means that the associated stack map would have been the + // first element in the array if this pc had an associated stack + // map, but this pc does not have an associated stack map. This can + // only happen inside a Wasm frame if there are no live refs at this + // pc. + Err(0) => return None, + + Err(n) => n - 1, + }; + Some(&func.stack_maps[index].stack_map) + } +} diff --git a/crates/runtime/src/externref.rs b/crates/runtime/src/externref.rs index d21e57ed423f..05e94a6b07c9 100644 --- a/crates/runtime/src/externref.rs +++ b/crates/runtime/src/externref.rs @@ -109,8 +109,7 @@ use std::hash::{Hash, Hasher}; use std::mem; use std::ops::Deref; use std::ptr::{self, NonNull}; -use std::rc::Rc; -use wasmtime_environ::{ir::StackMap, StackMapInformation}; +use wasmtime_environ::ir::StackMap; /// An external reference to some opaque data. /// @@ -757,166 +756,55 @@ struct StackMapRegistryInner { ranges: BTreeMap, } -#[derive(Debug)] struct ModuleStackMaps { - /// The range of PCs that this module covers. Different modules must always - /// have distinct ranges. - range: std::ops::Range, - - /// A map from a PC in this module (that is a GC safepoint) to its - /// associated stack map. If `None` then it means that the PC is the start - /// of a range which has no stack map. - pc_to_stack_map: Vec<(usize, Option>)>, + start: usize, + lookup: Box, +} + +/// A trait used to abstract how stack maps are located. +pub trait StackMapLookup { + /// Returns the stack map, if any, for the program with the given `pc` + /// value. + fn lookup(&self, pc: usize) -> Option<&StackMap>; } impl StackMapRegistry { /// Register the stack maps for a given module. /// - /// The stack maps should be given as an iterator over a function's PC range - /// in memory (that is, where the JIT actually allocated and emitted the - /// function's code at), and the stack maps and code offsets within that - /// range for each of its GC safepoints. + /// The `start` and `end` range encompass where the code for this module + /// lives, and the `lookup` object provided will be used to lookup stack + /// maps within this range of pc values. pub fn register_stack_maps<'a>( &self, - stack_maps: impl IntoIterator, &'a [StackMapInformation])>, + start: usize, + end: usize, + lookup: Box, ) { - let mut min = usize::max_value(); - let mut max = 0; - let mut pc_to_stack_map = vec![]; - let mut last_is_none_marker = true; - - for (range, infos) in stack_maps { - let len = range.end - range.start; - - min = std::cmp::min(min, range.start); - max = std::cmp::max(max, range.end); - - // Add a marker between functions indicating that this function's pc - // starts with no stack map so when our binary search later on finds - // a pc between the start of the function and the function's first - // stack map it doesn't think the previous stack map is our stack - // map. - // - // We skip this if the previous entry pushed was also a `None` - // marker, in which case the starting pc already has no stack map. - // This is also skipped if the first `code_offset` is zero since - // what we'll push applies for the first pc anyway. - if !last_is_none_marker && (infos.is_empty() || infos[0].code_offset > 0) { - pc_to_stack_map.push((range.start, None)); - last_is_none_marker = true; - } - - for info in infos { - assert!((info.code_offset as usize) < len); - pc_to_stack_map.push(( - range.start + (info.code_offset as usize), - Some(Rc::new(info.stack_map.clone())), - )); - last_is_none_marker = false; - } - } - - if pc_to_stack_map.is_empty() { - // Nothing to register. - return; - } - - let module_stack_maps = ModuleStackMaps { - range: min..max, - pc_to_stack_map, - }; + let module_stack_maps = ModuleStackMaps { start, lookup }; let mut inner = self.inner.borrow_mut(); // Assert that this chunk of ranges doesn't collide with any other known // chunks. - if let Some((_, prev)) = inner.ranges.range(max..).next() { - assert!(prev.range.start > max); + if let Some((_, prev)) = inner.ranges.range(end..).next() { + assert!(prev.start > end); } - if let Some((prev_end, _)) = inner.ranges.range(..=min).next_back() { - assert!(*prev_end < min); + if let Some((prev_end, _)) = inner.ranges.range(..=start).next_back() { + assert!(*prev_end < start); } - let old = inner.ranges.insert(max, module_stack_maps); + let old = inner.ranges.insert(end, module_stack_maps); assert!(old.is_none()); } - - /// Lookup the stack map for the given PC, if any. - pub fn lookup_stack_map(&self, pc: usize) -> Option> { - let inner = self.inner.borrow(); - let stack_maps = inner.module_stack_maps(pc)?; - - // Do a binary search to find the stack map for the given PC. - // - // Because GC safepoints are technically only associated with a single - // PC, we should ideally only care about `Ok(index)` values returned - // from the binary search. However, safepoints are inserted right before - // calls, and there are two things that can disturb the PC/offset - // associated with the safepoint versus the PC we actually use to query - // for the stack map: - // - // 1. The `backtrace` crate gives us the PC in a frame that will be - // *returned to*, and where execution will continue from, rather than - // the PC of the call we are currently at. So we would need to - // disassemble one instruction backwards to query the actual PC for - // the stack map. - // - // TODO: One thing we *could* do to make this a little less error - // prone, would be to assert/check that the nearest GC safepoint - // found is within `max_encoded_size(any kind of call instruction)` - // our queried PC for the target architecture. - // - // 2. Cranelift's stack maps only handle the stack, not - // registers. However, some references that are arguments to a call - // may need to be in registers. In these cases, what Cranelift will - // do is: - // - // a. spill all the live references, - // b. insert a GC safepoint for those references, - // c. reload the references into registers, and finally - // d. make the call. - // - // Step (c) adds drift between the GC safepoint and the location of - // the call, which is where we actually walk the stack frame and - // collect its live references. - // - // Luckily, the spill stack slots for the live references are still - // up to date, so we can still find all the on-stack roots. - // Furthermore, we do not have a moving GC, so we don't need to worry - // whether the following code will reuse the references in registers - // (which would not have been updated to point to the moved objects) - // or reload from the stack slots (which would have been updated to - // point to the moved objects). - let index = match stack_maps - .pc_to_stack_map - .binary_search_by_key(&pc, |(pc, _stack_map)| *pc) - { - // Exact hit. - Ok(i) => i, - - // `Err(0)` means that the associated stack map would have been the - // first element in the array if this pc had an associated stack - // map, but this pc does not have an associated stack map. This can - // only happen inside a Wasm frame if there are no live refs at this - // pc. - Err(0) => return None, - - Err(n) => n - 1, - }; - - let stack_map = stack_maps.pc_to_stack_map[index].1.as_ref()?.clone(); - Some(stack_map) - } } impl StackMapRegistryInner { - fn module_stack_maps(&self, pc: usize) -> Option<&ModuleStackMaps> { + fn lookup_stack_map(&self, pc: usize) -> Option<&StackMap> { let (end, stack_maps) = self.ranges.range(pc..).next()?; - if pc < stack_maps.range.start || *end < pc { - None - } else { - Some(stack_maps) + if pc < stack_maps.start || *end < pc { + return None; } + stack_maps.lookup.lookup(pc) } } @@ -1003,7 +891,8 @@ pub unsafe fn gc( if cfg!(debug_assertions) { // Assert that there aren't any Wasm frames on the stack. backtrace::trace(|frame| { - let stack_map = stack_maps_registry.lookup_stack_map(frame.ip() as usize); + let inner = stack_maps_registry.inner.borrow(); + let stack_map = inner.lookup_stack_map(frame.ip() as usize); assert!(stack_map.is_none()); true }); @@ -1044,6 +933,7 @@ pub unsafe fn gc( }); } + let stack_maps_registry = stack_maps_registry.inner.borrow(); backtrace::trace(|frame| { let pc = frame.ip() as usize; let sp = frame.sp() as usize; diff --git a/crates/wasmtime/src/store.rs b/crates/wasmtime/src/store.rs index 18dbde303cfa..dcfdb47c6473 100644 --- a/crates/wasmtime/src/store.rs +++ b/crates/wasmtime/src/store.rs @@ -283,7 +283,11 @@ impl Store { // We need to know about all the stack maps of all instantiated modules // so when performing a GC we know about all wasm frames that we find // on the stack. - self.register_stack_maps(module.compiled_module()); + if let Some(lookup) = module.compiled_module().stack_maps() { + let (start, end) = module.compiled_module().code().range(); + self.stack_map_registry() + .register_stack_maps(start, end, lookup); + } // Signatures are loaded into our `SignatureRegistry` here // once-per-module (and once-per-signature). This allows us to create @@ -292,18 +296,6 @@ impl Store { self.signatures().borrow_mut().register_module(module); } - fn register_stack_maps(&self, module: &CompiledModule) { - self.stack_map_registry() - .register_stack_maps(module.stack_maps().map(|(func, stack_maps)| unsafe { - let ptr = (*func).as_ptr(); - let len = (*func).len(); - let start = ptr as usize; - let end = ptr as usize + len; - let range = start..end; - (range, stack_maps) - })); - } - pub(crate) fn bump_resource_counts(&self, module: &Module) -> Result<()> { let config = self.engine().config();