diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index 0f6fcaa62174..a72a05b2a8ec 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -456,7 +456,6 @@ impl WastTest { "misc_testsuite/simd/spillslot-size-fuzzbug.wast", "misc_testsuite/simd/unaligned-load.wast", "multi-memory/simd_memory-multi.wast", - "spec_testsuite/simd_align.wast", "spec_testsuite/simd_bit_shift.wast", "spec_testsuite/simd_bitwise.wast", "spec_testsuite/simd_boolean.wast", @@ -526,6 +525,16 @@ impl WastTest { if unsupported.iter().any(|part| self.path.ends_with(part)) { return true; } + + // SIMD on Winch requires AVX instructions. + #[cfg(target_arch = "x86_64")] + if !(std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("avx2")) { + let unsupported = ["spec_testsuite/simd_align.wast"]; + + if unsupported.iter().any(|part| self.path.ends_with(part)) { + return true; + } + } } for part in self.path.iter() { diff --git a/tests/disas/winch/x64/load/v128_load16_splat_avx2.wat b/tests/disas/winch/x64/load/v128_load16_splat_avx2.wat new file mode 100644 index 000000000000..5350689d30a8 --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load16_splat_avx2.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx=true", "-Ccranelift-has-avx2=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load16_splat") (result v128) (v128.load16_splat (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpbroadcastw (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load16x4_s_avx.wat b/tests/disas/winch/x64/load/v128_load16x4_s_avx.wat new file mode 100644 index 000000000000..2e95463b2d7e --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load16x4_s_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load16x4_s") (result v128) (v128.load16x4_s (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovsxwd (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load16x4_u_avx.wat b/tests/disas/winch/x64/load/v128_load16x4_u_avx.wat new file mode 100644 index 000000000000..641275ac2e5d --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load16x4_u_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load16x4_u") (result v128) (v128.load16x4_u (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovzxwd (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load32_splat_avx2.wat b/tests/disas/winch/x64/load/v128_load32_splat_avx2.wat new file mode 100644 index 000000000000..a932a1e9e70a --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load32_splat_avx2.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx=true", "-Ccranelift-has-avx2=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load32_splat") (result v128) (v128.load32_splat (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpbroadcastd (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load32x2_s_avx.wat b/tests/disas/winch/x64/load/v128_load32x2_s_avx.wat new file mode 100644 index 000000000000..ca2f20219f39 --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load32x2_s_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load32x2_s") (result v128) (v128.load32x2_s (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovsxdq (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load32x2_u_avx.wat b/tests/disas/winch/x64/load/v128_load32x2_u_avx.wat new file mode 100644 index 000000000000..b81f42c01b6b --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load32x2_u_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load32x2_u") (result v128) (v128.load32x2_u (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovzxdq (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load64_splat_avx.wat b/tests/disas/winch/x64/load/v128_load64_splat_avx.wat new file mode 100644 index 000000000000..4170e9ba1b67 --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load64_splat_avx.wat @@ -0,0 +1,30 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load64_splat") (result v128) (v128.load64_splat (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x47 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; movsd (%rcx), %xmm0 +;; vpshufd $0x44, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 47: ud2 diff --git a/tests/disas/winch/x64/load/v128_load8_splat_avx2.wat b/tests/disas/winch/x64/load/v128_load8_splat_avx2.wat new file mode 100644 index 000000000000..57dc051380df --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load8_splat_avx2.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx=true", "-Ccranelift-has-avx2=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load8_splat") (result v128) (v128.load8_splat (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpbroadcastb (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load8x8_s_avx.wat b/tests/disas/winch/x64/load/v128_load8x8_s_avx.wat new file mode 100644 index 000000000000..58e8c44a3c90 --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load8x8_s_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load8x8_s") (result v128) (v128.load8x8_s (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovsxbw (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/tests/disas/winch/x64/load/v128_load8x8_u_avx.wat b/tests/disas/winch/x64/load/v128_load8x8_u_avx.wat new file mode 100644 index 000000000000..3611ca60ee0b --- /dev/null +++ b/tests/disas/winch/x64/load/v128_load8x8_u_avx.wat @@ -0,0 +1,29 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx=true" ] + +(module + (memory (data "\00\00\00\00\00\00\00\00\00\00\00\00\00\00\a0\7f")) + + (func (export "v128.load8x8_u") (result v128) (v128.load8x8_u (i32.const 0))) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0, %eax +;; movq 0x60(%r14), %rcx +;; addq %rax, %rcx +;; vpmovzxbw (%rcx), %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 diff --git a/winch/codegen/src/codegen/error.rs b/winch/codegen/src/codegen/error.rs index 8834aefbb1a6..674468741d4d 100644 --- a/winch/codegen/src/codegen/error.rs +++ b/winch/codegen/src/codegen/error.rs @@ -17,6 +17,12 @@ pub(crate) enum CodeGenError { /// Unimplemented MacroAssembler instruction. #[error("Unimplemented Masm instruction")] UnimplementedMasmInstruction, + /// Unimplemented Wasm load kind. 
+ #[error("Unimplemented Wasm load kind")] + UnimplementedWasmLoadKind, + /// Unimplemented due to requiring AVX. + #[error("Instruction not implemented for CPUs without AVX support")] + UnimplementedForNoAvx, /// Unsupported eager initialization of tables. #[error("Unsupported eager initialization of tables")] UnsupportedTableEagerInit, diff --git a/winch/codegen/src/codegen/mod.rs b/winch/codegen/src/codegen/mod.rs index a3e9b99411c0..9c7e3652e0f0 100644 --- a/winch/codegen/src/codegen/mod.rs +++ b/winch/codegen/src/codegen/mod.rs @@ -3,8 +3,8 @@ use crate::{ codegen::BlockSig, isa::reg::{writable, Reg}, masm::{ - ExtendKind, IntCmpKind, MacroAssembler, MemOpKind, OperandSize, RegImm, SPOffset, - ShiftKind, TrapCode, + IntCmpKind, LoadKind, MacroAssembler, MemOpKind, OperandSize, RegImm, SPOffset, ShiftKind, + TrapCode, }, stack::TypedReg, }; @@ -846,7 +846,7 @@ where arg: &MemArg, ty: WasmValType, size: OperandSize, - sextend: Option, + kind: LoadKind, op_kind: MemOpKind, ) -> Result<()> { if let Some(addr) = self.emit_compute_heap_address(&arg, size)? 
{ @@ -859,7 +859,7 @@ where let src = self.masm.address_at_reg(addr, 0)?; self.masm - .wasm_load(src, writable!(dst), size, sextend, op_kind)?; + .wasm_load(src, writable!(dst), size, kind, op_kind)?; self.context.stack.push(TypedReg::new(ty, dst).into()); self.context.free_reg(addr); } diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index 549f1932cc8c..6549e9d5dce0 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -12,7 +12,7 @@ use crate::{ CallingConvention, }, masm::{ - CalleeKind, DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, + CalleeKind, DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, LoadKind, MacroAssembler as Masm, MemOpKind, MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, SPOffset, ShiftKind, StackSlot, TrapCode, TruncKind, }, @@ -215,24 +215,28 @@ impl Masm for MacroAssembler { src: Self::Address, dst: WritableReg, size: OperandSize, - kind: Option, + kind: LoadKind, op_kind: MemOpKind, ) -> Result<()> { match op_kind { - MemOpKind::Normal => { - // kind is some if the value is signed - // unlike x64, unused bits are set to zero so we don't need to extend - if kind.is_some() { - self.asm.sload(src, dst, size); - } else { - self.asm.uload(src, dst, size); + MemOpKind::Normal => match kind { + LoadKind::Simple => self.asm.uload(src, dst, size), + LoadKind::Splat => bail!(CodeGenError::UnimplementedWasmLoadKind), + LoadKind::ScalarExtend(extend_kind) => { + if extend_kind.signed() { + self.asm.sload(src, dst, size) + } else { + // unlike x64, unused bits are set to zero so we don't need to extend + self.asm.uload(src, dst, size) + } } - - Ok(()) - } - - MemOpKind::Atomic => Err(anyhow!(CodeGenError::unimplemented_masm_instruction())), + LoadKind::VectorExtend(_vector_extend_kind) => { + bail!(CodeGenError::UnimplementedWasmLoadKind) + } + }, + MemOpKind::Atomic => bail!(CodeGenError::unimplemented_masm_instruction()), } + Ok(()) } fn 
load_addr(&mut self, src: Self::Address, dst: WritableReg, size: OperandSize) -> Result<()> { diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index 4cae0f512dfb..27f20ee84b76 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -3,7 +3,8 @@ use crate::{ isa::{reg::Reg, CallingConvention}, masm::{ - DivKind, ExtendKind, IntCmpKind, MulWideKind, OperandSize, RemKind, RoundingMode, ShiftKind, + DivKind, ExtendKind, IntCmpKind, MulWideKind, OperandSize, RemKind, RoundingMode, + ShiftKind, VectorExtendKind, }, }; use cranelift_codegen::{ @@ -15,8 +16,8 @@ use cranelift_codegen::{ unwind::UnwindInst, x64::{ args::{ - self, AluRmiROpcode, Amode, CmpOpcode, DivSignedness, ExtMode, FromWritableReg, - Gpr, GprMem, GprMemImm, Imm8Gpr, Imm8Reg, RegMem, RegMemImm, + self, AluRmiROpcode, Amode, AvxOpcode, CmpOpcode, DivSignedness, ExtMode, + FromWritableReg, Gpr, GprMem, GprMemImm, Imm8Gpr, Imm8Reg, RegMem, RegMemImm, ShiftKind as CraneliftShiftKind, SseOpcode, SyntheticAmode, WritableGpr, WritableXmm, Xmm, XmmMem, XmmMemAligned, CC, }, @@ -471,6 +472,90 @@ impl Assembler { }); } + /// Vector load and extend. 
+ pub fn xmm_vpmov_mr( + &mut self, + src: &Address, + dst: WritableReg, + ext: VectorExtendKind, + flags: MemFlags, + ) { + assert!(dst.to_reg().is_float()); + + let op = match ext { + VectorExtendKind::V128Extend8x8S => AvxOpcode::Vpmovsxbw, + VectorExtendKind::V128Extend8x8U => AvxOpcode::Vpmovzxbw, + VectorExtendKind::V128Extend16x4S => AvxOpcode::Vpmovsxwd, + VectorExtendKind::V128Extend16x4U => AvxOpcode::Vpmovzxwd, + VectorExtendKind::V128Extend32x2S => AvxOpcode::Vpmovsxdq, + VectorExtendKind::V128Extend32x2U => AvxOpcode::Vpmovzxdq, + }; + + let src = Self::to_synthetic_amode( + src, + &mut self.pool, + &mut self.constants, + &mut self.buffer, + flags, + ); + + self.emit(Inst::XmmUnaryRmRVex { + op, + src: XmmMem::unwrap_new(RegMem::mem(src)), + dst: dst.to_reg().into(), + }); + } + + /// Vector load and broadcast. + pub fn xmm_vpbroadcast_mr( + &mut self, + src: &Address, + dst: WritableReg, + size: OperandSize, + flags: MemFlags, + ) { + assert!(dst.to_reg().is_float()); + + let src = Self::to_synthetic_amode( + src, + &mut self.pool, + &mut self.constants, + &mut self.buffer, + flags, + ); + + let op = match size { + OperandSize::S8 => AvxOpcode::Vpbroadcastb, + OperandSize::S16 => AvxOpcode::Vpbroadcastw, + OperandSize::S32 => AvxOpcode::Vpbroadcastd, + _ => unimplemented!(), + }; + + self.emit(Inst::XmmUnaryRmRVex { + op, + src: XmmMem::unwrap_new(RegMem::mem(src)), + dst: dst.to_reg().into(), + }); + } + + /// Shuffle of bytes in vector. + pub fn xmm_vpshuf_rr(&mut self, src: Reg, dst: WritableReg, mask: u8, size: OperandSize) { + assert!(src.is_float() && dst.to_reg().is_float()); + + let op = match size { + OperandSize::S16 => AvxOpcode::Vpshuflw, + OperandSize::S64 => AvxOpcode::Vpshufd, + _ => unimplemented!(), + }; + + self.emit(Inst::XmmUnaryRmRImmVex { + op, + src: XmmMem::from(Xmm::from(src)), + imm: mask, + dst: dst.to_reg().into(), + }); + } + /// Single and double precision floating point store. 
pub fn xmm_mov_rm(&mut self, src: Reg, dst: &Address, size: OperandSize, flags: MemFlags) { use OperandSize::*; diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index b9010083b0f8..fa43eeddbbf5 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -7,9 +7,9 @@ use super::{ use anyhow::{anyhow, bail, Result}; use crate::masm::{ - DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, MacroAssembler as Masm, MemOpKind, - MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, ShiftKind, TrapCode, TruncKind, - TRUSTED_FLAGS, UNTRUSTED_FLAGS, + DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, LoadKind, MacroAssembler as Masm, + MemOpKind, MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, ShiftKind, TrapCode, + TruncKind, TRUSTED_FLAGS, UNTRUSTED_FLAGS, }; use crate::{ abi::{self, align_to, calculate_frame_adjustment, LocalSlot}, @@ -30,10 +30,12 @@ use crate::{ use cranelift_codegen::{ binemit::CodeOffset, ir::{MemFlags, RelSourceLoc, SourceLoc}, - isa::unwind::UnwindInst, - isa::x64::{ - args::{ExtMode, CC}, - settings as x64_settings, + isa::{ + unwind::UnwindInst, + x64::{ + args::{ExtMode, CC}, + settings as x64_settings, + }, }, settings, Final, MachBufferFinalized, MachLabel, }; @@ -279,7 +281,7 @@ impl Masm for MacroAssembler { src: Self::Address, dst: WritableReg, size: OperandSize, - kind: Option, + kind: LoadKind, op_kind: MemOpKind, ) -> Result<()> { if op_kind == MemOpKind::Atomic && size == OperandSize::S128 { @@ -287,14 +289,51 @@ impl Masm for MacroAssembler { bail!(CodeGenError::unexpected_operand_size()) } - // The guarantees of the x86-64 memory model ensure that `SeqCst` - // loads are equivalent to normal loads. 
- if let Some(ext) = kind { - self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS); - Ok(()) - } else { - self.load_impl::(src, dst, size, UNTRUSTED_FLAGS) + match kind { + // The guarantees of the x86-64 memory model ensure that `SeqCst` + // loads are equivalent to normal loads. + LoadKind::ScalarExtend(ext) => self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS), + LoadKind::Simple => self.load_impl::(src, dst, size, UNTRUSTED_FLAGS)?, + LoadKind::VectorExtend(ext) => match op_kind { + MemOpKind::Normal => { + if self.flags.has_avx() { + self.asm.xmm_vpmov_mr(&src, dst, ext, UNTRUSTED_FLAGS) + } else { + bail!(CodeGenError::UnimplementedForNoAvx) + } + } + MemOpKind::Atomic => bail!(CodeGenError::unimplemented_masm_instruction()), + }, + LoadKind::Splat => { + match op_kind { + MemOpKind::Normal => { + if self.flags.has_avx() { + if size == OperandSize::S64 { + self.asm + .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS); + // vpshufd with mask 0b0100_0100 selects dwords 0 and 1 + // twice, duplicating the loaded low 8 bytes into both halves. + // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields + // [d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7]. + self.asm.xmm_vpshuf_rr( + dst.to_reg(), + dst, + 0b0100_0100, + OperandSize::S64, + ); + } else { + self.asm + .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS); + } + } else { + bail!(CodeGenError::UnimplementedForNoAvx) + } + } + MemOpKind::Atomic => bail!(CodeGenError::unimplemented_masm_instruction()), + } + } + } + Ok(()) } fn sp_offset(&self) -> Result { diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index de7dcd1f8aba..6b9dbbb60032 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -234,6 +234,36 @@ impl ExtendKind { } } +/// Kinds of vector extends in WebAssembly. Each MacroAssembler implementation +/// is responsible for emitting the correct sequence of instructions when +/// lowering to machine code. 
+pub(crate) enum VectorExtendKind { + /// Sign extends eight 8 bit integers to eight 16 bit lanes. + V128Extend8x8S, + /// Zero extends eight 8 bit integers to eight 16 bit lanes. + V128Extend8x8U, + /// Sign extends four 16 bit integers to four 32 bit lanes. + V128Extend16x4S, + /// Zero extends four 16 bit integers to four 32 bit lanes. + V128Extend16x4U, + /// Sign extends two 32 bit integers to two 64 bit lanes. + V128Extend32x2S, + /// Zero extends two 32 bit integers to two 64 bit lanes. + V128Extend32x2U, +} + +/// Kinds of behavior supported by Wasm loads. +pub(crate) enum LoadKind { + /// Do not extend or splat. + Simple, + /// Duplicate value into vector lanes. + Splat, + /// Scalar (non-vector) extend. + ScalarExtend(ExtendKind), + /// Vector extend. + VectorExtend(VectorExtendKind), +} + /// Operand size, in bits. #[derive(Copy, Debug, Clone, Eq, PartialEq)] pub(crate) enum OperandSize { @@ -654,7 +684,7 @@ pub(crate) trait MacroAssembler { src: Self::Address, dst: WritableReg, size: OperandSize, - ext_kind: Option, + kind: LoadKind, op_kind: MemOpKind, ) -> Result<()>; diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 9e5be8e6dc9f..7502cef44c68 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -9,8 +9,9 @@ use crate::codegen::{ control_index, Callee, CodeGen, CodeGenError, ControlStackFrame, Emission, FnCall, }; use crate::masm::{ - DivKind, ExtendKind, FloatCmpKind, IntCmpKind, MacroAssembler, MemMoveDirection, MemOpKind, - MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, SPOffset, ShiftKind, TruncKind, + DivKind, ExtendKind, FloatCmpKind, IntCmpKind, LoadKind, MacroAssembler, MemMoveDirection, + MemOpKind, MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, SPOffset, ShiftKind, + TruncKind, VectorExtendKind, }; use crate::reg::{writable, Reg}; @@ -260,7 +261,16 @@ macro_rules! 
def_unsupported { (emit I64AtomicLoad16U $($rest:tt)*) => {}; (emit I64AtomicLoad32U $($rest:tt)*) => {}; (emit I64AtomicLoad $($rest:tt)*) => {}; - + (emit V128Load8x8S $($rest:tt)*) => {}; + (emit V128Load8x8U $($rest:tt)*) => {}; + (emit V128Load16x4S $($rest:tt)*) => {}; + (emit V128Load16x4U $($rest:tt)*) => {}; + (emit V128Load32x2S $($rest:tt)*) => {}; + (emit V128Load32x2U $($rest:tt)*) => {}; + (emit V128Load8Splat $($rest:tt)*) => {}; + (emit V128Load16Splat $($rest:tt)*) => {}; + (emit V128Load32Splat $($rest:tt)*) => {}; + (emit V128Load64Splat $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -1932,7 +1942,7 @@ where &memarg, WasmValType::I32, OperandSize::S32, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -1942,7 +1952,7 @@ where &memarg, WasmValType::I32, OperandSize::S8, - Some(ExtendKind::I32Extend8S), + LoadKind::ScalarExtend(ExtendKind::I32Extend8S), MemOpKind::Normal, ) } @@ -1952,7 +1962,7 @@ where &memarg, WasmValType::I32, OperandSize::S8, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -1962,7 +1972,7 @@ where &memarg, WasmValType::I32, OperandSize::S16, - Some(ExtendKind::I32Extend16S), + LoadKind::ScalarExtend(ExtendKind::I32Extend16S), MemOpKind::Normal, ) } @@ -1972,7 +1982,7 @@ where &memarg, WasmValType::I32, OperandSize::S16, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -1994,7 +2004,7 @@ where &memarg, WasmValType::I64, OperandSize::S8, - Some(ExtendKind::I64Extend8S), + LoadKind::ScalarExtend(ExtendKind::I64Extend8S), MemOpKind::Normal, ) } @@ -2004,7 +2014,7 @@ where &memarg, WasmValType::I64, OperandSize::S8, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2014,7 +2024,7 @@ where &memarg, WasmValType::I64, OperandSize::S16, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2024,7 +2034,7 @@ where &memarg, WasmValType::I64, OperandSize::S16, - Some(ExtendKind::I64Extend16S), + LoadKind::ScalarExtend(ExtendKind::I64Extend16S), MemOpKind::Normal, ) } @@ -2034,7 +2044,7 @@ 
where &memarg, WasmValType::I64, OperandSize::S32, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2044,7 +2054,7 @@ where &memarg, WasmValType::I64, OperandSize::S32, - Some(ExtendKind::I64Extend32S), + LoadKind::ScalarExtend(ExtendKind::I64Extend32S), MemOpKind::Normal, ) } @@ -2054,7 +2064,7 @@ where &memarg, WasmValType::I64, OperandSize::S64, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2080,7 +2090,7 @@ where &memarg, WasmValType::F32, OperandSize::S32, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2094,7 +2104,7 @@ where &memarg, WasmValType::F64, OperandSize::S64, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2210,7 +2220,7 @@ where &memarg, WasmValType::I32, OperandSize::S8, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2220,7 +2230,7 @@ where &memarg, WasmValType::I32, OperandSize::S16, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2230,7 +2240,7 @@ where &memarg, WasmValType::I32, OperandSize::S32, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2240,7 +2250,7 @@ where &memarg, WasmValType::I64, OperandSize::S8, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2250,7 +2260,7 @@ where &memarg, WasmValType::I64, OperandSize::S16, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2260,7 +2270,7 @@ where &memarg, WasmValType::I64, OperandSize::S32, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2270,7 +2280,7 @@ where &memarg, WasmValType::I64, OperandSize::S64, - None, + LoadKind::Simple, MemOpKind::Atomic, ) } @@ -2293,7 +2303,7 @@ where &memarg, WasmValType::V128, OperandSize::S128, - None, + LoadKind::Simple, MemOpKind::Normal, ) } @@ -2302,6 +2312,106 @@ where self.emit_wasm_store(&memarg, OperandSize::S128) } + fn visit_v128_load8x8_s(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend8x8S), + MemOpKind::Normal, + ) + } + + fn visit_v128_load8x8_u(&mut 
self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend8x8U), + MemOpKind::Normal, + ) + } + + fn visit_v128_load16x4_s(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend16x4S), + MemOpKind::Normal, + ) + } + + fn visit_v128_load16x4_u(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend16x4U), + MemOpKind::Normal, + ) + } + + fn visit_v128_load32x2_s(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend32x2S), + MemOpKind::Normal, + ) + } + + fn visit_v128_load32x2_u(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::VectorExtend(VectorExtendKind::V128Extend32x2U), + MemOpKind::Normal, + ) + } + + fn visit_v128_load8_splat(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S8, + LoadKind::Splat, + MemOpKind::Normal, + ) + } + + fn visit_v128_load16_splat(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S16, + LoadKind::Splat, + MemOpKind::Normal, + ) + } + + fn visit_v128_load32_splat(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S32, + LoadKind::Splat, + MemOpKind::Normal, + ) + } + + fn visit_v128_load64_splat(&mut self, memarg: MemArg) -> Self::Output { + self.emit_wasm_load( + &memarg, + WasmValType::V128, + OperandSize::S64, + LoadKind::Splat, + MemOpKind::Normal, + ) + } + 
wasmparser::for_each_visit_simd_operator!(def_unsupported); }