bytecodealliance · elliottt · Aug 24, 2022 · Aug 24, 2022 · Aug 24, 2022 · Aug 24, 2022
@@ -1395,6 +1395,9 @@
 (decl avx512bitalg_enabled () Type)
 (extern extractor avx512bitalg_enabled avx512bitalg_enabled)
 
+(decl avx512vbmi_enabled () Type)
+(extern extractor avx512vbmi_enabled avx512vbmi_enabled)
+
 (decl use_lzcnt () Type)
 (extern extractor use_lzcnt use_lzcnt)
 
@@ -2735,6 +2738,19 @@
                      src1
                      src2))
 
+;; Helper for creating `vpermi2b` instructions. `src3` holds the byte indices:
+;; it is copied into `dst`, which `vpermi2b` reads and then overwrites.
+;; Requires AVX-512 vl and vbmi extensions.
+(decl x64_vpermi2b (Xmm Xmm Xmm) Xmm)
+(rule (x64_vpermi2b src1 src2 src3)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (gen_move $I8X16 dst src3)))
+            (_ Unit (emit (MInst.XmmRmREvex (Avx512Opcode.Vpermi2b)
+                                            src1
+                                            src2
+                                            dst))))
+        dst))
+
 ;; Helper for creating `MInst.MulHi` instructions.
 ;;
 ;; Returns the (lo, hi) register halves of the multiplication.
@@ -3621,6 +3637,47 @@
       (let ((dst WritableGpr (pinned_writable_gpr)))
         (SideEffectNoResult.Inst (gen_move $I64 dst val))))
 
+;;;; Shuffle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Produce a mask suitable for use with `pshufb` for permuting the argument to
+;; shuffle, when the arguments are the same (i.e. `shuffle a a mask`). This will
+;; map all indices in the range 0..31 to the range 0..15.
+(decl shuffle_0_31_mask (VecMask) VCodeConstant)
+(extern constructor shuffle_0_31_mask shuffle_0_31_mask)
+
+;; Produce a mask suitable for use with `pshufb` for permuting the lhs of a
+;; `shuffle` operation (lanes 0-15).
+(decl shuffle_0_15_mask (VecMask) VCodeConstant)
+(extern constructor shuffle_0_15_mask shuffle_0_15_mask)
+
+;; Produce a mask suitable for use with `pshufb` for permuting the rhs of a
+;; `shuffle` operation (lanes 16-31).
+(decl shuffle_16_31_mask (VecMask) VCodeConstant)
+(extern constructor shuffle_16_31_mask shuffle_16_31_mask)
+
+;; Produce a permutation suitable for use with `vpermi2b`, for permuting two
+;; I8X16 vectors simultaneously.
+;;
+;; NOTE: `vpermi2b` will mask the indices in each lane to 5 bits when indexing
+;; into vectors, so this constructor makes no effort to handle indices that are
+;; larger than 31. If you are lowering a clif opcode like `shuffle` that has
+;; special behavior for out of bounds indices (emitting a `0` in the resulting
+;; vector in the case of `shuffle`) you'll need to handle that behavior
+;; separately.
+(decl perm_from_mask (VecMask) VCodeConstant)
+(extern constructor perm_from_mask perm_from_mask)
+
+;; If the mask that would be given to `shuffle` contains any out-of-bounds
+;; indices, return the permutation for `vpermi2b` plus a mask that zeroes them.
+(decl perm_from_mask_with_zeros (VCodeConstant VCodeConstant) VecMask)
+(extern extractor perm_from_mask_with_zeros perm_from_mask_with_zeros)
+
+;;;; Swizzle ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
+(decl swizzle_zero_mask () VCodeConstant)
+(extern constructor swizzle_zero_mask swizzle_zero_mask)
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (convert Gpr InstOutput output_gpr)

@@ -66,6 +66,18 @@ impl Inst {
             dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()),
         }
     }
+
+    fn xmm_rm_r_evex(op: Avx512Opcode, src1: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
+        src1.assert_regclass_is(RegClass::Float);
+        debug_assert!(src2.class() == RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmREvex {
+            op,
+            src1: XmmMem::new(src1).unwrap(),
+            src2: Xmm::new(src2).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
 }
 
 #[test]

@@ -316,23 +316,6 @@ impl Inst {
         }
     }
 
-    pub(crate) fn xmm_rm_r_evex(
-        op: Avx512Opcode,
-        src1: RegMem,
-        src2: Reg,
-        dst: Writable<Reg>,
-    ) -> Self {
-        src1.assert_regclass_is(RegClass::Float);
-        debug_assert!(src2.class() == RegClass::Float);
-        debug_assert!(dst.to_reg().class() == RegClass::Float);
-        Inst::XmmRmREvex {
-            op,
-            src1: XmmMem::new(src1).unwrap(),
-            src2: Xmm::new(src2).unwrap(),
-            dst: WritableXmm::from_writable_reg(dst).unwrap(),
-        }
-    }
-
     pub(crate) fn xmm_uninit_value(dst: Writable<Reg>) -> Self {
         debug_assert!(dst.to_reg().class() == RegClass::Float);
         Inst::XmmUninitializedValue {

@@ -3510,3 +3510,50 @@
 ;; register allocator a definition for the output virtual register.
 (rule (lower (raw_bitcast val))
       (put_in_regs val))
+
+;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
+;; register. `shuffle_0_31_mask` statically builds a mask that zeroes out any
+;; out-of-range lane indices (may not be completely necessary: verification
+;; could reject invalid mask values) and remaps indices 16..31 onto 0..15.
+(rule (lower (shuffle a a (vec_mask_from_immediate mask)))
+      (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask))))
+
+;; For the case where the shuffle mask contains out-of-bounds values (values
+;; greater than 31) we must mask off those resulting values in the result of
+;; `vpermi2b`.
+(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
+                       (shuffle a b (vec_mask_from_immediate
+                                      (perm_from_mask_with_zeros mask zeros)))))
+      (x64_andps
+        (x64_xmm_load_const $I8X16 zeros)
+        (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask))))
+
+;; However, if the shuffle mask contains no out-of-bounds values, we can use
+;; `vpermi2b` without any masking.
+(rule (lower (has_type (and (avx512vl_enabled) (avx512vbmi_enabled))
+                       (shuffle a b (vec_mask_from_immediate mask))))
+      (x64_vpermi2b b a (x64_xmm_load_const $I8X16 (perm_from_mask mask))))
+
+;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR
+;; them together. This is necessary due to PSHUFB semantics. As in the
+;; same-operand case above, the mask for each PSHUFB is built statically.
+(rule (lower (shuffle a b (vec_mask_from_immediate mask)))
+      (x64_por
+        (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask)))
+        (x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask)))))
+
+;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; SIMD swizzle; the following inefficient implementation is due to the Wasm
+;; SIMD spec requiring mask indexes greater than 15 to produce a zero in the
+;; corresponding result lane. For the spec discussion, see
+;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the
+;; Wasm SIMD semantics for this instruction. The instruction format maps to
+;; variables like: %dst = swizzle %src, %mask
+(rule (lower (swizzle src mask))
+      (let ((mask Xmm (x64_paddusb
+                        mask
+                        (x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
+        (x64_pshufb src mask)))
@@ -3,7 +3,6 @@
 // ISLE integration glue.
 pub(super) mod isle;
 
-use crate::data_value::DataValue;
 use crate::ir::{types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type};
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
@@ -585,139 +584,14 @@ fn lower_insn_to_regs(
         | Opcode::SetPinnedReg
         | Opcode::Vconst
         | Opcode::RawBitcast
-        | Opcode::Insertlane => {
+        | Opcode::Insertlane
+        | Opcode::Shuffle
+        | Opcode::Swizzle => {
             implemented_in_isle(ctx);
         }
 
         Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"),
 
-        Opcode::Shuffle => {
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let lhs_ty = ctx.input_ty(insn, 0);
-            let lhs = put_input_in_reg(ctx, inputs[0]);
-            let rhs = put_input_in_reg(ctx, inputs[1]);
-            let mask = match ctx.get_immediate(insn) {
-                Some(DataValue::V128(bytes)) => bytes.to_vec(),
-                _ => unreachable!("shuffle should always have a 16-byte immediate"),
-            };
-
-            // A mask-building helper: in 128-bit SIMD, 0-15 indicate which lane to read from and a
-            // 1 in the most significant position zeroes the lane.
-            let zero_unknown_lane_index = |b: u8| if b > 15 { 0b10000000 } else { b };
-
-            ctx.emit(Inst::gen_move(dst, rhs, ty));
-            if rhs == lhs {
-                // If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
-                // register. We statically build `constructed_mask` to zero out any unknown lane
-                // indices (may not be completely necessary: verification could fail incorrect mask
-                // values) and fix the indexes to all point to the `dst` vector.
-                let constructed_mask = mask
-                    .iter()
-                    // If the mask is greater than 15 it still may be referring to a lane in b.
-                    .map(|&b| if b > 15 { b.wrapping_sub(16) } else { b })
-                    .map(zero_unknown_lane_index)
-                    .collect();
-                let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                let tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                ctx.emit(Inst::xmm_load_const(constant, tmp, ty));
-                // After loading the constructed mask in a temporary register, we use this to
-                // shuffle the `dst` register (remember that, in this case, it is the same as
-                // `src` so we disregard this register).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
-            } else {
-                if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512vbmi_simd() {
-                    assert!(
-                        mask.iter().all(|b| *b < 32),
-                        "shuffle mask values must be between 0 and 31"
-                    );
-
-                    // Load the mask into the destination register.
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(mask.into()));
-                    ctx.emit(Inst::xmm_load_const(constant, dst, ty));
-
-                    // VPERMI2B has the exact semantics of Wasm's shuffle:
-                    // permute the bytes in `src1` and `src2` using byte indexes
-                    // in `dst` and store the byte results in `dst`.
-                    ctx.emit(Inst::xmm_rm_r_evex(
-                        Avx512Opcode::Vpermi2b,
-                        RegMem::reg(rhs),
-                        lhs,
-                        dst,
-                    ));
-                } else {
-                    // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
-                    // them together. This is necessary due to PSHUFB semantics. As in the case above,
-                    // we build the `constructed_mask` for each case statically.
-
-                    // PSHUFB the `lhs` argument into `tmp0`, placing zeroes for unused lanes.
-                    let tmp0 = ctx.alloc_tmp(lhs_ty).only_reg().unwrap();
-                    ctx.emit(Inst::gen_move(tmp0, lhs, lhs_ty));
-                    let constructed_mask =
-                        mask.iter().cloned().map(zero_unknown_lane_index).collect();
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                    let tmp1 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_load_const(constant, tmp1, ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
-
-                    // PSHUFB the second argument, placing zeroes for unused lanes.
-                    let constructed_mask = mask
-                        .iter()
-                        .map(|b| b.wrapping_sub(16))
-                        .map(zero_unknown_lane_index)
-                        .collect();
-                    let constant = ctx.use_constant(VCodeConstantData::Generated(constructed_mask));
-                    let tmp2 = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-                    ctx.emit(Inst::xmm_load_const(constant, tmp2, ty));
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
-
-                    // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
-                    // is not important).
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
-                }
-            }
-        }
-
-        Opcode::Swizzle => {
-            // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec
-            // requiring mask indexes greater than 15 to have the same semantics as a 0 index. For
-            // the spec discussion, see https://github.com/WebAssembly/simd/issues/93. The CLIF
-            // semantics match the Wasm SIMD semantics for this instruction.
-            // The instruction format maps to variables like: %dst = swizzle %src, %mask
-            let ty = ty.unwrap();
-            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let src = put_input_in_reg(ctx, inputs[0]);
-            let swizzle_mask = put_input_in_reg(ctx, inputs[1]);
-
-            // Inform the register allocator that `src` and `dst` should be in the same register.
-            ctx.emit(Inst::gen_move(dst, src, ty));
-
-            // Create a mask for zeroing out-of-bounds lanes of the swizzle mask.
-            let zero_mask = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            static ZERO_MASK_VALUE: [u8; 16] = [
-                0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                0x70, 0x70,
-            ];
-            let constant = ctx.use_constant(VCodeConstantData::WellKnown(&ZERO_MASK_VALUE));
-            ctx.emit(Inst::xmm_load_const(constant, zero_mask, ty));
-
-            // Use the `zero_mask` on a writable `swizzle_mask`.
-            let swizzle_mask_tmp = ctx.alloc_tmp(types::I8X16).only_reg().unwrap();
-            ctx.emit(Inst::gen_move(swizzle_mask_tmp, swizzle_mask, ty));
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Paddusb,
-                RegMem::from(zero_mask),
-                swizzle_mask_tmp,
-            ));
-
-            // Shuffle `dst` using the fixed-up `swizzle_mask`.
-            ctx.emit(Inst::xmm_rm_r(
-                SseOpcode::Pshufb,
-                RegMem::from(swizzle_mask_tmp),
-                dst,
-            ));
-        }
-
         Opcode::Extractlane => {
             // The instruction format maps to variables like: %dst = extractlane %src, %lane
             let ty = ty.unwrap();