bytecodealliance · fitzgen · Nov 16, 2021 · Nov 5, 2021
@@ -329,6 +329,19 @@
             Vpmullq
             Vpopcntb))
 
+(type FcmpImm extern
+      (enum Equal
+            LessThan
+            LessThanOrEqual
+            Unordered
+            NotEqual
+            UnorderedOrGreaterThanOrEqual
+            UnorderedOrGreaterThan
+            Ordered))
+
+(decl encode_fcmp_imm (FcmpImm) u8)
+(extern constructor encode_fcmp_imm encode_fcmp_imm)
+
 ;;;; Helpers for Querying Enabled ISA Extensions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl avx512vl_enabled () Type)
@@ -450,6 +463,49 @@
 (rule (extend (ExtendKind.Sign) ty mode src)
       (movsx ty mode src))
 
+;;;; Helpers for Working with SSE tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Determine the appropriate operation for xor-ing vectors of the specified type
+(decl sse_xor_op (Type) SseOpcode)
+(rule (sse_xor_op $F32X4) (SseOpcode.Xorps))
+(rule (sse_xor_op $F64X2) (SseOpcode.Xorpd))
+(rule (sse_xor_op (multi_lane _bits _lanes)) (SseOpcode.Pxor))
+
+;; Performs an xor operation of the two operands specified
+(decl sse_xor (Type Reg RegMem) Reg)
+(rule (sse_xor ty x y) (xmm_rm_r ty (sse_xor_op ty) x y))
+
+;; Determine the appropriate operation to compare two vectors of the specified
+;; type.
+(decl sse_cmp_op (Type) SseOpcode)
+(rule (sse_cmp_op (multi_lane 8 16)) (SseOpcode.Pcmpeqb))
+(rule (sse_cmp_op (multi_lane 16 8)) (SseOpcode.Pcmpeqw))
+(rule (sse_cmp_op (multi_lane 32 4)) (SseOpcode.Pcmpeqd))
+(rule (sse_cmp_op (multi_lane 64 2)) (SseOpcode.Pcmpeqq))
+(rule (sse_cmp_op $F32X4) (SseOpcode.Cmpps))
+(rule (sse_cmp_op $F64X2) (SseOpcode.Cmppd))
+
+;; Generates a register value which has an all-ones pattern of the specified
+;; type.
+;;
+;; Note that this is accomplished by comparing a fresh register with itself,
+;; which for integers is always true. Also note that the comparison is always
+;; done for integers, it doesn't actually take the input `ty` into account. This
+;; is because we're comparing a fresh register to itself and we don't know the
+;; previous contents of the register. If a floating-point comparison is used
+;; then it runs the risk of comparing NaN against NaN and not actually producing
+;; an all-ones mask. By using integer comparison operations we're guaranteed
+;; that everything is equal to itself.
+(decl vector_all_ones (Type) Reg)
+(rule (vector_all_ones ty)
+      (let ((wr WritableReg (temp_writable_reg ty))
+            (r Reg (writable_reg_to_reg wr))
+            (_ Unit (emit (MInst.XmmRmR (sse_cmp_op $I32X4)
+                                         r
+                                         (RegMem.Reg r)
+                                         wr))))
+        r))
+
 ;;;; Instruction Constructors ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; These constructors create SSA-style `MInst`s. It is their responsibility to
@@ -596,6 +652,17 @@
                                          wr))))
         r))
 
+;; Special case for zero immediates with vector types: they turn into an xor
+;; specific to the vector type.
+(rule (imm ty @ (multi_lane _bits _lanes) 0)
+      (let ((wr WritableReg (temp_writable_reg ty))
+            (r Reg (writable_reg_to_reg wr))
+            (_ Unit (emit (MInst.XmmRmR (sse_xor_op ty)
+                                         r
+                                         (RegMem.Reg r)
+                                         wr))))
+        r))
+
 ;; Helper for creating `MInst.ShiftR` instructions.
 (decl shift_r (Type ShiftKind Reg Imm8Reg) Reg)
 (rule (shift_r ty kind src1 src2)
@@ -948,6 +1015,11 @@
 (rule (psllq src1 src2)
       (xmm_rmi_reg (SseOpcode.Psllq) src1 src2))
 
+;; Helper for creating `psrld` instructions.
+(decl psrld (Reg RegMemImm) Reg)
+(rule (psrld src1 src2)
+      (xmm_rmi_reg (SseOpcode.Psrld) src1 src2))
+
 ;; Helper for creating `psrlq` instructions.
 (decl psrlq (Reg RegMemImm) Reg)
 (rule (psrlq src1 src2)
@@ -975,3 +1047,25 @@
 (decl mulhi_u (Type Reg RegMem) ValueRegs)
 (rule (mulhi_u ty src1 src2)
       (mul_hi ty $false src1 src2))
+
+;; Helper for creating `cmpps` instructions.
+(decl cmpps (Reg RegMem FcmpImm) Reg)
+(rule (cmpps src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Cmpps)
+                    src1
+                    src2
+                    (encode_fcmp_imm imm)
+                    (OperandSize.Size32)))
+
+;; Helper for creating `cmppd` instructions.
+;;
+;; Note that `Size32` is intentional despite this being used for 64-bit
+;; operations, since this presumably induces the correct encoding of the
+;; instruction.
+(decl cmppd (Reg RegMem FcmpImm) Reg)
+(rule (cmppd src1 src2 imm)
+      (xmm_rm_r_imm (SseOpcode.Cmppd)
+                    src1
+                    src2
+                    (encode_fcmp_imm imm)
+                    (OperandSize.Size32)))
@@ -1391,7 +1391,8 @@ impl fmt::Display for CC {
 /// Encode the ways that floats can be compared. This is used in float comparisons such as `cmpps`;
 /// it is distinguished from other float comparisons (e.g. `ucomiss`) in that those use EFLAGS
 /// whereas [FcmpImm] is used as an immediate.
-pub(crate) enum FcmpImm {
+#[derive(Clone, Copy)]
+pub enum FcmpImm {
     Equal = 0x00,
     LessThan = 0x01,
     LessThanOrEqual = 0x02,

@@ -1301,31 +1301,6 @@ impl Inst {
         }
     }
 
-    /// Choose which instruction to use for comparing two values for equality.
-    pub(crate) fn equals(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
-        match ty {
-            types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to),
-            types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to),
-            types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to),
-            types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to),
-            types::F32X4 => Inst::xmm_rm_r_imm(
-                SseOpcode::Cmpps,
-                from,
-                to,
-                FcmpImm::Equal.encode(),
-                OperandSize::Size32,
-            ),
-            types::F64X2 => Inst::xmm_rm_r_imm(
-                SseOpcode::Cmppd,
-                from,
-                to,
-                FcmpImm::Equal.encode(),
-                OperandSize::Size32,
-            ),
-            _ => unimplemented!("unimplemented type for Inst::equals: {}", ty),
-        }
-    }
-
     /// Choose which instruction to use for computing a bitwise AND on two values.
     pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
@@ -1356,16 +1331,6 @@ impl Inst {
         }
     }
 
-    /// Choose which instruction to use for computing a bitwise XOR on two values.
-    pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
-        match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
-            _ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
-        }
-    }
-
     /// Translate three-operand instructions into a sequence of two-operand
     /// instructions.
     ///

@@ -484,18 +484,8 @@
 
 ;; SSE.
 
-(rule (lower (has_type $F32X4 (bxor x y)))
-      (value_reg (xorps (put_in_reg x)
-                        (put_in_reg_mem y))))
-
-(rule (lower (has_type $F64X2 (bxor x y)))
-      (value_reg (xorpd (put_in_reg x)
-                        (put_in_reg_mem y))))
-
-(rule (lower (has_type (multi_lane _bits _lanes)
-                       (bxor x y)))
-      (value_reg (pxor (put_in_reg x)
-                       (put_in_reg_mem y))))
+(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bxor x y)))
+      (value_reg (sse_xor ty (put_in_reg x) (put_in_reg_mem y))))
 
 ;; `{i,b}128`.
 
@@ -945,3 +935,22 @@
 
 (rule (lower (has_type (multi_lane _bits _lanes) (band_not x y)))
       (value_reg (pandn (put_in_reg y) (put_in_reg_mem x))))
+
+;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Special case for `f32x4.abs`.
+(rule (lower (has_type $F32X4 (fabs x)))
+      (value_reg (andps (put_in_reg x)
+                        (RegMem.Reg (psrld (vector_all_ones $F32X4) (RegMemImm.Imm 1))))))
+
+;; Special case for `f64x2.abs`.
+(rule (lower (has_type $F64X2 (fabs x)))
+      (value_reg (andpd (put_in_reg x)
+                        (RegMem.Reg (psrlq (vector_all_ones $F64X2) (RegMemImm.Imm 1))))))
+
+;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Special case for vector types, where bit-negation is an xor against an
+;; all-ones value.
+(rule (lower (has_type ty @ (multi_lane _bits _lanes) (bnot x)))
+      (value_reg (sse_xor ty (put_in_reg x) (RegMem.Reg (vector_all_ones ty)))))
@@ -1615,14 +1615,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let ty = ty.unwrap();
 
             if ty.is_vector() {
-                let src = put_input_in_reg(ctx, inputs[0]);
-                let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(dst, src, ty));
-                let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-
-                // Set tmp to all 1s before flipping the bits
-                ctx.emit(Inst::equals(types::I32X4, RegMem::from(tmp), tmp));
-                ctx.emit(Inst::xor(ty, RegMem::from(tmp), dst));
+                unreachable!(
+                    "implemented in ISLE: inst = `{}`, type = `{:?}`",
+                    ctx.dfg().display_inst(insn),
+                    ty
+                );
             } else if ty == types::I128 || ty == types::B128 {
                 let src = put_input_in_regs(ctx, inputs[0]);
                 let dst = get_output_reg(ctx, outputs[0]);
@@ -4669,8 +4666,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     // Shift the all 1s constant to generate the mask.
                     let lane_bits = output_ty.lane_bits();
                     let (shift_opcode, opcode, shift_by) = match (op, lane_bits) {
-                        (Opcode::Fabs, 32) => (SseOpcode::Psrld, SseOpcode::Andps, 1),
-                        (Opcode::Fabs, 64) => (SseOpcode::Psrlq, SseOpcode::Andpd, 1),
+                        (Opcode::Fabs, _) => {
+                            unreachable!(
+                                "implemented in ISLE: inst = `{}`, type = `{:?}`",
+                                ctx.dfg().display_inst(insn),
+                                ty
+                            );
+                        }
                         (Opcode::Fneg, 32) => (SseOpcode::Pslld, SseOpcode::Xorps, 31),
                         (Opcode::Fneg, 64) => (SseOpcode::Psllq, SseOpcode::Xorpd, 63),
                         _ => unreachable!(

@@ -13,7 +13,9 @@ use crate::isa::x64::settings as x64_settings;
 use crate::{
     ir::{immediates::*, types::*, Inst, InstructionData, Opcode, Value, ValueList},
     isa::x64::inst::{
-        args::{Avx512Opcode, CmpOpcode, ExtMode, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC},
+        args::{
+            Avx512Opcode, CmpOpcode, ExtMode, FcmpImm, Imm8Reg, RegMem, ShiftKind, SseOpcode, CC,
+        },
         x64_map_regs, RegMapper,
     },
     machinst::{get_output_reg, InsnInput, InsnOutput, LowerCtx},
@@ -313,6 +315,11 @@ where
         RegMem::reg(self.put_in_reg(val))
     }
 
+    #[inline]
+    fn encode_fcmp_imm(&mut self, imm: &FcmpImm) -> u8 {
+        imm.encode()
+    }
+
     #[inline]
     fn avx512vl_enabled(&mut self, _: Type) -> Option<()> {
         if self.isa_flags.use_avx512vl_simd() {