From edb01a29f89165195d80153aaeed68b347f18083 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Mon, 10 May 2021 14:50:41 -0700 Subject: [PATCH 1/3] x64: move multiplication lowering Since the lowering of `imul` complicated the other ALU operations it was matched with and since future commits will alter the multiplication lowering further, this change moves the `imul` lowering to its own match block. --- cranelift/codegen/src/isa/x64/lower.rs | 417 ++++++++++++++----------- 1 file changed, 228 insertions(+), 189 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index a1969d564253..3f62b375a7c7 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1511,7 +1511,6 @@ fn lower_insn_to_regs>( | Opcode::Isub | Opcode::SsubSat | Opcode::UsubSat - | Opcode::Imul | Opcode::AvgRound | Opcode::Band | Opcode::Bor @@ -1553,112 +1552,6 @@ fn lower_insn_to_regs>( types::I16X8 => SseOpcode::Psubusw, _ => panic!("Unsupported type for packed usub_sat instruction: {}", ty), }, - Opcode::Imul => match ty { - types::I16X8 => SseOpcode::Pmullw, - types::I32X4 => SseOpcode::Pmulld, - types::I64X2 => { - // Note for I64X2 we describe a lane A as being composed of a - // 32-bit upper half "Ah" and a 32-bit lower half "Al". - // The 32-bit long hand multiplication can then be written as: - // Ah Al - // * Bh Bl - // ----- - // Al * Bl - // + (Ah * Bl) << 32 - // + (Al * Bh) << 32 - // - // So for each lane we will compute: - // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 - // - // Note, the algorithm will use pmuldq which operates directly on - // the lower 32-bit (Al or Bl) of a lane and writes the result - // to the full 64-bits of the lane of the destination. For this - // reason we don't need shifts to isolate the lower 32-bits, however - // we will need to use shifts to isolate the high 32-bits when doing - // calculations, i.e. 
Ah == A >> 32 - // - // The full sequence then is as follows: - // A' = A - // A' = A' >> 32 - // A' = Ah' * Bl - // B' = B - // B' = B' >> 32 - // B' = Bh' * Al - // B' = B' + A' - // B' = B' << 32 - // A' = A - // A' = Al' * Bl - // A' = A' + B' - // dst = A' - - // Get inputs rhs=A and lhs=B and the dst register - let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = put_input_in_reg(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - // A' = A - let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); - ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); - - // A' = A' >> 32 - // A' = Ah' * Bl - ctx.emit(Inst::xmm_rmi_reg( - SseOpcode::Psrlq, - RegMemImm::imm(32), - rhs_1, - )); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pmuludq, - RegMem::reg(lhs.clone()), - rhs_1, - )); - - // B' = B - let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); - ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); - - // B' = B' >> 32 - // B' = Bh' * Al - ctx.emit(Inst::xmm_rmi_reg( - SseOpcode::Psrlq, - RegMemImm::imm(32), - lhs_1, - )); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); - - // B' = B' + A' - // B' = B' << 32 - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Paddq, - RegMem::reg(rhs_1.to_reg()), - lhs_1, - )); - ctx.emit(Inst::xmm_rmi_reg( - SseOpcode::Psllq, - RegMemImm::imm(32), - lhs_1, - )); - - // A' = A - // A' = Al' * Bl - // A' = A' + B' - // dst = A' - ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pmuludq, - RegMem::reg(lhs.clone()), - rhs_1, - )); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Paddq, - RegMem::reg(lhs_1.to_reg()), - rhs_1, - )); - ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); - return Ok(()); - } - _ => panic!("Unsupported type for packed imul instruction: {}", ty), - }, Opcode::AvgRound => match ty { types::I8X16 => SseOpcode::Pavgb, types::I16X8 => SseOpcode::Pavgw, @@ -1692,8 +1585,6 @@ fn lower_insn_to_regs>( let alu_ops = match op { Opcode::Iadd => (AluRmiROpcode::Add, AluRmiROpcode::Adc), Opcode::Isub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb), - // multiply handled specially below - Opcode::Imul => (AluRmiROpcode::Mul, AluRmiROpcode::Mul), Opcode::Band => (AluRmiROpcode::And, AluRmiROpcode::And), Opcode::Bor => (AluRmiROpcode::Or, AluRmiROpcode::Or), Opcode::Bxor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor), @@ -1706,84 +1597,22 @@ fn lower_insn_to_regs>( assert_eq!(rhs.len(), 2); assert_eq!(dst.len(), 2); - if op != Opcode::Imul { - // add, sub, and, or, xor: just do ops on lower then upper half. Carry-flag - // propagation is implicit (add/adc, sub/sbb). 
- ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); - ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64)); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - alu_ops.0, - RegMemImm::reg(rhs.regs()[0]), - dst.regs()[0], - )); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - alu_ops.1, - RegMemImm::reg(rhs.regs()[1]), - dst.regs()[1], - )); - } else { - // mul: - // dst_lo = lhs_lo * rhs_lo - // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo - // - // so we emit: - // mov dst_lo, lhs_lo - // mul dst_lo, rhs_lo - // mov dst_hi, lhs_lo - // mul dst_hi, rhs_hi - // mov tmp, lhs_hi - // mul tmp, rhs_lo - // add dst_hi, tmp - // mov rax, lhs_lo - // umulhi rhs_lo // implicit rax arg/dst - // add dst_hi, rax - let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::reg(rhs.regs()[0]), - dst.regs()[0], - )); - ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64)); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::reg(rhs.regs()[1]), - dst.regs()[1], - )); - ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64)); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Mul, - RegMemImm::reg(rhs.regs()[0]), - tmp, - )); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Add, - RegMemImm::reg(tmp.to_reg()), - dst.regs()[1], - )); - ctx.emit(Inst::gen_move( - Writable::from_reg(regs::rax()), - lhs.regs()[0], - types::I64, - )); - ctx.emit(Inst::mul_hi( - OperandSize::Size64, - /* signed = */ false, - RegMem::reg(rhs.regs()[0]), - )); - ctx.emit(Inst::alu_rmi_r( - OperandSize::Size64, - AluRmiROpcode::Add, - RegMemImm::reg(regs::rdx()), - dst.regs()[1], - )); - } + // For add, sub, and, or, xor: just do ops on lower then upper + // half. Carry-flag propagation is implicit (add/adc, sub/sbb). + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + alu_ops.0, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + alu_ops.1, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); } else { let size = if ty == types::I64 { OperandSize::Size64 @@ -1793,7 +1622,6 @@ fn lower_insn_to_regs>( let alu_op = match op { Opcode::Iadd | Opcode::IaddIfcout => AluRmiROpcode::Add, Opcode::Isub => AluRmiROpcode::Sub, - Opcode::Imul => AluRmiROpcode::Mul, Opcode::Band => AluRmiROpcode::And, Opcode::Bor => AluRmiROpcode::Or, Opcode::Bxor => AluRmiROpcode::Xor, @@ -1803,7 +1631,6 @@ fn lower_insn_to_regs>( let (lhs, rhs) = match op { Opcode::Iadd | Opcode::IaddIfcout - | Opcode::Imul | Opcode::Band | Opcode::Bor | Opcode::Bxor => { @@ -1833,6 +1660,218 @@ fn lower_insn_to_regs>( } } + Opcode::Imul => { + let ty = ty.unwrap(); + if ty == types::I64X2 { + // For I64X2 multiplication we describe a lane A as being + // composed of a 32-bit upper half "Ah" and a 32-bit lower half + // "Al". 
The 32-bit long hand multiplication can then be written + // as: + // Ah Al + // * Bh Bl + // ----- + // Al * Bl + // + (Ah * Bl) << 32 + // + (Al * Bh) << 32 + // + // So for each lane we will compute: + // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 + // + // Note, the algorithm will use pmuldq which operates directly + // on the lower 32-bit (Al or Bl) of a lane and writes the + // result to the full 64-bits of the lane of the destination. + // For this reason we don't need shifts to isolate the lower + // 32-bits, however, we will need to use shifts to isolate the + // high 32-bits when doing calculations, i.e., Ah == A >> 32. + // + // The full sequence then is as follows: + // A' = A + // A' = A' >> 32 + // A' = Ah' * Bl + // B' = B + // B' = B' >> 32 + // B' = Bh' * Al + // B' = B' + A' + // B' = B' << 32 + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + + // Get inputs rhs=A and lhs=B and the dst register + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = put_input_in_reg(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + // A' = A + let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + + // A' = A' >> 32 + // A' = Ah' * Bl + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + + // B' = B + let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); + ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); + + // B' = B' >> 32 + // B' = Bh' * Al + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + lhs_1, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + + // B' = B' + A' + // B' = B' << 32 + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(rhs_1.to_reg()), + lhs_1, + )); + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psllq, + RegMemImm::imm(32), + lhs_1, + )); + + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(lhs_1.to_reg()), + rhs_1, + )); + ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + } else if ty.lane_count() > 1 { + // Emit single instruction lowerings for the remaining vector + // multiplications. + let sse_op = match ty { + types::I16X8 => SseOpcode::Pmullw, + types::I32X4 => SseOpcode::Pmulld, + _ => panic!("Unsupported type for packed imul instruction: {}", ty), + }; + let lhs = put_input_in_reg(ctx, inputs[0]); + let rhs = input_to_reg_mem(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + + // Move the `lhs` to the same register as `dst`. + ctx.emit(Inst::gen_move(dst, lhs, ty)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + } else if ty == types::I128 || ty == types::B128 { + // Handle 128-bit multiplications. 
+ let lhs = put_input_in_regs(ctx, inputs[0]); + let rhs = put_input_in_regs(ctx, inputs[1]); + let dst = get_output_reg(ctx, outputs[0]); + assert_eq!(lhs.len(), 2); + assert_eq!(rhs.len(), 2); + assert_eq!(dst.len(), 2); + + // mul: + // dst_lo = lhs_lo * rhs_lo + // dst_hi = umulhi(lhs_lo, rhs_lo) + lhs_lo * rhs_hi + lhs_hi * rhs_lo + // + // so we emit: + // mov dst_lo, lhs_lo + // mul dst_lo, rhs_lo + // mov dst_hi, lhs_lo + // mul dst_hi, rhs_hi + // mov tmp, lhs_hi + // mul tmp, rhs_lo + // add dst_hi, tmp + // mov rax, lhs_lo + // umulhi rhs_lo // implicit rax arg/dst + // add dst_hi, rax + let tmp = ctx.alloc_tmp(types::I64).only_reg().unwrap(); + ctx.emit(Inst::gen_move(dst.regs()[0], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + dst.regs()[0], + )); + ctx.emit(Inst::gen_move(dst.regs()[1], lhs.regs()[0], types::I64)); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[1]), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move(tmp, lhs.regs()[1], types::I64)); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Mul, + RegMemImm::reg(rhs.regs()[0]), + tmp, + )); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Add, + RegMemImm::reg(tmp.to_reg()), + dst.regs()[1], + )); + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::rax()), + lhs.regs()[0], + types::I64, + )); + ctx.emit(Inst::mul_hi( + OperandSize::Size64, + /* signed = */ false, + RegMem::reg(rhs.regs()[0]), + )); + ctx.emit(Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Add, + RegMemImm::reg(regs::rdx()), + dst.regs()[1], + )); + } else { + let size = if ty == types::I64 { + OperandSize::Size64 + } else { + OperandSize::Size32 + }; + let alu_op = AluRmiROpcode::Mul; + + // For commutative operations, try to commute operands if one is + // an immediate or direct memory reference. Do so by converting + // LHS to RMI; if reg, then always convert RHS to RMI; else, use + // LHS as RMI and convert RHS to reg. 
+ let lhs = input_to_reg_mem_imm(ctx, inputs[0]); + let (lhs, rhs) = if let RegMemImm::Reg { reg: lhs_reg } = lhs { + let rhs = input_to_reg_mem_imm(ctx, inputs[1]); + (lhs_reg, rhs) + } else { + let rhs_reg = put_input_in_reg(ctx, inputs[1]); + (rhs_reg, lhs) + }; + + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + ctx.emit(Inst::mov_r_r(OperandSize::Size64, lhs, dst)); + ctx.emit(Inst::alu_rmi_r(size, alu_op, rhs, dst)); + } + } + Opcode::BandNot => { let ty = ty.unwrap(); debug_assert!(ty.is_vector() && ty.bytes() == 16); From 11457db285df4cf902f3da3acc8edb04107e764d Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Mon, 10 May 2021 15:29:26 -0700 Subject: [PATCH 2/3] x64: improve arithmetic filetests --- .../isa/x64/simd-arithmetic-run.clif | 280 ++++++------------ 1 file changed, 95 insertions(+), 185 deletions(-) diff --git a/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif index 916f80f278f9..552e34fc15af 100644 --- a/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif +++ b/cranelift/filetests/filetests/isa/x64/simd-arithmetic-run.clif @@ -18,36 +18,19 @@ block0: } ; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] -function %isub_i32x4_rex() -> b1 { -block0: - v0 = vconst.i32x4 [1 1 1 1] - v1 = vconst.i32x4 [1 2 3 4] +function %isub_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): v2 = isub v0, v1 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0 - - v5 = extractlane v2, 1 - v6 = icmp_imm eq v5, 0xffffffff - ; TODO replace extractlanes with vector comparison - - v7 = band v4, v6 - return v7 + return v2 } -; run - +; run: %isub_i32x4([1 1 1 1], [1 2 3 4]) == [0 -1 -2 -3] -function %ineg_i32x4() -> b1 { -block0: - v0 = vconst.i32x4 [1 1 1 1] - v2 = ineg v0 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, -1 - - return v4 +function %ineg_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = ineg v0 + return v1 } -; run +; run: %ineg_i32x4([1 1 1 1]) == [-1 -1 -1 -1] function %imul_i64x2(i64x2, i64x2) -> i64x2 { block0(v0: i64x2, v1: i64x2): @@ -56,141 +39,93 @@ block0(v0: i64x2, v1: i64x2): } ; run: %imul_i64x2([0 2], [0 2]) == [0 4] -function %imul_i32x4() -> b1 { -block0: - v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01] - v1 = vconst.i32x4 [2 2 2 2] +function %imul_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): v2 = imul v0, v1 - - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, -2 - - v5 = extractlane v2, 1 - v6 = icmp_imm eq v5, 0 - - v7 = extractlane v2, 3 - v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped) - - v9 = band v4, v6 - v10 = band v8, v9 - return v10 + return v2 } -; run +; run: %imul_i32x4([-1 0 1 0x80_00_00_01], [2 2 2 2]) == [-2 0 2 2] +; Note above how bits are truncated: 0x80_00_00_01 * 2 == 0x1_00_00_00_02, but +; the leading 1 is dropped. 
-function %imul_i16x8() -> b1 { -block0: - v0 = vconst.i16x8 [-1 0 1 0x7f_ff 0 0 0 0] - v1 = vconst.i16x8 [2 2 2 2 0 0 0 0] +function %imul_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): v2 = imul v0, v1 + return v2 +} +; run: %imul_i16x8([-1 0 1 0x7f_ff 0 0 0 0], [2 2 2 2 0 0 0 0]) == [-2 0 2 0xff_fe 0 0 0 0] - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0xfffe ; 0xfffe == -2; -2 will not work here and below because v3 is - ; being uextend-ed, not sextend-ed - - v5 = extractlane v2, 1 - v6 = icmp_imm eq v5, 0 - - v7 = extractlane v2, 3 - v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe - - v9 = band v4, v6 - v10 = band v8, v9 - - return v4 +function %sadd_sat_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = sadd_sat v0, v1 + return v2 } -; run +; run: %sadd_sat_i8x16([0x7f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x7f 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] -function %sadd_sat_i8x16() -> b1 { -block0: - v0 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] - v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] +function %uadd_sat_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = uadd_sat v0, v1 + return v2 +} +; run: %uadd_sat_i16x8([-1 0 0 0 0 0 0 0], [-1 1 1 1 1 1 1 1]) == [65535 1 1 1 1 1 1 1] - v2 = sadd_sat v0, v1 - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 127 +function %ssub_sat_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = ssub_sat v0, v1 + return v2 +} +; run: %ssub_sat_i8x16([0x80 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x80 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff] +; Note that 0x80 == -128 and subtracting 1 from that should saturate. - return v4 +function %usub_sat_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = usub_sat v0, v1 + return v2 } -; run +; run: %usub_sat_i8x16([0x80 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x7f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] -function %uadd_sat_i16x8() -> b1 { +function %add_sub_f32x4() -> b1 { block0: - v0 = vconst.i16x8 [-1 0 0 0 0 0 0 0] - v1 = vconst.i16x8 [-1 1 1 1 1 1 1 1] + v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0] + v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0] + v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0] - v2 = uadd_sat v0, v1 - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 65535 + v3 = fadd v0, v1 + v4 = fcmp eq v3, v2 + + v6 = fsub v2, v1 + v7 = fcmp eq v6, v0 - return v4 + v8 = band v4, v7 + v9 = vall_true v8 + return v9 } ; run -function %sub_sat_i8x16() -> b1 { +function %mul_div_f32x4() -> b1 { block0: - v0 = vconst.i8x16 [128 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] ; 128 == 0x80 == -128 - v1 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] + v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0] + v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b] + v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0] - v2 = ssub_sat v0, v1 - v3 = extractlane v2, 0 - v4 = icmp_imm eq v3, 0x80 ; 0x80 == -128 + v3 = fmul v0, v1 + v4 = fcmp eq v3, v2 - ; now re-use 0x80 as an unsigned 128 - v5 = usub_sat v0, v2 - v6 = extractlane v5, 0 - v7 = icmp_imm eq v6, 0 + v6 = fdiv v2, v1 + v7 = fcmp eq v6, v0 v8 = band v4, v7 - return v8 + v9 = vall_true v8 + return v9 } ; run -;function %add_sub_f32x4() -> b1 { -;block0: -; v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0] -; v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0] -; v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0] -; -; v3 = fadd v0, v1 -; v4 = fcmp eq v3, v2 -; -; v6 = fsub v2, v1 -; v7 = fcmp eq v6, v0 -; -; v8 = band v4, v7 -; v9 = 
vall_true v8 -; return v9 -;} -; _run - -;function %mul_div_f32x4() -> b1 { -;block0: -; v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0] -; v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b] -; v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0] -; -; v3 = fmul v0, v1 -; v4 = fcmp eq v3, v2 -; -; v6 = fdiv v2, v1 -; v7 = fcmp eq v6, v0 -; -; v8 = band v4, v7 -; v9 = vall_true v8 -; return v9 -;} -; _run - -;function %sqrt_f64x2() -> b1 { -;block0: -; v0 = vconst.f64x2 [0x9.0 0x1.0] -; v1 = sqrt v0 -; v2 = vconst.f64x2 [0x3.0 0x1.0] -; v3 = fcmp eq v2, v1 -; v4 = vall_true v3 -; return v4 -;} -; _run +function %sqrt_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = sqrt v0 + return v1 +} +; run: %sqrt_f64x2([0x9.0 0x1.0]) == [0x3.0 0x1.0] function %fmax_f64x2(f64x2, f64x2) -> f64x2 { block0(v0: f64x2, v1: f64x2): @@ -218,58 +153,33 @@ block0(v0: f64x2, v1: f64x2): ; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0] ; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0] -;function %fneg_f64x2() -> b1 { -;block0: -; v0 = vconst.f64x2 [0x1.0 -0x1.0] -; v1 = fneg v0 -; -; v2 = vconst.f64x2 [-0x1.0 0x1.0] -; v3 = fcmp eq v1, v2 -; v4 = vall_true v3 -; -; return v4 -;} -; _run +function %fneg_f64x2(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = fneg v0 + return v1 +} +; run: %fneg_f64x2([0x1.0 -0x1.0]) == [-0x1.0 0x1.0] -;function %fneg_f32x4() -> b1 { -;block0: -; v0 = vconst.f32x4 [0x0.0 -0x0.0 -Inf Inf] -; v1 = fneg v0 -; -; v2 = vconst.f32x4 [-0x0.0 0x0.0 Inf -Inf] -; v3 = fcmp eq v1, v2 -; v4 = vall_true v3 -; -; return v4 -;} -; _run +function %fneg_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fneg v0 + return v1 +} +; run: %fneg_f32x4([0x0.0 -0x0.0 -Inf Inf]) == [-0x0.0 0x0.0 Inf -Inf] -;function %fabs_f32x4() -> b1 { -;block0: -; v0 = vconst.f32x4 [0x0.0 -0x1.0 0x2.0 -0x3.0] -; v1 = fabs v0 -; -; v2 = vconst.f32x4 [0x0.0 0x1.0 0x2.0 0x3.0] -; v3 = fcmp eq v1, v2 -; v4 = vall_true v3 -; -; return v4 -;} -; _run +function %fabs_f32x4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = fabs v0 + return v1 +} +; run: %fabs_f32x4([0x0.0 -0x1.0 0x2.0 -0x3.0]) == [0x0.0 0x1.0 0x2.0 0x3.0] -;function %average_rounding_i16x8() -> b1 { -;block0: -; v0 = vconst.i16x8 [0 0 0 1 42 19 -1 0xffff] -; v1 = vconst.i16x8 [0 1 2 4 42 18 -1 0] -; v2 = vconst.i16x8 [0 1 1 3 42 19 -1 0x8000] -; -; v3 = avg_round v0, v1 -; v4 = icmp eq v2, v3 -; v5 = vall_true v4 -; -; return v5 -;} -; _run +function %average_rounding_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = avg_round v0, v1 + return v2 +} +; run: %average_rounding_i16x8([0 0 0 1 42 19 -1 0xffff], [0 1 2 4 42 18 -1 0]) == [0 1 1 3 42 19 -1 0x8000] function %iabs(i32x4) -> i32x4 { block0(v0: i32x4): From fff978daba87813acbfb0bc90bc03ca56f9ae48f Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Mon, 10 May 2021 16:25:03 -0700 Subject: [PATCH 3/3] x64: lower i64x2.imul to VPMULLQ when possible This adds the machinery to encode the VPMULLQ instruction which is available in AVX512VL and AVX512DQ. When these feature sets are available, we use this instruction instead of a lengthy 12-instruction sequence. 
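
For reference, the replaced SSE sequence looks roughly like the
following (register assignments are illustrative only; the register
allocator chooses the actual registers), with input A in %xmm0, input B
in %xmm1, and %xmm2/%xmm3 as temporaries:

    movdqa  %xmm0, %xmm2    # A' = A
    psrlq   $32, %xmm2      # A' = A' >> 32
    pmuludq %xmm1, %xmm2    # A' = Ah' * Bl
    movdqa  %xmm1, %xmm3    # B' = B
    psrlq   $32, %xmm3      # B' = B' >> 32
    pmuludq %xmm0, %xmm3    # B' = Bh' * Al
    paddq   %xmm2, %xmm3    # B' = B' + A'
    psllq   $32, %xmm3      # B' = B' << 32
    movdqa  %xmm0, %xmm2    # A' = A
    pmuludq %xmm1, %xmm2    # A' = Al * Bl
    paddq   %xmm3, %xmm2    # A' = A' + B'
    movdqa  %xmm2, %xmm0    # dst = A'

With AVX512VL and AVX512DQ the whole sequence collapses into a single
instruction:

    vpmullq %xmm1, %xmm0, %xmm0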
--- cranelift/codegen/src/isa/x64/inst/args.rs | 4 + cranelift/codegen/src/isa/x64/inst/emit.rs | 27 +++ .../codegen/src/isa/x64/inst/emit_tests.rs | 7 + cranelift/codegen/src/isa/x64/inst/mod.rs | 57 +++++- cranelift/codegen/src/isa/x64/lower.rs | 193 +++++++++--------- 5 files changed, 196 insertions(+), 92 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index b54f1b6126fe..6e0d507ab05b 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -462,6 +462,7 @@ pub(crate) enum InstructionSet { BMI2, AVX512F, AVX512VL, + AVX512DQ, } /// Some SSE operations requiring 2 operands r/m and r. @@ -994,6 +995,7 @@ impl fmt::Display for SseOpcode { #[derive(Clone)] pub enum Avx512Opcode { Vpabsq, + Vpmullq, } impl Avx512Opcode { @@ -1001,6 +1003,7 @@ impl Avx512Opcode { pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { match self { Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL], + Avx512Opcode::Vpmullq => smallvec![InstructionSet::AVX512VL, InstructionSet::AVX512DQ], } } } @@ -1009,6 +1012,7 @@ impl fmt::Debug for Avx512Opcode { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { let name = match self { Avx512Opcode::Vpabsq => "vpabsq", + Avx512Opcode::Vpmullq => "vpmullq", }; write!(fmt, "{}", name) } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 0bd74ecd8ba4..134d6eafa197 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -128,6 +128,7 @@ pub(crate) fn emit( InstructionSet::BMI2 => info.isa_flags.has_bmi2(), InstructionSet::AVX512F => info.isa_flags.has_avx512f(), InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(), + InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(), } }; @@ -1409,6 +1410,7 @@ pub(crate) fn emit( Inst::XmmUnaryRmREvex { op, src, dst } => { let opcode = match op { Avx512Opcode::Vpabsq => 0x1f, + _ => unimplemented!("Opcode {:?} not implemented", op), }; match src { RegMem::Reg { reg: src } => EvexInstruction::new() @@ -1545,6 +1547,31 @@ pub(crate) fn emit( } } + Inst::XmmRmREvex { + op, + src1, + src2, + dst, + } => { + let opcode = match op { + Avx512Opcode::Vpmullq => 0x40, + _ => unimplemented!("Opcode {:?} not implemented", op), + }; + match src1 { + RegMem::Reg { reg: src } => EvexInstruction::new() + .length(EvexVectorLength::V128) + .prefix(LegacyPrefixes::_66) + .map(OpcodeMap::_0F38) + .w(true) + .opcode(opcode) + .reg(dst.to_reg().get_hw_encoding()) + .rm(src.get_hw_encoding()) + .vvvvv(src2.get_hw_encoding()) + .encode(sink), + _ => todo!(), + }; + } + Inst::XmmMinMaxSeq { size, is_min, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index f03762b97bab..1d0dd4aba5df 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3555,6 +3555,12 @@ fn test_x64_emit() { "pmullw %xmm14, %xmm1", )); + insns.push(( + Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, RegMem::reg(xmm14), xmm10, w_xmm1), + "62D2AD0840CE", + "vpmullq %xmm14, %xmm10, %xmm1", + )); + insns.push(( Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), "66450FF4C8", @@ -4283,6 +4289,7 @@ fn test_x64_emit() { isa_flag_builder.enable("has_ssse3").unwrap(); isa_flag_builder.enable("has_sse41").unwrap(); isa_flag_builder.enable("has_avx512f").unwrap(); + 
isa_flag_builder.enable("has_avx512dq").unwrap(); let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); let rru = regs::create_reg_universe_systemv(&flags); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index fe89ac4c9009..547d8413cbfe 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -212,6 +212,13 @@ pub enum Inst { dst: Writable, }, + XmmRmREvex { + op: Avx512Opcode, + src1: RegMem, + src2: Reg, + dst: Writable, + }, + /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, /// etc. /// @@ -577,7 +584,7 @@ impl Inst { | Inst::XmmToGpr { op, .. } | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()], - Inst::XmmUnaryRmREvex { op, .. } => op.available_from(), + Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(), } } } @@ -724,6 +731,23 @@ impl Inst { Inst::XmmRmR { op, src, dst } } + pub(crate) fn xmm_rm_r_evex( + op: Avx512Opcode, + src1: RegMem, + src2: Reg, + dst: Writable, + ) -> Self { + src1.assert_regclass_is(RegClass::V128); + debug_assert!(src2.get_class() == RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmRmREvex { + op, + src1, + src2, + dst, + } + } + pub(crate) fn xmm_uninit_value(dst: Writable) -> Self { debug_assert!(dst.to_reg().get_class() == RegClass::V128); Inst::XmmUninitializedValue { dst } @@ -1425,6 +1449,20 @@ impl PrettyPrint for Inst { show_ireg_sized(dst.to_reg(), mb_rru, 8), ), + Inst::XmmRmREvex { + op, + src1, + src2, + dst, + .. + } => format!( + "{} {}, {}, {}", + ljustify(op.to_string()), + src1.show_rru_sized(mb_rru, 8), + show_ireg_sized(*src2, mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + Inst::XmmMinMaxSeq { lhs, rhs_dst, @@ -1898,6 +1936,13 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_mod(*dst); } } + Inst::XmmRmREvex { + src1, src2, dst, .. + } => { + src1.get_regs_as_uses(collector); + collector.add_use(*src2); + collector.add_def(*dst); + } Inst::XmmRmRImm { op, src, dst, .. } => { if inst.produces_const() { // No need to account for src, since src == dst. @@ -2283,6 +2328,16 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { map_mod(mapper, dst); } } + Inst::XmmRmREvex { + ref mut src1, + ref mut src2, + ref mut dst, + .. + } => { + src1.map_uses(mapper); + map_use(mapper, src2); + map_def(mapper, dst); + } Inst::XmmRmiReg { ref mut src, ref mut dst, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 3f62b375a7c7..9c77e879f286 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1663,105 +1663,116 @@ fn lower_insn_to_regs>( Opcode::Imul => { let ty = ty.unwrap(); if ty == types::I64X2 { - // For I64X2 multiplication we describe a lane A as being - // composed of a 32-bit upper half "Ah" and a 32-bit lower half - // "Al". The 32-bit long hand multiplication can then be written - // as: - // Ah Al - // * Bh Bl - // ----- - // Al * Bl - // + (Ah * Bl) << 32 - // + (Al * Bh) << 32 - // - // So for each lane we will compute: - // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 - // - // Note, the algorithm will use pmuldq which operates directly - // on the lower 32-bit (Al or Bl) of a lane and writes the - // result to the full 64-bits of the lane of the destination. 
-                // For this reason we don't need shifts to isolate the lower
-                // 32-bits, however, we will need to use shifts to isolate the
-                // high 32-bits when doing calculations, i.e., Ah == A >> 32.
-                //
-                // The full sequence then is as follows:
-                // A' = A
-                // A' = A' >> 32
-                // A' = Ah' * Bl
-                // B' = B
-                // B' = B' >> 32
-                // B' = Bh' * Al
-                // B' = B' + A'
-                // B' = B' << 32
-                // A' = A
-                // A' = Al' * Bl
-                // A' = A' + B'
-                // dst = A'
-
-                // Get inputs rhs=A and lhs=B and the dst register
+                // Eventually one of these should be `input_to_reg_mem` (TODO).
                 let lhs = put_input_in_reg(ctx, inputs[0]);
                 let rhs = put_input_in_reg(ctx, inputs[1]);
                 let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
 
-                // A' = A
-                let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap();
-                ctx.emit(Inst::gen_move(rhs_1, rhs, ty));
-
-                // A' = A' >> 32
-                // A' = Ah' * Bl
-                ctx.emit(Inst::xmm_rmi_reg(
-                    SseOpcode::Psrlq,
-                    RegMemImm::imm(32),
-                    rhs_1,
-                ));
-                ctx.emit(Inst::xmm_rm_r(
-                    SseOpcode::Pmuludq,
-                    RegMem::reg(lhs.clone()),
-                    rhs_1,
-                ));
+                if isa_flags.use_avx512vl_simd() && isa_flags.use_avx512dq_simd() {
+                    // With the right AVX512 features (VL, DQ) this operation
+                    // can lower to a single instruction.
+                    ctx.emit(Inst::xmm_rm_r_evex(
+                        Avx512Opcode::Vpmullq,
+                        RegMem::reg(rhs),
+                        lhs,
+                        dst,
+                    ));
+                } else {
+                    // Otherwise, for I64X2 multiplication we describe a lane A as being
+                    // composed of a 32-bit upper half "Ah" and a 32-bit lower half
+                    // "Al". The 32-bit longhand multiplication can then be written
+                    // as:
+                    //    Ah Al
+                    // *  Bh Bl
+                    //    -----
+                    //    Al * Bl
+                    // + (Ah * Bl) << 32
+                    // + (Al * Bh) << 32
+                    //
+                    // So for each lane we will compute:
+                    // A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
+                    //
+                    // Note, the algorithm will use pmuludq, which operates directly
+                    // on the lower 32 bits (Al or Bl) of a lane and writes the
+                    // result to the full 64 bits of the lane of the destination.
+                    // For this reason we don't need shifts to isolate the lower
+                    // 32 bits; however, we will need to use shifts to isolate the
+                    // high 32 bits when doing calculations, i.e., Ah == A >> 32.
+ // + // The full sequence then is as follows: + // A' = A + // A' = A' >> 32 + // A' = Ah' * Bl + // B' = B + // B' = B' >> 32 + // B' = Bh' * Al + // B' = B' + A' + // B' = B' << 32 + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + + // A' = A + let rhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + + // A' = A' >> 32 + // A' = Ah' * Bl + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); - // B' = B - let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); - ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); + // B' = B + let lhs_1 = ctx.alloc_tmp(types::I64X2).only_reg().unwrap(); + ctx.emit(Inst::gen_move(lhs_1, lhs, ty)); - // B' = B' >> 32 - // B' = Bh' * Al - ctx.emit(Inst::xmm_rmi_reg( - SseOpcode::Psrlq, - RegMemImm::imm(32), - lhs_1, - )); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + // B' = B' >> 32 + // B' = Bh' * Al + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psrlq, + RegMemImm::imm(32), + lhs_1, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); - // B' = B' + A' - // B' = B' << 32 - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Paddq, - RegMem::reg(rhs_1.to_reg()), - lhs_1, - )); - ctx.emit(Inst::xmm_rmi_reg( - SseOpcode::Psllq, - RegMemImm::imm(32), - lhs_1, - )); + // B' = B' + A' + // B' = B' << 32 + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(rhs_1.to_reg()), + lhs_1, + )); + ctx.emit(Inst::xmm_rmi_reg( + SseOpcode::Psllq, + RegMemImm::imm(32), + lhs_1, + )); - // A' = A - // A' = Al' * Bl - // A' = A' + B' - // dst = A' - ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pmuludq, - RegMem::reg(lhs.clone()), - rhs_1, - )); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Paddq, - RegMem::reg(lhs_1.to_reg()), - rhs_1, - )); - ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + // A' = A + // A' = Al' * Bl + // A' = A' + B' + // dst = A' + ctx.emit(Inst::gen_move(rhs_1, rhs, ty)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(lhs.clone()), + rhs_1, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddq, + RegMem::reg(lhs_1.to_reg()), + rhs_1, + )); + ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); + } } else if ty.lane_count() > 1 { // Emit single instruction lowerings for the remaining vector // multiplications.