Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,7 @@
Pshuflw
Pshufhw
Pblendw
Movddup
))

(type CmpOpcode extern
Expand Down Expand Up @@ -1292,6 +1293,11 @@
Vpextrd
Vpextrq
Vpblendw
Vmovddup
Vpbroadcastb
Vpbroadcastw
Vpbroadcastd
Vbroadcastss
))

(type Avx512Opcode extern
Expand Down Expand Up @@ -1622,6 +1628,9 @@
(decl pure has_avx () bool)
(extern constructor has_avx has_avx)

(decl pure has_avx2 () bool)
(extern constructor has_avx2 has_avx2)

;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;

;; Extract a constant `Imm8Reg.Imm8` from a value operand.
Expand Down Expand Up @@ -1656,9 +1665,21 @@

;; Extract a `SinkableLoad` that works with `RegMemImm.Mem` from a value
;; operand.
;;
;; Note that this will only work for 32-bit-types-or-larger since this is
;; pervasively used with operations that load a minimum of 32-bits. For
;; instructions which load exactly the type width necessary use
;; `sinkable_load_exact`.
(decl sinkable_load (SinkableLoad) Value)
(extern extractor sinkable_load sinkable_load)

;; Same as `sinkable_load` except that all type widths of loads are supported.
;;
;; Only use this when the instruction which performs the load is guaranteed to
;; load the precisely correct size.
(decl sinkable_load_exact (SinkableLoad) Value)
(extern extractor sinkable_load_exact sinkable_load_exact)

;; Sink a `SinkableLoad` into a `SyntheticAmode`.
;;
;; This is a side-effectful operation that notifies the context that the
Expand All @@ -1678,6 +1699,9 @@
(decl sink_load_to_reg_mem (SinkableLoad) RegMem)
(rule (sink_load_to_reg_mem load) (RegMem.Mem load))

;; Sink a `SinkableLoad` into a `GprMem`.
;;
;; Same as `sink_load_to_reg_mem` but typed for instructions that take a
;; `GprMem` operand; wired up via the `SinkableLoad` -> `GprMem` converter.
(decl sink_load_to_gpr_mem (SinkableLoad) GprMem)
(rule (sink_load_to_gpr_mem load) (RegMem.Mem load))

(decl sink_load_to_reg_mem_imm (SinkableLoad) RegMemImm)
(rule (sink_load_to_reg_mem_imm load) (RegMemImm.Mem load))

Expand Down Expand Up @@ -4103,6 +4127,34 @@
(rule (trap_if_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) tc)
(with_flags_side_effect producer (trap_if_or cc1 cc2 tc)))

;; Helper for creating `movddup` instructions
;;
;; `movddup` broadcasts the low 64-bit lane of the source to both lanes of
;; the destination, which is exactly the shape of a 64x2 splat (see the
;; `splat` lowerings below).
(decl x64_movddup (XmmMem) Xmm)
;; Base case: the SSE encoding. The `_unaligned` emission helper is used
;; since a memory operand here presumably has no alignment requirement —
;; NOTE(review): confirm against the emission of `SseOpcode.Movddup`.
(rule (x64_movddup src)
(xmm_unary_rm_r_unaligned (SseOpcode.Movddup) src))
;; Higher-priority case: prefer the VEX-encoded `vmovddup` when AVX is
;; available.
(rule 1 (x64_movddup src)
(if-let $true (has_avx))
(xmm_unary_rm_r_vex (AvxOpcode.Vmovddup) src))

;; Helper for creating `vpbroadcastb` instructions
;;
;; Note that `vpbroadcastb` requires AVX2, so callers are expected to have
;; already checked `has_avx2` before using this helper.
(decl x64_vpbroadcastb (XmmMem) Xmm)
(rule (x64_vpbroadcastb src)
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastb) src))

;; Helper for creating `vpbroadcastw` instructions
;;
;; Note that `vpbroadcastw` requires AVX2, so callers are expected to have
;; already checked `has_avx2` before using this helper.
(decl x64_vpbroadcastw (XmmMem) Xmm)
(rule (x64_vpbroadcastw src)
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastw) src))

;; Helper for creating `vpbroadcastd` instructions
;;
;; Note that `vpbroadcastd` requires AVX2, so callers are expected to have
;; already checked `has_avx2` before using this helper.
(decl x64_vpbroadcastd (XmmMem) Xmm)
(rule (x64_vpbroadcastd src)
(xmm_unary_rm_r_vex (AvxOpcode.Vpbroadcastd) src))

;; Helper for creating `vbroadcastss` instructions
;;
;; Note that the memory-operand encoding of `vbroadcastss` is available with
;; AVX, but the register-operand encoding requires AVX2; callers must check
;; the appropriate feature for the operand form they pass in (see the
;; 32x4 `splat` lowerings for both uses).
(decl x64_vbroadcastss (XmmMem) Xmm)
(rule (x64_vbroadcastss src)
(xmm_unary_rm_r_vex (AvxOpcode.Vbroadcastss) src))

;;;; Jumps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unconditional jump.
Expand Down Expand Up @@ -4664,6 +4716,7 @@
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)

(convert SinkableLoad RegMem sink_load_to_reg_mem)
(convert SinkableLoad GprMem sink_load_to_gpr_mem)
(convert SinkableLoad RegMemImm sink_load_to_reg_mem_imm)
(convert SinkableLoad GprMemImm sink_load_to_gpr_mem_imm)
(convert SinkableLoad XmmMem sink_load_to_xmm_mem)
Expand Down
14 changes: 12 additions & 2 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,7 @@ pub(crate) enum InstructionSet {
BMI2,
FMA,
AVX,
AVX2,
AVX512BITALG,
AVX512DQ,
AVX512F,
Expand Down Expand Up @@ -1126,6 +1127,7 @@ pub enum SseOpcode {
Pshuflw,
Pshufhw,
Pblendw,
Movddup,
}

impl SseOpcode {
Expand Down Expand Up @@ -1280,7 +1282,8 @@ impl SseOpcode {
| SseOpcode::Pmulhrsw
| SseOpcode::Pshufb
| SseOpcode::Phaddw
| SseOpcode::Phaddd => SSSE3,
| SseOpcode::Phaddd
| SseOpcode::Movddup => SSSE3,

SseOpcode::Blendvpd
| SseOpcode::Blendvps
Expand Down Expand Up @@ -1524,6 +1527,7 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Pshuflw => "pshuflw",
SseOpcode::Pshufhw => "pshufhw",
SseOpcode::Pblendw => "pblendw",
SseOpcode::Movddup => "movddup",
};
write!(fmt, "{}", name)
}
Expand Down Expand Up @@ -1709,9 +1713,15 @@ impl AvxOpcode {
| AvxOpcode::Vpextrw
| AvxOpcode::Vpextrd
| AvxOpcode::Vpextrq
| AvxOpcode::Vpblendw => {
| AvxOpcode::Vpblendw
| AvxOpcode::Vmovddup
| AvxOpcode::Vbroadcastss => {
smallvec![InstructionSet::AVX]
}

AvxOpcode::Vpbroadcastb | AvxOpcode::Vpbroadcastw | AvxOpcode::Vpbroadcastd => {
smallvec![InstructionSet::AVX2]
}
}
}
}
Expand Down
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ pub(crate) fn emit(
InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
InstructionSet::FMA => info.isa_flags.has_fma(),
InstructionSet::AVX => info.isa_flags.has_avx(),
InstructionSet::AVX2 => info.isa_flags.has_avx2(),
InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
Expand Down Expand Up @@ -1826,6 +1827,7 @@ pub(crate) fn emit(
SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};

Expand Down Expand Up @@ -2450,6 +2452,13 @@ pub(crate) fn emit(
RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10),
_ => unreachable!(),
},

AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78),
AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79),
AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58),
AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),

_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

Expand Down
124 changes: 83 additions & 41 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3915,47 +3915,89 @@

;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (splat src)))
(let ((vec Xmm (vec_insert_lane $I8X16 (xmm_uninit_value) src 0))
(zeros Xmm (xmm_zero $I8X16)))
;; Shuffle the lowest byte lane to all other lanes.
(x64_pshufb vec zeros)))

(rule (lower (has_type (multi_lane 16 8) (splat src)))
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane $I16X8 (xmm_uninit_value) src 0))
(vec Xmm (vec_insert_lane $I16X8 vec src 1)))
;; Shuffle the lowest two lanes to all other lanes.
(x64_pshufd vec 0)))

(rule 1 (lower (has_type (multi_lane 32 4) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_32x4 $F32X4 src))

(rule (lower (has_type (multi_lane 32 4) (splat src)))
(lower_splat_32x4 $I32X4 src))

(decl lower_splat_32x4 (Type Value) Xmm)
(rule (lower_splat_32x4 ty src)
(let ((src RegMem src)
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
;; Shuffle the lowest lane to all other lanes.
(x64_pshufd vec 0)))

(rule 1 (lower (has_type (multi_lane 64 2) (splat src @ (value_type (ty_scalar_float _)))))
(lower_splat_64x2 $F64X2 src))

(rule (lower (has_type (multi_lane 64 2) (splat src)))
(lower_splat_64x2 $I64X2 src))

(decl lower_splat_64x2 (Type Value) Xmm)
(rule (lower_splat_64x2 ty src)
(let (;; Force the input into a register so that we don't create a
;; VCodeConstant.
(src RegMem (RegMem.Reg src))
(vec Xmm (vec_insert_lane ty (xmm_uninit_value) src 0)))
(vec_insert_lane ty vec src 1)))
;; For all the splat rules below one of the goals is that splatting a value
;; doesn't end up accidentally depending on the previous value in a register.
;; This means that instructions are chosen to avoid false dependencies where
;; new values are created fresh or otherwise overwrite previous register
;; contents where possible.
;;
;; Additionally splats are specialized to special-case load-and-splat which
;; has a number of micro-optimizations available.

;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts
;; with a mask of zero which is calculated with an xor-against-itself register.
(rule 0 (lower (has_type $I8X16 (splat src)))
(x64_pshufb (bitcast_gpr_to_xmm $I32 src) (xmm_zero $I8X16)))
(rule 1 (lower (has_type $I8X16 (splat src)))
(if-let $true (has_avx2))
(x64_vpbroadcastb (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16)))
(rule 3 (lower (has_type $I8X16 (splat (sinkable_load_exact addr))))
(if-let $true (has_avx2))
(x64_vpbroadcastb addr))

;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is
;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane
;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which
;; at that point is two of the 16-bit values we want to broadcast) to all the
;; lanes.
(rule 0 (lower (has_type $I16X8 (splat src)))
(x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm $I32 src) 0) 0))
(rule 1 (lower (has_type $I16X8 (splat src)))
(if-let $true (has_avx2))
(x64_vpbroadcastw (bitcast_gpr_to_xmm $I32 src)))
(rule 2 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
(x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0))
(rule 3 (lower (has_type $I16X8 (splat (sinkable_load_exact addr))))
(if-let $true (has_avx2))
(x64_vpbroadcastw addr))

;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be
;; used to broadcast the low lane to all other lanes.
;;
;; Note that sinkable-load cases come later
(rule 0 (lower (has_type $I32X4 (splat src)))
(x64_pshufd (bitcast_gpr_to_xmm $I32 src) 0))
(rule 1 (lower (has_type $I32X4 (splat src)))
(if-let $true (has_avx2))
(x64_vpbroadcastd (bitcast_gpr_to_xmm $I32 src)))

;; f32x4.splat - the source is already in an xmm register so `shufps` is all
;; that's necessary to complete the splat. This is specialized to `vbroadcastss`
;; on AVX2 to leverage that specific instruction for this operation.
;;
;; The `let` binds the input to a single xmm register up front so the
;; Value-to-Xmm conversion happens exactly once and both `shufps` operands
;; name the same register. (Previously `tmp` was bound but unused and `src`
;; was converted at each operand position.)
(rule 0 (lower (has_type $F32X4 (splat src)))
(let ((tmp Xmm src))
(x64_shufps tmp tmp 0)))
;; Higher-priority AVX2 case: the register form of `vbroadcastss` requires
;; AVX2, hence the `has_avx2` check.
(rule 1 (lower (has_type $F32X4 (splat src)))
(if-let $true (has_avx2))
(x64_vbroadcastss src))

;; t32x4.splat of a load - use a `movss` to load into an xmm register and then
;; `shufps` broadcasts to the other lanes. Note that this is used for both i32
;; and f32 splats.
;;
;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note
;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `has_avx` test.
(rule 4 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(let ((tmp Xmm (x64_movss_load addr)))
(x64_shufps tmp tmp 0)))
(rule 5 (lower (has_type (multi_lane 32 4) (splat (sinkable_load addr))))
(if-let $true (has_avx))
(x64_vbroadcastss addr))

;; t64x2.splat - use `movddup` which is exactly what we want and there's a
;; minor specialization for sinkable loads to avoid going through a gpr for i64
;; splats
(rule 0 (lower (has_type $I64X2 (splat src)))
(x64_movddup (bitcast_gpr_to_xmm $I64 src)))
(rule 0 (lower (has_type $F64X2 (splat src)))
(x64_movddup src))
(rule 5 (lower (has_type (multi_lane 64 2) (splat (sinkable_load addr))))
(x64_movddup addr))

;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down
32 changes: 24 additions & 8 deletions cranelift/codegen/src/isa/x64/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,26 +70,42 @@ fn put_input_in_reg(ctx: &mut Lower<Inst>, spec: InsnInput) -> Reg {
.expect("Multi-register value not expected")
}

/// How wide a load must be for `is_mergeable_load` to consider sinking it
/// into the instruction that consumes it.
enum MergeableLoadSize {
    /// The load size performed by a sinkable load merging operation is
    /// precisely the size necessary for the type in question.
    Exact,

    /// Narrower-than-32-bit values are handled by ALU insts that are at least
    /// 32 bits wide, which is normally OK as we ignore upper bits; but, if we
    /// generate, e.g., a direct-from-memory 32-bit add for a byte value and
    /// the byte is the last byte in a page, the extra data that we load is
    /// incorrectly accessed. So we only allow loads to merge for
    /// 32-bit-and-above widths.
    Min32,
}

/// Determines whether a load operation (indicated by `src_insn`) can be merged
/// into the current lowering point. If so, returns the address-base source (as
/// an `InsnInput`) and an offset from that address from which to perform the
/// load.
fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInput, i32)> {
fn is_mergeable_load(
ctx: &mut Lower<Inst>,
src_insn: IRInst,
size: MergeableLoadSize,
) -> Option<(InsnInput, i32)> {
let insn_data = ctx.data(src_insn);
let inputs = ctx.num_inputs(src_insn);
if inputs != 1 {
return None;
}

// If this type is too small to get a merged load, don't merge the load.
let load_ty = ctx.output_ty(src_insn, 0);
if ty_bits(load_ty) < 32 {
// Narrower values are handled by ALU insts that are at least 32 bits
// wide, which is normally OK as we ignore upper buts; but, if we
// generate, e.g., a direct-from-memory 32-bit add for a byte value and
// the byte is the last byte in a page, the extra data that we load is
// incorrectly accessed. So we only allow loads to merge for
// 32-bit-and-above widths.
return None;
match size {
MergeableLoadSize::Exact => {}
MergeableLoadSize::Min32 => return None,
}
}

// Just testing the opcode is enough, because the width will always match if
Expand Down
Loading