9 changes: 6 additions & 3 deletions cranelift/codegen/src/isa/aarch64/lower.isle
@@ -927,7 +927,8 @@
 ;; Shift for vector types.
 (rule (lower (has_type (ty_vec128 ty) (ishl x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup y size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup masked_shift_amt size)))
     (sshl x shift size)))
 
 ;; Helper function to emit a shift operation with the opcode specified and
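The new `and_imm` step implements CLIF's wrapping semantics, where a vector shift amount is taken modulo the lane width. A scalar sketch of the behaviour being enforced (illustrative Rust, not code from this PR):

// Shifting an 8-bit lane by 9 must act like shifting by 9 & 7 == 1.
fn ishl_lane_i8(x: u8, amt: u32) -> u8 {
    let masked = amt & 7; // `and_imm $I32 y (shift_mask ty)` for i8 lanes
    x << masked
}

fn main() {
    assert_eq!(ishl_lane_i8(1, 9), 2);
    assert_eq!(ishl_lane_i8(0x80, 8), 0x80); // an amount of 8 wraps to 0
}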
@@ -986,7 +987,8 @@
 ;; Vector shifts.
 (rule (lower (has_type (ty_vec128 ty) (ushr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (ushl x shift size)))
 
 ;; lsr lo_rshift, src_lo, amt
@@ -1035,7 +1037,8 @@
 ;; Note that right shifts are implemented with a negative left shift.
 (rule (lower (has_type (ty_vec128 ty) (sshr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (sshl x shift size)))
 
 ;; lsr lo_rshift, src_lo, amt
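AArch64's `sshl`/`ushl` take a signed per-lane amount and treat a negative value as a right shift, which is why the rules above negate the masked amount (`sub` from the zero register) rather than using a dedicated variable right-shift instruction. A scalar sketch of the trick (illustrative, not from the PR):

// Model of a single `sshl` lane: negative amounts shift right.
fn sshl_lane_i8(x: i8, amt: i64) -> i8 {
    if amt >= 0 {
        x << amt
    } else {
        x >> -amt // arithmetic right shift for signed lanes
    }
}

fn main() {
    let masked: i64 = 9 & 7; // `and_imm` with `shift_mask`
    assert_eq!(sshl_lane_i8(-64, -masked), -32); // sshr by 1
}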
4 changes: 3 additions & 1 deletion cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -335,7 +335,9 @@ where
     }
 
     fn shift_mask(&mut self, ty: Type) -> ImmLogic {
-        let mask = (ty.bits() - 1) as u64;
+        debug_assert!(ty.lane_bits().is_power_of_two());
+
+        let mask = (ty.lane_bits() - 1) as u64;
         ImmLogic::maybe_from_u64(mask, I32).unwrap()
     }
 
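The old `ty.bits() - 1` evaluated to 127 for any 128-bit vector type, so lane-sized wrapping never happened; `ty.lane_bits() - 1` yields the per-lane mask. A quick check of the resulting values (a sketch assuming the usual cranelift type constants):

use cranelift_codegen::ir::types::{I16X8, I32X4, I64X2, I8X16};

fn main() {
    // lane_bits() - 1 per lane width; the old ty.bits() - 1 gave 127 for all.
    assert_eq!(I8X16.lane_bits() - 1, 7);
    assert_eq!(I16X8.lane_bits() - 1, 15);
    assert_eq!(I32X4.lane_bits() - 1, 31);
    assert_eq!(I64X2.lane_bits() - 1, 63);
}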
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -1146,6 +1146,10 @@
 (decl reg_mem_to_xmm_mem (RegMem) XmmMem)
 (extern constructor reg_mem_to_xmm_mem reg_mem_to_xmm_mem)
 
+;; Construct a new `RegMemImm` from the given `Reg`.
+(decl reg_to_reg_mem_imm (Reg) RegMemImm)
+(extern constructor reg_to_reg_mem_imm reg_to_reg_mem_imm)
+
 ;; Construct a new `GprMemImm` from the given `RegMemImm`.
 ;;
 ;; Asserts that the `RegMemImm`'s register, if any, is an GPR register.
@@ -1353,6 +1357,10 @@
 (decl const_to_type_masked_imm8 (u64 Type) Imm8Gpr)
 (extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
 
+;; Generate a mask for the bit-width of the given type
+(decl shift_mask (Type) u32)
+(extern constructor shift_mask shift_mask)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)
@@ -3042,6 +3050,7 @@
 (convert Xmm RegMem xmm_to_reg_mem)
 (convert Reg Xmm xmm_new)
 (convert Reg XmmMem reg_to_xmm_mem)
+(convert Reg RegMemImm reg_to_reg_mem_imm)
 (convert RegMem XmmMem reg_mem_to_xmm_mem)
 (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
66 changes: 40 additions & 26 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -531,13 +531,15 @@
 ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
 ;; instructions. The basic idea, whether the amount to shift by is an immediate
 ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
-(rule (lower (has_type $I8X16 (ishl src amt)))
+(rule (lower (has_type ty @ $I8X16 (ishl src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ishl_i8x16_mask amt))
+        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16 unmasked (RegMem.Reg mask))))

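A scalar model of this 16x8-shift-plus-mask strategy, including the new masking step (an illustrative sketch, not code from the PR):

fn ishl_i8x16(src: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // the wrapping this change adds
    // Per-byte fixup mask; the real lowering loads the matching 16-byte row
    // via `ishl_i8x16_mask`.
    let fixup = (0xffu32 << amt) as u8;
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // `psllw`: shift a whole 16-bit lane, letting the low byte's top
        // bits bleed into the high byte.
        let lane = u16::from_le_bytes([src[i], src[i + 1]]) << amt;
        let b = lane.to_le_bytes();
        // `pand`: clear the bled-in bits so each byte looks independently shifted.
        out[i] = b[0] & fixup;
        out[i + 1] = b[1] & fixup;
    }
    out
}

fn main() {
    // 0x81 << 1 must give 0x02 in every byte, not 0x03.
    assert_eq!(ishl_i8x16([0x81; 16], 1), [0x02; 16]);
}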
@@ -571,16 +573,19 @@
 (rule (ishl_i8x16_mask (RegMemImm.Mem amt))
       (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
 
-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 
-(rule (lower (has_type $I16X8 (ishl src amt)))
-  (x64_psllw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psllw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (ishl src amt)))
-  (x64_pslld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_pslld src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I64X2 (ishl src amt)))
-  (x64_psllq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psllq src (mov_rmi_to_xmm masked_amt))))
 
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -630,13 +635,15 @@
 
 ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
 ;; with 8x16 `ishl`.
-(rule (lower (has_type $I8X16 (ushr src amt)))
+(rule (lower (has_type ty @ $I8X16 (ushr src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ushr_i8x16_mask amt))
+        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16
              unmasked
@@ -673,16 +680,19 @@
 (rule (ushr_i8x16_mask (RegMemImm.Mem amt))
       (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
 
-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 
-(rule (lower (has_type $I16X8 (ushr src amt)))
-  (x64_psrlw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (ushr src amt)))
-  (x64_psrld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrld src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I64X2 (ushr src amt)))
-  (x64_psrlq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrlq src (mov_rmi_to_xmm masked_amt))))
 
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -746,14 +756,16 @@
 ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
 ;; shifted_hi.i16x8 = shift each lane of `high`
 ;; result = [s0'', s1'', ..., s15'']
-(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
   (let ((src_ Xmm (put_in_xmm src))
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; In order for `packsswb` later to only use the high byte of each
        ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
         ;; fill in the upper bits appropriately.
         (lo Xmm (x64_punpcklbw src_ src_))
         (hi Xmm (x64_punpckhbw src_ src_))
-        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty amt))
+        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt))
         (shifted_lo Xmm (x64_psraw lo amt_))
         (shifted_hi Xmm (x64_psraw hi amt_)))
     (x64_packsswb shifted_lo shifted_hi)))
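A scalar model of this widen/shift/narrow scheme, including the masked amount (illustrative, not from the PR): duplicating each byte into both halves of a 16-bit lane and arithmetic-shifting by `amt + 8` leaves the sign-filled result in the low byte, which `packsswb` then collects.

fn sshr_i8x16(src: [i8; 16], amt: u32) -> [i8; 16] {
    // Masked amount plus the extra 8 bits added by `sshr_i8x16_bigger_shift`.
    let amt = (amt & 7) + 8;
    let mut out = [0i8; 16];
    for (o, s) in out.iter_mut().zip(src.iter()) {
        // `punpck{l,h}bw src src`: each byte s becomes the 16-bit lane (s, s).
        let widened = i16::from_le_bytes([*s as u8, *s as u8]);
        // `psraw` by amt + 8: only the sign-extended high byte survives.
        // The result always fits in i8, so `packsswb` saturation is a no-op.
        *o = (widened >> amt) as i8;
    }
    out
}

fn main() {
    assert_eq!(sshr_i8x16([-64; 16], 1)[0], -32);
    assert_eq!(sshr_i8x16([-64; 16], 9)[0], -32); // an amount of 9 wraps to 1
}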
@@ -773,11 +785,13 @@
 ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
 ;; that if the shift amount is in a register, it is in an XMM register.
 
-(rule (lower (has_type $I16X8 (sshr src amt)))
-  (x64_psraw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (sshr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psraw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (sshr src amt)))
-  (x64_psrad src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (sshr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrad src (mov_rmi_to_xmm masked_amt))))
 
 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -227,6 +227,11 @@ where
             .unwrap()
     }
 
+    #[inline]
+    fn shift_mask(&mut self, ty: Type) -> u32 {
+        ty.lane_bits() - 1
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -413,6 +418,11 @@
         Writable::from_reg(Xmm::new(self.temp_writable_reg(I8X16).to_reg()).unwrap())
     }
 
+    #[inline]
+    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
+        RegMemImm::Reg { reg }
+    }
+
     #[inline]
     fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
         XmmMem::new(rm.clone()).unwrap()
8 changes: 4 additions & 4 deletions cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -344,9 +344,10 @@ block0(v0: i8x16):
 
 ; block0:
 ; movz x3, #1
-; sub w5, wzr, w3
-; dup v7.16b, w5
-; ushl v0.16b, v0.16b, v7.16b
+; and w5, w3, #7
+; sub x7, xzr, x5
+; dup v17.16b, w7
+; ushl v0.16b, v0.16b, v17.16b
 ; ret
 
 function %add_i128(i128, i128) -> i128 {
@@ -492,4 +493,3 @@
 ; b.vc 8 ; udf
 ; sdiv x0, x0, x3
 ; ret
-
61 changes: 35 additions & 26 deletions cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -206,12 +206,13 @@ block0(v0: i32):
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; movd %edi, %xmm5
-; psllw %xmm0, %xmm5, %xmm0
-; lea const(VCodeConstant(0)), %rsi
+; andq %rdi, $7, %rdi
+; movd %edi, %xmm7
+; psllw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
 ; shlq $4, %rdi, %rdi
-; movdqu 0(%rsi,%rdi,1), %xmm13
-; pand %xmm0, %xmm13, %xmm0
+; movdqu 0(%rax,%rdi,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -228,9 +229,14 @@
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; psrlw %xmm0, $1, %xmm0
-; movdqu const(VCodeConstant(0)), %xmm5
-; pand %xmm0, %xmm5, %xmm0
+; movl $1, %r11d
+; andq %r11, $7, %r11
+; movd %r11d, %xmm7
+; psrlw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
+; shlq $4, %r11, %r11
+; movdqu 0(%rax,%r11,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -245,15 +251,16 @@
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; load_const VCodeConstant(0), %xmm9
-; movdqa %xmm9, %xmm0
-; punpcklbw %xmm0, %xmm9, %xmm0
-; punpckhbw %xmm9, %xmm9, %xmm9
+; load_const VCodeConstant(0), %xmm10
+; andq %rdi, $7, %rdi
+; movdqa %xmm10, %xmm0
+; punpcklbw %xmm0, %xmm10, %xmm0
+; punpckhbw %xmm10, %xmm10, %xmm10
 ; addl %edi, $8, %edi
-; movd %edi, %xmm11
-; psraw %xmm0, %xmm11, %xmm0
-; psraw %xmm9, %xmm11, %xmm9
-; packsswb %xmm0, %xmm9, %xmm0
+; movd %edi, %xmm13
+; psraw %xmm0, %xmm13, %xmm0
+; psraw %xmm10, %xmm13, %xmm10
+; packsswb %xmm0, %xmm10, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -267,17 +274,19 @@
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm9
-; punpcklbw %xmm9, %xmm0, %xmm9
+; movl $3, %esi
+; andq %rsi, $7, %rsi
+; movdqa %xmm0, %xmm15
+; punpcklbw %xmm15, %xmm0, %xmm15
+; movdqa %xmm15, %xmm13
 ; punpckhbw %xmm0, %xmm0, %xmm0
-; movdqa %xmm9, %xmm12
-; psraw %xmm12, $11, %xmm12
-; movdqa %xmm12, %xmm9
-; psraw %xmm0, $11, %xmm0
-; movdqa %xmm9, %xmm1
-; packsswb %xmm1, %xmm0, %xmm1
-; movdqa %xmm1, %xmm9
-; movdqa %xmm9, %xmm0
+; movdqa %xmm0, %xmm7
+; addl %esi, $8, %esi
+; movd %esi, %xmm15
+; movdqa %xmm13, %xmm0
+; psraw %xmm0, %xmm15, %xmm0
+; psraw %xmm7, %xmm15, %xmm7
+; packsswb %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
30 changes: 30 additions & 0 deletions cranelift/filetests/filetests/runtests/simd-bitselect.clif
@@ -13,3 +13,33 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
 ; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
+function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i8x16):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+
+function %bitselect_i8x16() -> b1 {
+block0:
+    v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
+    v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
+    v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
+    v3 = bitselect v0, v1, v2
+
+    v4 = extractlane v3, 0
+    v5 = icmp_imm eq v4, 42
+
+    v6 = extractlane v3, 1
+    v7 = icmp_imm eq v6, 0
+
+    v8 = extractlane v3, 15
+    v9 = icmp_imm eq v8, 42
+
+    v10 = band v5, v7
+    v11 = band v10, v9
+    return v11
+}
+; run
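As the comment in the new test says, `bitselect` picks each result bit from the "if true" or "if false" operand according to the selector. A one-line scalar model (illustrative):

// Bits of `t` where `c` is 1, bits of `f` where `c` is 0.
fn bitselect(c: u128, t: u128, f: u128) -> u128 {
    (c & t) | (!c & f)
}

fn main() {
    // Mirrors lanes 15 and 0 of the run line above.
    assert_eq!(bitselect(0xff, 42, 127), 42); // all-ones selector picks "if true"
    assert_eq!(bitselect(0x00, 127, 42), 42); // zero selector picks "if false"
}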