9 changes: 6 additions & 3 deletions cranelift/codegen/src/isa/aarch64/lower.isle
@@ -927,7 +927,8 @@
 ;; Shift for vector types.
 (rule (lower (has_type (ty_vec128 ty) (ishl x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup y size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup masked_shift_amt size)))
     (sshl x shift size)))
 
 ;; Helper function to emit a shift operation with the opcode specified and
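The new `and_imm` step implements CLIF's wrapping semantics, where a vector shift amount is taken modulo the lane width. A scalar sketch of the behaviour being enforced (illustrative Rust, not code from this PR):

// Shifting an 8-bit lane by 9 must act like shifting by 9 & 7 == 1.
fn ishl_lane_i8(x: u8, amt: u32) -> u8 {
    let masked = amt & 7; // `and_imm $I32 y (shift_mask ty)` for i8 lanes
    x << masked
}

fn main() {
    assert_eq!(ishl_lane_i8(1, 9), 2);
    assert_eq!(ishl_lane_i8(0x80, 8), 0x80); // an amount of 8 wraps to 0
}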
@@ -986,7 +987,8 @@
 ;; Vector shifts.
 (rule (lower (has_type (ty_vec128 ty) (ushr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (ushl x shift size)))
 
 ;; lsr lo_rshift, src_lo, amt
@@ -1035,7 +1037,8 @@
 ;; Note that right shifts are implemented with a negative left shift.
 (rule (lower (has_type (ty_vec128 ty) (sshr x y)))
   (let ((size VectorSize (vector_size ty))
-        (shift Reg (vec_dup (sub $I32 (zero_reg) y) size)))
+        (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
+        (shift Reg (vec_dup (sub $I64 (zero_reg) masked_shift_amt) size)))
     (sshl x shift size)))
 
 ;; lsr lo_rshift, src_lo, amt
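AArch64's `sshl`/`ushl` take a signed per-lane amount and treat a negative value as a right shift, which is why the rules above negate the masked amount (`sub` from the zero register) rather than using a dedicated variable right-shift instruction. A scalar sketch of the trick (illustrative, not from the PR):

// Model of a single `sshl` lane: negative amounts shift right.
fn sshl_lane_i8(x: i8, amt: i64) -> i8 {
    if amt >= 0 {
        x << amt
    } else {
        x >> -amt // arithmetic right shift for signed lanes
    }
}

fn main() {
    let masked: i64 = 9 & 7; // `and_imm` with `shift_mask`
    assert_eq!(sshl_lane_i8(-64, -masked), -32); // sshr by 1
}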
4 changes: 3 additions & 1 deletion cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -335,7 +335,9 @@ where
     }
 
     fn shift_mask(&mut self, ty: Type) -> ImmLogic {
-        let mask = (ty.bits() - 1) as u64;
+        debug_assert!(ty.lane_bits().is_power_of_two());
+
+        let mask = (ty.lane_bits() - 1) as u64;
         ImmLogic::maybe_from_u64(mask, I32).unwrap()
     }
 
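The old `ty.bits() - 1` evaluated to 127 for any 128-bit vector type, so lane-sized wrapping never happened; `ty.lane_bits() - 1` yields the per-lane mask. A quick check of the resulting values (a sketch assuming the usual cranelift type constants):

use cranelift_codegen::ir::types::{I16X8, I32X4, I64X2, I8X16};

fn main() {
    // lane_bits() - 1 per lane width; the old ty.bits() - 1 gave 127 for all.
    assert_eq!(I8X16.lane_bits() - 1, 7);
    assert_eq!(I16X8.lane_bits() - 1, 15);
    assert_eq!(I32X4.lane_bits() - 1, 31);
    assert_eq!(I64X2.lane_bits() - 1, 63);
}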
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -1146,6 +1146,10 @@
 (decl reg_mem_to_xmm_mem (RegMem) XmmMem)
 (extern constructor reg_mem_to_xmm_mem reg_mem_to_xmm_mem)
 
+;; Construct a new `RegMemImm` from the given `Reg`.
+(decl reg_to_reg_mem_imm (Reg) RegMemImm)
+(extern constructor reg_to_reg_mem_imm reg_to_reg_mem_imm)
+
 ;; Construct a new `GprMemImm` from the given `RegMemImm`.
 ;;
 ;; Asserts that the `RegMemImm`'s register, if any, is an GPR register.
@@ -1353,6 +1357,10 @@
 (decl const_to_type_masked_imm8 (u64 Type) Imm8Gpr)
 (extern constructor const_to_type_masked_imm8 const_to_type_masked_imm8)
 
+;; Generate a mask for the bit-width of the given type
+(decl shift_mask (Type) u32)
+(extern constructor shift_mask shift_mask)
+
 ;; Extract a constant `GprMemImm.Imm` from a value operand.
 (decl simm32_from_value (GprMemImm) Value)
 (extern extractor simm32_from_value simm32_from_value)
@@ -3042,6 +3050,7 @@
 (convert Xmm RegMem xmm_to_reg_mem)
 (convert Reg Xmm xmm_new)
 (convert Reg XmmMem reg_to_xmm_mem)
+(convert Reg RegMemImm reg_to_reg_mem_imm)
 (convert RegMem XmmMem reg_mem_to_xmm_mem)
 (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
 (convert Xmm XmmMem xmm_to_xmm_mem)
66 changes: 40 additions & 26 deletions cranelift/codegen/src/isa/x64/lower.isle
@@ -531,13 +531,15 @@
 ;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of
 ;; instructions. The basic idea, whether the amount to shift by is an immediate
 ;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s.
-(rule (lower (has_type $I8X16 (ishl src amt)))
+(rule (lower (has_type ty @ $I8X16 (ishl src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ishl_i8x16_mask amt))
+        (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16 unmasked (RegMem.Reg mask))))

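A scalar model of this 16x8-shift-plus-mask strategy, including the new masking step (an illustrative sketch, not code from the PR):

fn ishl_i8x16(src: [u8; 16], amt: u32) -> [u8; 16] {
    let amt = amt & 7; // the wrapping this change adds
    // Per-byte fixup mask; the real lowering loads the matching 16-byte row
    // via `ishl_i8x16_mask`.
    let fixup = (0xffu32 << amt) as u8;
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // `psllw`: shift a whole 16-bit lane, letting the low byte's top
        // bits bleed into the high byte.
        let lane = u16::from_le_bytes([src[i], src[i + 1]]) << amt;
        let b = lane.to_le_bytes();
        // `pand`: clear the bled-in bits so each byte looks independently shifted.
        out[i] = b[0] & fixup;
        out[i + 1] = b[1] & fixup;
    }
    out
}

fn main() {
    // 0x81 << 1 must give 0x02 in every byte, not 0x03.
    assert_eq!(ishl_i8x16([0x81; 16], 1), [0x02; 16]);
}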
@@ -571,16 +573,19 @@
 (rule (ishl_i8x16_mask (RegMemImm.Mem amt))
       (ishl_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
 
-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 
-(rule (lower (has_type $I16X8 (ishl src amt)))
-  (x64_psllw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psllw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (ishl src amt)))
-  (x64_pslld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_pslld src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I64X2 (ishl src amt)))
-  (x64_psllq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ishl src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psllq src (mov_rmi_to_xmm masked_amt))))
 
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -630,13 +635,15 @@
 
 ;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do
 ;; with 8x16 `ishl`.
-(rule (lower (has_type $I8X16 (ushr src amt)))
+(rule (lower (has_type ty @ $I8X16 (ushr src amt)))
   (let (
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be
         ;; correct for half of the lanes; the others must be fixed up with
         ;; the mask below.
-        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm amt)))
-        (mask_addr SyntheticAmode (ushr_i8x16_mask amt))
+        (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))
+        (mask_addr SyntheticAmode (ushr_i8x16_mask masked_amt))
         (mask Reg (x64_load $I8X16 mask_addr (ExtKind.None))))
     (sse_and $I8X16
              unmasked
@@ -673,16 +680,19 @@
 (rule (ushr_i8x16_mask (RegMemImm.Mem amt))
       (ushr_i8x16_mask (RegMemImm.Reg (x64_load $I64 amt (ExtKind.None)))))
 
-;; 16x8, 32x4, and 64x2 shifts can each use a single instruction.
+;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked.
 
-(rule (lower (has_type $I16X8 (ushr src amt)))
-  (x64_psrlw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrlw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (ushr src amt)))
-  (x64_psrld src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrld src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I64X2 (ushr src amt)))
-  (x64_psrlq src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I64X2 (ushr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrlq src (mov_rmi_to_xmm masked_amt))))
 
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -746,14 +756,16 @@
 ;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
 ;; shifted_hi.i16x8 = shift each lane of `high`
 ;; result = [s0'', s1'', ..., s15'']
-(rule (lower (has_type $I8X16 (sshr src amt @ (value_type amt_ty))))
+(rule (lower (has_type ty @ $I8X16 (sshr src amt @ (value_type amt_ty))))
   (let ((src_ Xmm (put_in_xmm src))
+        ;; Mask the amount to ensure wrapping behaviour
+        (masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))
         ;; In order for `packsswb` later to only use the high byte of each
        ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
         ;; fill in the upper bits appropriately.
         (lo Xmm (x64_punpcklbw src_ src_))
         (hi Xmm (x64_punpckhbw src_ src_))
-        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty amt))
+        (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt))
         (shifted_lo Xmm (x64_psraw lo amt_))
         (shifted_hi Xmm (x64_psraw hi amt_)))
     (x64_packsswb shifted_lo shifted_hi)))
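A scalar model of this widen/shift/narrow scheme, including the masked amount (illustrative, not from the PR): duplicating each byte into both halves of a 16-bit lane and arithmetic-shifting by `amt + 8` leaves the sign-filled result in the low byte, which `packsswb` then collects.

fn sshr_i8x16(src: [i8; 16], amt: u32) -> [i8; 16] {
    // Masked amount plus the extra 8 bits added by `sshr_i8x16_bigger_shift`.
    let amt = (amt & 7) + 8;
    let mut out = [0i8; 16];
    for (o, s) in out.iter_mut().zip(src.iter()) {
        // `punpck{l,h}bw src src`: each byte s becomes the 16-bit lane (s, s).
        let widened = i16::from_le_bytes([*s as u8, *s as u8]);
        // `psraw` by amt + 8: only the sign-extended high byte survives.
        // The result always fits in i8, so `packsswb` saturation is a no-op.
        *o = (widened >> amt) as i8;
    }
    out
}

fn main() {
    assert_eq!(sshr_i8x16([-64; 16], 1)[0], -32);
    assert_eq!(sshr_i8x16([-64; 16], 9)[0], -32); // an amount of 9 wraps to 1
}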
@@ -773,11 +785,13 @@
 ;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure
 ;; that if the shift amount is in a register, it is in an XMM register.
 
-(rule (lower (has_type $I16X8 (sshr src amt)))
-  (x64_psraw src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I16X8 (sshr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psraw src (mov_rmi_to_xmm masked_amt))))
 
-(rule (lower (has_type $I32X4 (sshr src amt)))
-  (x64_psrad src (mov_rmi_to_xmm amt)))
+(rule (lower (has_type ty @ $I32X4 (sshr src amt)))
+  (let ((masked_amt Reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty)))))
+    (x64_psrad src (mov_rmi_to_xmm masked_amt))))
 
 ;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older
 ;; feature sets. Newer ones like AVX512VL + AVX512F include `vpsraq`, a 128-bit
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -227,6 +227,11 @@ where
             .unwrap()
     }
 
+    #[inline]
+    fn shift_mask(&mut self, ty: Type) -> u32 {
+        ty.lane_bits() - 1
+    }
+
     #[inline]
     fn simm32_from_value(&mut self, val: Value) -> Option<GprMemImm> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -413,6 +418,11 @@
         Writable::from_reg(Xmm::new(self.temp_writable_reg(I8X16).to_reg()).unwrap())
     }
 
+    #[inline]
+    fn reg_to_reg_mem_imm(&mut self, reg: Reg) -> RegMemImm {
+        RegMemImm::Reg { reg }
+    }
+
     #[inline]
     fn reg_mem_to_xmm_mem(&mut self, rm: &RegMem) -> XmmMem {
         XmmMem::new(rm.clone()).unwrap()
8 changes: 4 additions & 4 deletions cranelift/filetests/filetests/isa/aarch64/arithmetic.clif
@@ -344,9 +344,10 @@ block0(v0: i8x16):
 
 ; block0:
 ; movz x3, #1
-; sub w5, wzr, w3
-; dup v7.16b, w5
-; ushl v0.16b, v0.16b, v7.16b
+; and w5, w3, #7
+; sub x7, xzr, x5
+; dup v17.16b, w7
+; ushl v0.16b, v0.16b, v17.16b
 ; ret
 
 function %add_i128(i128, i128) -> i128 {
@@ -492,4 +493,3 @@
 ; b.vc 8 ; udf
 ; sdiv x0, x0, x3
 ; ret
-
61 changes: 35 additions & 26 deletions cranelift/filetests/filetests/isa/x64/simd-bitwise-compile.clif
@@ -206,12 +206,13 @@ block0(v0: i32):
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; movd %edi, %xmm5
-; psllw %xmm0, %xmm5, %xmm0
-; lea const(VCodeConstant(0)), %rsi
+; andq %rdi, $7, %rdi
+; movd %edi, %xmm7
+; psllw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
 ; shlq $4, %rdi, %rdi
-; movdqu 0(%rsi,%rdi,1), %xmm13
-; pand %xmm0, %xmm13, %xmm0
+; movdqu 0(%rax,%rdi,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -228,9 +229,14 @@
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; psrlw %xmm0, $1, %xmm0
-; movdqu const(VCodeConstant(0)), %xmm5
-; pand %xmm0, %xmm5, %xmm0
+; movl $1, %r11d
+; andq %r11, $7, %r11
+; movd %r11d, %xmm7
+; psrlw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
+; shlq $4, %r11, %r11
+; movdqu 0(%rax,%r11,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -245,15 +251,16 @@
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; load_const VCodeConstant(0), %xmm9
-; movdqa %xmm9, %xmm0
-; punpcklbw %xmm0, %xmm9, %xmm0
-; punpckhbw %xmm9, %xmm9, %xmm9
+; load_const VCodeConstant(0), %xmm10
+; andq %rdi, $7, %rdi
+; movdqa %xmm10, %xmm0
+; punpcklbw %xmm0, %xmm10, %xmm0
+; punpckhbw %xmm10, %xmm10, %xmm10
 ; addl %edi, $8, %edi
-; movd %edi, %xmm11
-; psraw %xmm0, %xmm11, %xmm0
-; psraw %xmm9, %xmm11, %xmm9
-; packsswb %xmm0, %xmm9, %xmm0
+; movd %edi, %xmm13
+; psraw %xmm0, %xmm13, %xmm0
+; psraw %xmm10, %xmm13, %xmm10
+; packsswb %xmm0, %xmm10, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -267,17 +274,19 @@
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm9
-; punpcklbw %xmm9, %xmm0, %xmm9
+; movl $3, %esi
+; andq %rsi, $7, %rsi
+; movdqa %xmm0, %xmm15
+; punpcklbw %xmm15, %xmm0, %xmm15
+; movdqa %xmm15, %xmm13
 ; punpckhbw %xmm0, %xmm0, %xmm0
-; movdqa %xmm9, %xmm12
-; psraw %xmm12, $11, %xmm12
-; movdqa %xmm12, %xmm9
-; psraw %xmm0, $11, %xmm0
-; movdqa %xmm9, %xmm1
-; packsswb %xmm1, %xmm0, %xmm1
-; movdqa %xmm1, %xmm9
-; movdqa %xmm9, %xmm0
+; movdqa %xmm0, %xmm7
+; addl %esi, $8, %esi
+; movd %esi, %xmm15
+; movdqa %xmm13, %xmm0
+; psraw %xmm0, %xmm15, %xmm0
+; psraw %xmm7, %xmm15, %xmm7
+; packsswb %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
30 changes: 30 additions & 0 deletions cranelift/filetests/filetests/runtests/simd-bitselect.clif
@@ -13,3 +13,33 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
 ; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
+function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16, v2: i8x16):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
+; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
+
+function %bitselect_i8x16() -> b1 {
+block0:
+    v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
+    v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
+    v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
+    v3 = bitselect v0, v1, v2
+
+    v4 = extractlane v3, 0
+    v5 = icmp_imm eq v4, 42
+
+    v6 = extractlane v3, 1
+    v7 = icmp_imm eq v6, 0
+
+    v8 = extractlane v3, 15
+    v9 = icmp_imm eq v8, 42
+
+    v10 = band v5, v7
+    v11 = band v10, v9
+    return v11
+}
+; run
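As the comment in the new test says, `bitselect` picks each result bit from the "if true" or "if false" operand according to the selector. A one-line scalar model (illustrative):

// Bits of `t` where `c` is 1, bits of `f` where `c` is 0.
fn bitselect(c: u128, t: u128, f: u128) -> u128 {
    (c & t) | (!c & f)
}

fn main() {
    // Mirrors lanes 15 and 0 of the run line above.
    assert_eq!(bitselect(0xff, 42, 127), 42); // all-ones selector picks "if true"
    assert_eq!(bitselect(0x00, 127, 42), 42); // zero selector picks "if false"
}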