bytecodealliance · alexcrichton · Mar 9, 2023 · Mar 3, 2023 · Mar 3, 2023 · Mar 3, 2023
@@ -863,6 +863,12 @@
             Xorpd
             Phaddw
             Phaddd
+            Punpckhdq
+            Punpckldq
+            Punpckhqdq
+            Punpcklqdq
+            Pshuflw
+            Pshufhw
           ))
 
 (type CmpOpcode extern
@@ -1347,6 +1353,12 @@
             Vcvttps2dq
             Vphaddw
             Vphaddd
+            Vpunpckhdq
+            Vpunpckldq
+            Vpunpckhqdq
+            Vpunpcklqdq
+            Vpshuflw
+            Vpshufhw
           ))
 
 (type Avx512Opcode extern
@@ -2729,6 +2741,38 @@
       (if-let $true (has_avx))
       (xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2))
 
+;; Helper for creating `punpckldq` instructions (`vpunpckldq` if `has_avx`).
+(decl x64_punpckldq (Xmm XmmMem) Xmm)
+(rule 0 (x64_punpckldq src1 src2)
+      (xmm_rm_r (SseOpcode.Punpckldq) src1 src2))
+(rule 1 (x64_punpckldq src1 src2)
+      (if-let $true (has_avx))
+      (xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2))
+
+;; Helper for creating `punpckhdq` instructions (`vpunpckhdq` if `has_avx`).
+(decl x64_punpckhdq (Xmm XmmMem) Xmm)
+(rule 0 (x64_punpckhdq src1 src2)
+      (xmm_rm_r (SseOpcode.Punpckhdq) src1 src2))
+(rule 1 (x64_punpckhdq src1 src2)
+      (if-let $true (has_avx))
+      (xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2))
+
+;; Helper for creating `punpcklqdq` instructions (`vpunpcklqdq` if `has_avx`).
+(decl x64_punpcklqdq (Xmm XmmMem) Xmm)
+(rule 0 (x64_punpcklqdq src1 src2)
+      (xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2))
+(rule 1 (x64_punpcklqdq src1 src2)
+      (if-let $true (has_avx))
+      (xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2))
+
+;; Helper for creating `punpckhqdq` instructions (`vpunpckhqdq` if `has_avx`).
+(decl x64_punpckhqdq (Xmm XmmMem) Xmm)
+(rule 0 (x64_punpckhqdq src1 src2)
+      (xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2))
+(rule 1 (x64_punpckhqdq src1 src2)
+      (if-let $true (has_avx))
+      (xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2))
+
 ;; Helper for creating `unpcklps` instructions.
 (decl x64_unpcklps (Xmm XmmMem) Xmm)
 (rule 0 (x64_unpcklps src1 src2)
@@ -3284,6 +3328,22 @@
       (if-let $true (has_avx))
       (xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2))
 
+;; Helper for creating `pshuflw` instructions (`vpshuflw` if `has_avx`).
+(decl x64_pshuflw (XmmMem u8) Xmm)
+(rule 0 (x64_pshuflw src imm)
+      (xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm))
+(rule 1 (x64_pshuflw src imm)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm))
+
+;; Helper for creating `pshufhw` instructions (`vpshufhw` if `has_avx`).
+(decl x64_pshufhw (XmmMem u8) Xmm)
+(rule 0 (x64_pshufhw src imm)
+      (xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm))
+(rule 1 (x64_pshufhw src imm)
+      (if-let $true (has_avx))
+      (xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm))
+
 ;; Helper for creating `shufps` instructions.
 (decl x64_shufps (Xmm XmmMem u8) Xmm)
 (rule 0 (x64_shufps src1 src2 byte)

@@ -1117,6 +1117,12 @@ pub enum SseOpcode {
     Xorpd,
     Phaddw,
     Phaddd,
+    Punpckhdq,
+    Punpckldq,
+    Punpckhqdq,
+    Punpcklqdq,
+    Pshuflw,
+    Pshufhw,
 }
 
 impl SseOpcode {
@@ -1256,7 +1262,13 @@ impl SseOpcode {
             | SseOpcode::Subpd
             | SseOpcode::Subsd
             | SseOpcode::Ucomisd
-            | SseOpcode::Xorpd => SSE2,
+            | SseOpcode::Xorpd
+            | SseOpcode::Punpckldq
+            | SseOpcode::Punpckhdq
+            | SseOpcode::Punpcklqdq
+            | SseOpcode::Punpckhqdq
+            | SseOpcode::Pshuflw
+            | SseOpcode::Pshufhw => SSE2,
 
             SseOpcode::Pabsb
             | SseOpcode::Pabsw
@@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Xorpd => "xorpd",
             SseOpcode::Phaddw => "phaddw",
             SseOpcode::Phaddd => "phaddd",
+            SseOpcode::Punpckldq => "punpckldq",
+            SseOpcode::Punpckhdq => "punpckhdq",
+            SseOpcode::Punpcklqdq => "punpcklqdq",
+            SseOpcode::Punpckhqdq => "punpckhqdq",
+            SseOpcode::Pshuflw => "pshuflw",
+            SseOpcode::Pshufhw => "pshufhw",
         };
         write!(fmt, "{}", name)
     }
@@ -1669,7 +1687,13 @@ impl AvxOpcode {
             | AvxOpcode::Vcvttpd2dq
             | AvxOpcode::Vcvttps2dq
             | AvxOpcode::Vphaddw
-            | AvxOpcode::Vphaddd => {
+            | AvxOpcode::Vphaddd
+            | AvxOpcode::Vpunpckldq
+            | AvxOpcode::Vpunpckhdq
+            | AvxOpcode::Vpunpcklqdq
+            | AvxOpcode::Vpunpckhqdq
+            | AvxOpcode::Vpshuflw
+            | AvxOpcode::Vpshufhw => {
                 smallvec![InstructionSet::AVX]
             }
         }

@@ -1789,6 +1789,8 @@ pub(crate) fn emit(
                 SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
                 SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
+                SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
+                SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             match src {
@@ -1946,6 +1948,10 @@ pub(crate) fn emit(
                 SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
                 SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
                 SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
+                SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
+                SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
+                SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
+                SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
                 SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
                 SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
                 SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
@@ -2171,6 +2177,10 @@ pub(crate) fn emit(
                 AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
                 AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
                 AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
+                AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
+                AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
+                AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
+                AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
                 _ => panic!("unexpected rmir vex opcode {op:?}"),
             };
             VexInstruction::new()
@@ -2400,6 +2410,8 @@ pub(crate) fn emit(
             let (prefix, map, opcode) = match op {
                 AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
                 AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
+                AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
+                AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
                 _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
             };
 

@@ -3529,16 +3529,98 @@
 
 ;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; Special case for the `punpckhbw` instruction which interleaves the upper
-;; lanes of the two input registers.
-(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
+;; Special case for the `pshuf{l,h}w` instructions, which shuffle four 16-bit
+;; integers within one value while preserving the other four 16-bit integers in
+;; that value (either the high or low half). The complicated logic lives in the
+;; extractors, which are implemented in Rust. Note that there are two cases for
+;; each instruction here, to match when either the first or the second shuffle
+;; operand is used.
+(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm)))
+      (x64_pshuflw x imm))
+(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm)))
+      (x64_pshuflw y imm))
+(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm)))
+      (x64_pshufhw x imm))
+(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm)))
+      (x64_pshufhw y imm))
+
+(decl pshuflw_lhs_imm (u8) Immediate)
+(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
+(decl pshuflw_rhs_imm (u8) Immediate)
+(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
+(decl pshufhw_lhs_imm (u8) Immediate)
+(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
+(decl pshufhw_rhs_imm (u8) Immediate)
+(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)
+
+;; Special case for the `pshufd` instruction which will permute 32-bit values
+;; within a single register. This is only applicable if the `imm` specified
+;; selects 32-bit values from either `x` or `y`, but not both. This means
+;; there's one rule for selecting from `x` and another rule for selecting from
+;; `y`.
+(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm)))
+      (x64_pshufd x imm))
+(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm)))
+      (x64_pshufd y imm))
+
+(decl pshufd_lhs_imm (u8) Immediate)
+(extern extractor pshufd_lhs_imm pshufd_lhs_imm)
+(decl pshufd_rhs_imm (u8) Immediate)
+(extern extractor pshufd_rhs_imm pshufd_rhs_imm)
+
+;; Special case for i8-level interleaving of the upper/lower operand halves.
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
       (x64_punpckhbw a b))
-
-;; Special case for the `punpcklbw` instruction which interleaves the lower
-;; lanes of the two input registers.
-(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
       (x64_punpcklbw a b))
 
+;; Special case for i16-level interleaving of the upper/lower operand halves.
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
+      (x64_punpckhwd a b))
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
+      (x64_punpcklwd a b))
+
+;; Special case for i32-level interleaving of the upper/lower operand halves.
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
+      (x64_punpckhdq a b))
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
+      (x64_punpckldq a b))
+
+;; Special case for i64-level interleaving of the upper/lower operand halves.
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
+      (x64_punpckhqdq a b))
+(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
+      (x64_punpcklqdq a b))
+
+;; If the vector shuffle mask is all 0s then that means the first byte of the
+;; first operand is broadcast to all bytes. Falling through would load an
+;; all-zeros constant from a rip-relative location but it should be slightly
+;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
+;; register.
+(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
+      (x64_pshufb a (xmm_zero $I8X16)))
+
+;; Special case for the `shufps` instruction which will select two 32-bit values
+;; from the first operand and two 32-bit values from the second operand. Note
+;; that there is a second case here as well for when the operands can be
+;; swapped.
+;;
+;; Note that the priority of these rules is currently lower than the above
+;; special cases since `shufps` handles many of them and for now it's
+;; hypothesized that the dedicated instructions are better than `shufps`.
+;; Someone with more knowledge about x86 timings should perhaps reorder the
+;; rules here eventually though.
+(rule 5 (lower (shuffle x y (shufps_imm imm)))
+      (x64_shufps x y imm))
+(rule 4 (lower (shuffle x y (shufps_rev_imm imm)))
+      (x64_shufps y x imm))
+
+(decl shufps_imm (u8) Immediate)
+(extern extractor shufps_imm shufps_imm)
+(decl shufps_rev_imm (u8) Immediate)
+(extern extractor shufps_rev_imm shufps_rev_imm)
+
+
 ;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
 ;; register. We statically build `constructed_mask` to zero out any unknown lane
 ;; indices (may not be completely necessary: verification could fail incorrect