Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,12 @@
Xorpd
Phaddw
Phaddd
Punpckhdq
Punpckldq
Punpckhqdq
Punpcklqdq
Pshuflw
Pshufhw
))

(type CmpOpcode extern
Expand Down Expand Up @@ -1347,6 +1353,12 @@
Vcvttps2dq
Vphaddw
Vphaddd
Vpunpckhdq
Vpunpckldq
Vpunpckhqdq
Vpunpcklqdq
Vpshuflw
Vpshufhw
))

(type Avx512Opcode extern
Expand Down Expand Up @@ -2729,6 +2741,38 @@
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpcklwd) src1 src2))

;; Helper for creating `punpckldq` instructions.
(decl x64_punpckldq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckldq src1 src2)
(xmm_rm_r (SseOpcode.Punpckldq) src1 src2))
(rule 1 (x64_punpckldq src1 src2)
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpckldq) src1 src2))

;; Helper for creating `punpckhdq` instructions.
(decl x64_punpckhdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckhdq src1 src2)
(xmm_rm_r (SseOpcode.Punpckhdq) src1 src2))
(rule 1 (x64_punpckhdq src1 src2)
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpckhdq) src1 src2))

;; Helper for creating `punpcklqdq` instructions.
(decl x64_punpcklqdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpcklqdq src1 src2)
(xmm_rm_r (SseOpcode.Punpcklqdq) src1 src2))
(rule 1 (x64_punpcklqdq src1 src2)
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpcklqdq) src1 src2))

;; Helper for creating `punpckhqdq` instructions.
(decl x64_punpckhqdq (Xmm XmmMem) Xmm)
(rule 0 (x64_punpckhqdq src1 src2)
(xmm_rm_r (SseOpcode.Punpckhqdq) src1 src2))
(rule 1 (x64_punpckhqdq src1 src2)
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpunpckhqdq) src1 src2))

;; Helper for creating `unpcklps` instructions.
(decl x64_unpcklps (Xmm XmmMem) Xmm)
(rule 0 (x64_unpcklps src1 src2)
Expand Down Expand Up @@ -3284,6 +3328,22 @@
(if-let $true (has_avx))
(xmm_rmir_vex (AvxOpcode.Vpshufb) src1 src2))

;; Helper for creating `pshuflw` instructions.
(decl x64_pshuflw (XmmMem u8) Xmm)
(rule (x64_pshuflw src imm)
(xmm_unary_rm_r_imm (SseOpcode.Pshuflw) src imm))
(rule 1 (x64_pshuflw src imm)
(if-let $true (has_avx))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshuflw) src imm))

;; Helper for creating `pshufhw` instructions.
(decl x64_pshufhw (XmmMem u8) Xmm)
(rule (x64_pshufhw src imm)
(xmm_unary_rm_r_imm (SseOpcode.Pshufhw) src imm))
(rule 1 (x64_pshufhw src imm)
(if-let $true (has_avx))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vpshufhw) src imm))

;; Helper for creating `shufps` instructions.
(decl x64_shufps (Xmm XmmMem u8) Xmm)
(rule 0 (x64_shufps src1 src2 byte)
Expand Down
28 changes: 26 additions & 2 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1117,6 +1117,12 @@ pub enum SseOpcode {
Xorpd,
Phaddw,
Phaddd,
Punpckhdq,
Punpckldq,
Punpckhqdq,
Punpcklqdq,
Pshuflw,
Pshufhw,
}

impl SseOpcode {
Expand Down Expand Up @@ -1256,7 +1262,13 @@ impl SseOpcode {
| SseOpcode::Subpd
| SseOpcode::Subsd
| SseOpcode::Ucomisd
| SseOpcode::Xorpd => SSE2,
| SseOpcode::Xorpd
| SseOpcode::Punpckldq
| SseOpcode::Punpckhdq
| SseOpcode::Punpcklqdq
| SseOpcode::Punpckhqdq
| SseOpcode::Pshuflw
| SseOpcode::Pshufhw => SSE2,

SseOpcode::Pabsb
| SseOpcode::Pabsw
Expand Down Expand Up @@ -1501,6 +1513,12 @@ impl fmt::Debug for SseOpcode {
SseOpcode::Xorpd => "xorpd",
SseOpcode::Phaddw => "phaddw",
SseOpcode::Phaddd => "phaddd",
SseOpcode::Punpckldq => "punpckldq",
SseOpcode::Punpckhdq => "punpckhdq",
SseOpcode::Punpcklqdq => "punpcklqdq",
SseOpcode::Punpckhqdq => "punpckhqdq",
SseOpcode::Pshuflw => "pshuflw",
SseOpcode::Pshufhw => "pshufhw",
};
write!(fmt, "{}", name)
}
Expand Down Expand Up @@ -1669,7 +1687,13 @@ impl AvxOpcode {
| AvxOpcode::Vcvttpd2dq
| AvxOpcode::Vcvttps2dq
| AvxOpcode::Vphaddw
| AvxOpcode::Vphaddd => {
| AvxOpcode::Vphaddd
| AvxOpcode::Vpunpckldq
| AvxOpcode::Vpunpckhdq
| AvxOpcode::Vpunpcklqdq
| AvxOpcode::Vpunpckhqdq
| AvxOpcode::Vpshuflw
| AvxOpcode::Vpshufhw => {
smallvec![InstructionSet::AVX]
}
}
Expand Down
12 changes: 12 additions & 0 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1789,6 +1789,8 @@ pub(crate) fn emit(
SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
_ => unimplemented!("Opcode {:?} not implemented", op),
};
match src {
Expand Down Expand Up @@ -1946,6 +1948,10 @@ pub(crate) fn emit(
SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
Expand Down Expand Up @@ -2171,6 +2177,10 @@ pub(crate) fn emit(
AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
_ => panic!("unexpected rmir vex opcode {op:?}"),
};
VexInstruction::new()
Expand Down Expand Up @@ -2400,6 +2410,8 @@ pub(crate) fn emit(
let (prefix, map, opcode) = match op {
AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

Expand Down
96 changes: 89 additions & 7 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3529,16 +3529,98 @@

;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Special case for the `punpckhbw` instruction which interleaves the upper
;; lanes of the two input registers.
(rule 4 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
;; integers within one value, preserving the other four 16-bit integers in that
;; value (either the high or low half). The complicated logic is in the
;; extractors here implemented in Rust and note that there's two cases for each
;; instruction here to match when either the first or second shuffle operand is
;; used.
(rule 12 (lower (shuffle x y (pshuflw_lhs_imm imm)))
(x64_pshuflw x imm))
(rule 11 (lower (shuffle x y (pshuflw_rhs_imm imm)))
(x64_pshuflw y imm))
(rule 10 (lower (shuffle x y (pshufhw_lhs_imm imm)))
(x64_pshufhw x imm))
(rule 9 (lower (shuffle x y (pshufhw_rhs_imm imm)))
(x64_pshufhw y imm))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I started to wonder how many comparisons need to be made before one can emit a shuffle lowering: I think in the end it is worth it to spend a bit more time finding the right instruction here but I wonder if there is quicker way to do so. (Just wondering, no action neeeded).

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same! My thinking though was that there's really not all that many shuffle instructions in a program so if shuffle is 100x slower to translate than load it's probably not the end of the world. That being said I do think that these can be slightly optimized where instead of having an extractor-per-immediate we could have one extractor that returns an enum of all the possible shuffle instructions that could be used which is then pattern-matched on in ISLE. I'm not sure if it would be that much faster but it would prevent bouncing back and forth between generated code and isle.rs if it becomes an issue.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to mention that V8 does this "match once, then emit" through separate opcodes like you mention below:

Oh I meant to take a look at that but never got around to it. Interestingly though it looks like the x64 backend for v8 has individual opcodes for all the punpck* variants so it seems like they probably translate shuffles to different opcodes earlier in the pipeline than what Cranelift is doing in the backend here.

I like the enum idea for some point in the future.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somewhere in v8 though something is translating from the wasm shuffle to each individual kX64* instruction though, right? I would naively expect that the translation there is on the same order-of-magnitude of complexity/time/etc for what these ISLE lowerings are generating.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since ISLE will combine all the u128_from_immediate calls in the rules at priority 6 below, many of these rules are really cheap to evaluate together. This isn't as bad as it looks. 😆

The enum approach would let ISLE do the same collapsing for a bunch more of these rules, so yeah, that should be good. Also, all the rules that would then use the same extractor but match on different returned enum variants should then be placed at the same priority if there aren't other rules at priorities in between.


(decl pshuflw_lhs_imm (u8) Immediate)
(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
(decl pshuflw_rhs_imm (u8) Immediate)
(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
(decl pshufhw_lhs_imm (u8) Immediate)
(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
(decl pshufhw_rhs_imm (u8) Immediate)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wish the matching logic could be visible here in ISLE but that doesn't really change how I feel about this PR.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I get that it gets rather hairy in the helpers...

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree! I originally was wondering how bad it would be to put this all in ISLE but I ended up concluding that it would end up with even more extractors and not necessarily simplify the overall scheme, so I opted to leave the rules in ISLE but the extraction of immediates in Rust (for better or worse)

(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)

;; Special case for the `pshufd` instruction which will permute 32-bit values
;; within a single register. This is only applicable if the `imm` specified
;; selects 32-bit values from either `x` or `y`, but not both. This means
;; there's one rule for selecting from `x` and another rule for selecting from
;; `y`.
(rule 8 (lower (shuffle x y (pshufd_lhs_imm imm)))
(x64_pshufd x imm))
(rule 7 (lower (shuffle x y (pshufd_rhs_imm imm)))
(x64_pshufd y imm))

(decl pshufd_lhs_imm (u8) Immediate)
(extern extractor pshufd_lhs_imm pshufd_lhs_imm)
(decl pshufd_rhs_imm (u8) Immediate)
(extern extractor pshufd_rhs_imm pshufd_rhs_imm)

;; Special case for i8-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
(x64_punpckhbw a b))

;; Special case for the `punpcklbw` instruction which interleaves the lower
;; lanes of the two input registers.
(rule 4 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
(x64_punpcklbw a b))

;; Special case for i16-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
(x64_punpckhwd a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
(x64_punpcklwd a b))

;; Special case for i32-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
(x64_punpckhdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
(x64_punpckldq a b))

;; Special case for i64-level interleaving of upper/low bytes.
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
(x64_punpckhqdq a b))
(rule 6 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
(x64_punpcklqdq a b))

;; If the vector shift mask is all 0s then that means the first byte of the
;; first operand is broadcast to all bytes. Falling through would load an
;; all-zeros constant from a rip-relative location but it should be slightly
;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
;; register.
(rule 6 (lower (shuffle a _ (u128_from_immediate 0)))
(x64_pshufb a (xmm_zero $I8X16)))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose there could be a similar version of this rule if all lanes specify to broadcast the first byte of the second operand. That's, what, if every byte in the mask is 0x10?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True! I've been going back and forth a bit on how exhaustive all these rules should be. For example this rule:

(rule 6 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
      (x64_punpckhwd a b))

could also match the u128 with the two 64-bit halves swapped to generate a punpckhwd b a instruction.

I added a bunch of lhs/rhs shuffles here for things like pshufd and shufps but I'm also second guessing myself doing that since I didn't actually see any such shuffles in the wild and I was just adding them here on a whim.

Some of this I've been feeling would be better implemented as a egraph-style optimization. For example if all the shuffles lanes are <16 then the b operand could be dropped. If they're all >=16 then they could all be updated and the a operand could be dropped. For the punpckhwd case above it's theoretically possible where the LSB of the shuffle mask could be one of the lowest numbered bytes. You can see though that by this point it feels much more complicated than a simple rule so I also haven't tried to do this yet (plus from an egraph-perspective it's not really "simpler" to replace a shuffle with a shuffle)

In the end I figured I'd stick to doing the obvious lowerings for each instruction a platform has along with ensuring that some various simd binaries I've been seeing all have all their shuffles special-cased. We can always add more here in the future, and I was mainly hoping that by that point it's more obivous how to solve the problem of "could this shuffle be special-cased if the operands were swapped?"


;; Special case for the `shufps` instruction which will select two 32-bit values
;; from the first operand and two 32-bit values from the second operand. Note
;; that there is a second case here as well for when the operands can be
;; swapped.
;;
;; Note that the priority of this instruction is currently lower than the above
;; special cases since `shufps` handles many of them and for now it's
;; hypothesized that the dedicated instructions are better than `shufps`.
;; Someone with more knowledge about x86 timings should perhaps reorder the
;; rules here eventually though.
(rule 5 (lower (shuffle x y (shufps_imm imm)))
(x64_shufps x y imm))
(rule 4 (lower (shuffle x y (shufps_rev_imm imm)))
(x64_shufps y x imm))

(decl shufps_imm(u8) Immediate)
(extern extractor shufps_imm shufps_imm)
(decl shufps_rev_imm(u8) Immediate)
(extern extractor shufps_rev_imm shufps_rev_imm)


;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
;; register. We statically build `constructed_mask` to zero out any unknown lane
;; indices (may not be completely necessary: verification could fail incorrect
Expand Down
Loading