Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 51 additions & 4 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,8 @@
(VecDupFromFpu
(rd WritableReg)
(rn Reg)
(size VectorSize))
(size VectorSize)
(lane u8))

;; Duplicate FP immediate to vector.
(VecDupFPImm
Expand Down Expand Up @@ -1390,8 +1391,18 @@
(Addp)
;; Zip vectors (primary) [meaning, low halves]
(Zip1)
;; Zip vectors (secondary)
(Zip2)
;; Signed saturating rounding doubling multiply returning high half
(Sqrdmulh)
;; Unzip vectors (primary)
(Uzp1)
;; Unzip vectors (secondary)
(Uzp2)
;; Transpose vectors (primary)
(Trn1)
;; Transpose vectors (secondary)
(Trn2)
))

;; A Vector ALU operation which modifies a source register.
Expand Down Expand Up @@ -1420,6 +1431,10 @@
(Fneg)
;; Floating-point square root
(Fsqrt)
;; Reverse elements in 16-bit lanes
(Rev16)
;; Reverse elements in 32-bit lanes
(Rev32)
;; Reverse elements in 64-bit doublewords
(Rev64)
;; Floating-point convert to signed integer, rounding toward zero
Expand Down Expand Up @@ -1887,10 +1902,10 @@
dst))

;; Helper for emitting `MInst.VecDupFromFpu` instructions.
(decl vec_dup_from_fpu (Reg VectorSize) Reg)
(rule (vec_dup_from_fpu src size)
(decl vec_dup_from_fpu (Reg VectorSize u8) Reg)
(rule (vec_dup_from_fpu src size lane)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecDupFromFpu dst src size))))
(_ Unit (emit (MInst.VecDupFromFpu dst src size lane))))
dst))

;; Helper for emitting `MInst.AluRRImm12` instructions.
Expand Down Expand Up @@ -2386,6 +2401,14 @@
(decl neg (Reg VectorSize) Reg)
(rule (neg x size) (vec_misc (VecMisc2.Neg) x size))

;; Emits a `rev16` of `src` by way of `vec_misc` with `VecMisc2.Rev16`
;; (reverse elements in 16-bit lanes). The emitter debug-asserts that `size`
;; is `Size8x16`.
(decl rev16 (Reg VectorSize) Reg)
(rule (rev16 src size)
      (vec_misc (VecMisc2.Rev16) src size))

;; Emits a `rev32` of `src` by way of `vec_misc` with `VecMisc2.Rev32`
;; (reverse elements in 32-bit lanes). The emitter debug-asserts that `size`
;; is `Size8x16` or `Size16x8`.
(decl rev32 (Reg VectorSize) Reg)
(rule (rev32 src size)
      (vec_misc (VecMisc2.Rev32) src size))

;; Emits a `rev64` of `src` by way of `vec_misc` with `VecMisc2.Rev64`
;; (reverse elements in 64-bit doublewords). The emitter's debug assertion
;; accepts `Size8x16`, `Size16x8` and `Size32x4` for `size`.
(decl rev64 (Reg VectorSize) Reg)
(rule (rev64 src size)
      (vec_misc (VecMisc2.Rev64) src size))
Expand Down Expand Up @@ -3767,3 +3790,27 @@
(emit_side_effect (with_flags_side_effect
(cmp (OperandSize.Size32) ridx jt_size)
(jt_sequence ridx jt_info)))))

;; Emits `uzp1` (unzip vectors, primary) on `lhs` and `rhs` through `vec_rrr`
;; with `VecALUOp.Uzp1`.
(decl vec_uzp1 (Reg Reg VectorSize) Reg)
(rule (vec_uzp1 lhs rhs size)
      (vec_rrr (VecALUOp.Uzp1) lhs rhs size))

;; Emits `uzp2` (unzip vectors, secondary) on `lhs` and `rhs` through `vec_rrr`
;; with `VecALUOp.Uzp2`.
(decl vec_uzp2 (Reg Reg VectorSize) Reg)
(rule (vec_uzp2 lhs rhs size)
      (vec_rrr (VecALUOp.Uzp2) lhs rhs size))

;; Emits `zip1` (zip vectors, primary) on `lhs` and `rhs` through `vec_rrr`
;; with `VecALUOp.Zip1`.
(decl vec_zip1 (Reg Reg VectorSize) Reg)
(rule (vec_zip1 lhs rhs size)
      (vec_rrr (VecALUOp.Zip1) lhs rhs size))

;; Emits `zip2` (zip vectors, secondary) on `lhs` and `rhs` through `vec_rrr`
;; with `VecALUOp.Zip2`.
(decl vec_zip2 (Reg Reg VectorSize) Reg)
(rule (vec_zip2 lhs rhs size)
      (vec_rrr (VecALUOp.Zip2) lhs rhs size))

;; Emits `trn1` (transpose vectors, primary) on `lhs` and `rhs` through
;; `vec_rrr` with `VecALUOp.Trn1`.
(decl vec_trn1 (Reg Reg VectorSize) Reg)
(rule (vec_trn1 lhs rhs size)
      (vec_rrr (VecALUOp.Trn1) lhs rhs size))

;; Emits `trn2` (transpose vectors, secondary) on `lhs` and `rhs` through
;; `vec_rrr` with `VecALUOp.Trn2`.
(decl vec_trn2 (Reg Reg VectorSize) Reg)
(rule (vec_trn2 lhs rhs size)
      (vec_rrr (VecALUOp.Trn2) lhs rhs size))
39 changes: 35 additions & 4 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1977,8 +1977,20 @@ impl MachInstEmit for Inst {
);
(0b1, 0b11111, enc_size)
}
VecMisc2::Rev16 => {
debug_assert_eq!(size, VectorSize::Size8x16);
(0b0, 0b00001, enc_size)
}
VecMisc2::Rev32 => {
debug_assert!(size == VectorSize::Size8x16 || size == VectorSize::Size16x8);
(0b1, 0b00000, enc_size)
}
VecMisc2::Rev64 => {
debug_assert_ne!(VectorSize::Size64x2, size);
debug_assert!(
size == VectorSize::Size8x16
|| size == VectorSize::Size16x8
|| size == VectorSize::Size32x4
);
(0b0, 0b00000, enc_size)
}
VecMisc2::Fcvtzs => {
Expand Down Expand Up @@ -2493,13 +2505,27 @@ impl MachInstEmit for Inst {
| machreg_to_vec(rd.to_reg()),
);
}
&Inst::VecDupFromFpu { rd, rn, size } => {
&Inst::VecDupFromFpu { rd, rn, size, lane } => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
let q = size.is_128bits() as u32;
let imm5 = match size.lane_size() {
ScalarSize::Size32 => 0b00100,
ScalarSize::Size64 => 0b01000,
ScalarSize::Size8 => {
assert!(lane < 16);
0b00001 | (u32::from(lane) << 1)
}
ScalarSize::Size16 => {
assert!(lane < 8);
0b00010 | (u32::from(lane) << 2)
}
ScalarSize::Size32 => {
assert!(lane < 4);
0b00100 | (u32::from(lane) << 3)
}
ScalarSize::Size64 => {
assert!(lane < 2);
0b01000 | (u32::from(lane) << 4)
}
_ => unimplemented!(),
};
sink.put4(
Expand Down Expand Up @@ -2870,6 +2896,7 @@ impl MachInstEmit for Inst {
VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
VecALUOp::Zip2 => (0b01001110_00_0 | enc_size << 1, 0b011110),
VecALUOp::Sqrdmulh => {
debug_assert!(
size.lane_size() == ScalarSize::Size16
Expand All @@ -2878,6 +2905,10 @@ impl MachInstEmit for Inst {

(0b001_01110_00_1 | enc_size << 1, 0b101101)
}
VecALUOp::Uzp1 => (0b01001110_00_0 | enc_size << 1, 0b000110),
VecALUOp::Uzp2 => (0b01001110_00_0 | enc_size << 1, 0b010110),
VecALUOp::Trn1 => (0b01001110_00_0 | enc_size << 1, 0b001010),
VecALUOp::Trn2 => (0b01001110_00_0 | enc_size << 1, 0b011010),
};
let top11 = if is_float {
top11 | size.enc_float_size() << 1
Expand Down
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2657,6 +2657,7 @@ fn test_aarch64_binemit() {
rd: writable_vreg(14),
rn: vreg(19),
size: VectorSize::Size32x4,
lane: 0,
},
"6E06044E",
"dup v14.4s, v19.s[0]",
Expand All @@ -2666,6 +2667,7 @@ fn test_aarch64_binemit() {
rd: writable_vreg(18),
rn: vreg(10),
size: VectorSize::Size64x2,
lane: 0,
},
"5205084E",
"dup v18.2d, v10.d[0]",
Expand Down
11 changes: 9 additions & 2 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2123,9 +2123,9 @@ impl Inst {
let rn = pretty_print_ireg(rn, size.operand_size(), allocs);
format!("dup {}, {}", rd, rn)
}
&Inst::VecDupFromFpu { rd, rn, size } => {
&Inst::VecDupFromFpu { rd, rn, size, lane } => {
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_element(rn, 0, size.lane_size(), allocs);
let rn = pretty_print_vreg_element(rn, lane.into(), size.lane_size(), allocs);
format!("dup {}, {}", rd, rn)
}
&Inst::VecDupFPImm { rd, imm, size } => {
Expand Down Expand Up @@ -2345,7 +2345,12 @@ impl Inst {
VecALUOp::Fmul => ("fmul", size),
VecALUOp::Addp => ("addp", size),
VecALUOp::Zip1 => ("zip1", size),
VecALUOp::Zip2 => ("zip2", size),
VecALUOp::Sqrdmulh => ("sqrdmulh", size),
VecALUOp::Uzp1 => ("uzp1", size),
VecALUOp::Uzp2 => ("uzp2", size),
VecALUOp::Trn1 => ("trn1", size),
VecALUOp::Trn2 => ("trn2", size),
};
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let rn = pretty_print_vreg_vector(rn, size, allocs);
Expand Down Expand Up @@ -2471,6 +2476,8 @@ impl Inst {
VecMisc2::Fabs => ("fabs", size, ""),
VecMisc2::Fneg => ("fneg", size, ""),
VecMisc2::Fsqrt => ("fsqrt", size, ""),
VecMisc2::Rev16 => ("rev16", size, ""),
VecMisc2::Rev32 => ("rev32", size, ""),
VecMisc2::Rev64 => ("rev64", size, ""),
VecMisc2::Fcvtzs => ("fcvtzs", size, ""),
VecMisc2::Fcvtzu => ("fcvtzu", size, ""),
Expand Down
114 changes: 113 additions & 1 deletion cranelift/codegen/src/isa/aarch64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,118 @@

;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; When a single element of one vector is broadcast to all the destination
;; lanes then the `dup` instruction can be used for this operation. Note that
;; for now this only matches lane selection from the first vector `a`, but
;; if necessary in the future rules can be added to select from `b` as well.
;; Broadcast of one 8-bit lane of the first operand; the second operand is
;; not used by the lowering.
(rule 6 (lower (shuffle src _ (shuffle_dup8_from_imm lane)))
      (vec_dup_from_fpu src (VectorSize.Size8x16) lane))
;; Broadcast of one 16-bit lane of the first operand.
(rule 5 (lower (shuffle src _ (shuffle_dup16_from_imm lane)))
      (vec_dup_from_fpu src (VectorSize.Size16x8) lane))
;; Broadcast of one 32-bit lane of the first operand.
(rule 4 (lower (shuffle src _ (shuffle_dup32_from_imm lane)))
      (vec_dup_from_fpu src (VectorSize.Size32x4) lane))
;; Broadcast of one 64-bit lane of the first operand.
(rule 3 (lower (shuffle src _ (shuffle_dup64_from_imm lane)))
      (vec_dup_from_fpu src (VectorSize.Size64x2) lane))

;; If the `Immediate` specified to the extractor looks like a duplication of the
;; `n`th lane of the first vector, where lanes are K bits wide, then the
;; `shuffle_dupK_from_imm` extractor returns the `n` value as a `u8` to be used
;; as part of a `vec_dup_from_fpu` instruction. Note that there's a different
;; extractor for each bit-width of lane.
;; 8-bit lanes: matches when every byte of the shuffle mask holds the same
;; value `n` and `n < 16`, i.e. every output byte selects byte lane `n` of the
;; first operand. Yields `n`.
(decl shuffle_dup8_from_imm (u8) Immediate)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add doc comments here to describe what pattern in the Immediate each of these etors matches on? (Likewise below)

(extern extractor shuffle_dup8_from_imm shuffle_dup8_from_imm)
;; 16-bit lanes: matches when the Rust helper `shuffle16_from_imm` decodes the
;; mask into eight lane indices that are all equal to one value `n` with
;; `n < 8`. Yields `n`.
;; NOTE(review): `shuffle16_from_imm` is defined outside this diff; presumably
;; it only succeeds for masks made of whole, aligned 16-bit lanes -- confirm.
(decl shuffle_dup16_from_imm (u8) Immediate)
(extern extractor shuffle_dup16_from_imm shuffle_dup16_from_imm)
;; 32-bit lanes: matches when `shuffle32_from_imm` decodes the mask into four
;; lane indices that are all equal to one value `n` with `n < 4`. Yields `n`.
(decl shuffle_dup32_from_imm (u8) Immediate)
(extern extractor shuffle_dup32_from_imm shuffle_dup32_from_imm)
;; 64-bit lanes: matches when `shuffle64_from_imm` decodes the mask into two
;; lane indices that are both equal to one value `n` with `n < 2`. Yields `n`.
(decl shuffle_dup64_from_imm (u8) Immediate)
(extern extractor shuffle_dup64_from_imm shuffle_dup64_from_imm)

;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
;; bits" (that is, by `n` bytes), that's an `ext` instruction.
;; `n` is the byte offset recovered from the mask by the extractor below; it is
;; passed unchanged to `vec_extract` together with both shuffle operands.
(rule 2 (lower (shuffle a b (vec_extract_imm4_from_immediate n)))
(vec_extract a b n))

;; Attempts to extract `n` from the specified shuffle `Immediate` where each
;; byte of the `Immediate` is a consecutive sequence starting from `n`. This
;; value of `n` is used as part of the `vec_extract` instruction which extracts
;; consecutive bytes from two vectors into one final vector, offset by `n`
;; bytes.
;;
;; The Rust implementation only matches when the first mask byte (`n`) is less
;; than 16.
(decl vec_extract_imm4_from_immediate (u8) Immediate)
(extern extractor vec_extract_imm4_from_immediate vec_extract_imm4_from_immediate)

;; Rules for the `uzp1` and `uzp2` instructions which gather even-numbered lanes
;; or odd-numbered lanes
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200)))
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it would make these patterns clearer to have an extractor something like (shuffle_immediate 30 28 26 ...) (with external Rust impl that is Fn(&mut self, imm: Immediate) -> Option<(u8, u8, u8, u8, ...)>)?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally did this in #5905 but @jameysharp preferred the hex masks instead. I don't mind myself, but I do think it's worth being consistent across the backends so I'd want to update all the x64 things if these aarch64 rules change as well.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Funny, I suggested exactly the opposite in a previous PR 😆

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting!

I see the points in #5905 now about exposing more opportunity to islec by making the full mask visible as one value; that's a reasonable argument I think. My rationale was that I was having some friction converting hex values in my head to understand the permutation (but maybe the right answer to that is just to think in hex directly). I don't feel too strongly about it, so this is fine as-is.

(vec_uzp1 a b (VectorSize.Size8x16)))
;; i8x16, odd-numbered bytes of the two operands.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1d_1b19_1715_1311_0f0d_0b09_0705_0301)))
      (vec_uzp2 lhs rhs (VectorSize.Size8x16)))
;; i16x8, even-numbered lanes.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1d1c_1918_1514_1110_0d0c_0908_0504_0100)))
      (vec_uzp1 lhs rhs (VectorSize.Size16x8)))
;; i16x8, odd-numbered lanes.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e_1b1a_1716_1312_0f0e_0b0a_0706_0302)))
      (vec_uzp2 lhs rhs (VectorSize.Size16x8)))
;; i32x4, even-numbered lanes.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1b1a1918_13121110_0b0a0908_03020100)))
      (vec_uzp1 lhs rhs (VectorSize.Size32x4)))
;; i32x4, odd-numbered lanes.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e1d1c_17161514_0f0e0d0c_07060504)))
      (vec_uzp2 lhs rhs (VectorSize.Size32x4)))
;; i64x2, lane 0 of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1716151413121110_0706050403020100)))
      (vec_uzp1 lhs rhs (VectorSize.Size64x2)))
;; i64x2, lane 1 of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
      (vec_uzp2 lhs rhs (VectorSize.Size64x2)))

;; Rules for the `zip1` and `zip2` instructions which interleave lanes in the
;; low or high halves of the two input vectors.
;; i8x16, interleave the low eight bytes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
      (vec_zip1 lhs rhs (VectorSize.Size8x16)))
;; i8x16, interleave the high eight bytes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
      (vec_zip2 lhs rhs (VectorSize.Size8x16)))
;; i16x8, interleave the low four lanes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
      (vec_zip1 lhs rhs (VectorSize.Size16x8)))
;; i16x8, interleave the high four lanes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
      (vec_zip2 lhs rhs (VectorSize.Size16x8)))
;; i32x4, interleave the low two lanes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
      (vec_zip1 lhs rhs (VectorSize.Size32x4)))
;; i32x4, interleave the high two lanes of each operand.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
      (vec_zip2 lhs rhs (VectorSize.Size32x4)))
;; Note that zip1/zip2 for i64x2 vectors is omitted since it's already covered
;; by the i64x2 cases of uzp1/uzp2 above where both zip and uzp have the same
;; semantics for 64-bit lanes.

;; Rules for the `trn1` and `trn2` instructions which interleave the
;; even-numbered (`trn1`) or odd-numbered (`trn2`) lanes of the two input
;; vectors.
;; i8x16, even-numbered bytes of each operand, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1e0e_1c0c_1a0a_1808_1606_1404_1202_1000)))
      (vec_trn1 lhs rhs (VectorSize.Size8x16)))
;; i8x16, odd-numbered bytes of each operand, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f0f_1d0d_1b0b_1909_1707_1505_1303_1101)))
      (vec_trn2 lhs rhs (VectorSize.Size8x16)))
;; i16x8, even-numbered lanes, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1d1c_0d0c_1918_0908_1514_0504_1110_0100)))
      (vec_trn1 lhs rhs (VectorSize.Size16x8)))
;; i16x8, odd-numbered lanes, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e_0f0e_1b1a_0b0a_1716_0706_1312_0302)))
      (vec_trn2 lhs rhs (VectorSize.Size16x8)))
;; i32x4, even-numbered lanes, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1b1a1918_0b0a0908_13121110_03020100)))
      (vec_trn1 lhs rhs (VectorSize.Size32x4)))
;; i32x4, odd-numbered lanes, interleaved.
(rule 1 (lower (shuffle lhs rhs (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_17161514_07060504)))
      (vec_trn2 lhs rhs (VectorSize.Size32x4)))
;; Note that trn1/trn2 for i64x2 vectors is omitted since it's already covered
;; by the i64x2 cases of uzp1/uzp2 above where both trn and uzp have the same
;; semantics for 64-bit lanes.

;; Rules for the `rev{16,32,64}` instructions where reversals happen at either
;; the byte level, the 16-bit level, or 32-bit level. Note that all of these
;; patterns only match reversals in the first operand, but they can
;; theoretically be extended if necessary to reversals in the second operand.
;; Swap the two bytes inside every 16-bit unit of the first operand.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x0e0f_0c0d_0a0b_0809_0607_0405_0203_0001)))
      (rev16 src (VectorSize.Size8x16)))
;; Reverse the four bytes inside every 32-bit unit.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x0c0d0e0f_08090a0b_04050607_00010203)))
      (rev32 src (VectorSize.Size8x16)))
;; Swap the two 16-bit halves inside every 32-bit unit.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x0d0c0f0e_09080b0a_05040706_01000302)))
      (rev32 src (VectorSize.Size16x8)))
;; Reverse the eight bytes inside every 64-bit unit.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x08090a0b0c0d0e0f_0001020304050607)))
      (rev64 src (VectorSize.Size8x16)))
;; Reverse the four 16-bit lanes inside every 64-bit unit.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x09080b0a0d0c0f0e_0100030205040706)))
      (rev64 src (VectorSize.Size16x8)))
;; Swap the two 32-bit halves inside every 64-bit unit.
(rule 1 (lower (shuffle src _ (u128_from_immediate 0x0b0a09080f0e0d0c_0302010007060504)))
      (rev64 src (VectorSize.Size32x4)))

;; General `shuffle` lowering. Unlike the specialized shuffle rules above it
;; carries no explicit priority. The 128-bit mask is materialized with
;; `constant_f128` and handed to `vec_tbl2` along with both operands.
(rule (lower (has_type ty (shuffle lhs rhs (u128_from_immediate mask))))
      (let ((indices Reg (constant_f128 mask)))
        (vec_tbl2 lhs rhs indices ty)))
Expand Down Expand Up @@ -1840,7 +1952,7 @@
(vec_dup x (vector_size ty)))

(rule -2 (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
(vec_dup_from_fpu x (vector_size ty)))
(vec_dup_from_fpu x (vector_size ty) 0))

;; Splat of an `f32const`: the value `n` produced by `u64_from_ieee32` is
;; passed to `splat_const` together with the vector size of `ty`.
(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n)))))
(splat_const n (vector_size ty)))
Expand Down
43 changes: 43 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -742,4 +742,47 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
);
}
}

/// Extractor backing the `ext` lowering of `shuffle`.
///
/// Returns `Some(n)` when the bytes of the shuffle mask `imm` form a run of
/// consecutive values `n, n + 1, n + 2, ...` whose first value `n` is below
/// 16. Returns `None` otherwise, including for an empty mask.
fn vec_extract_imm4_from_immediate(&mut self, imm: Immediate) -> Option<u8> {
    let bytes = self.lower_ctx.get_immediate_data(imm).as_slice();
    let first = *bytes.first()?;

    // `checked_add` keeps a mask byte of 0xff from overflowing `u8` (which
    // would panic in debug builds); such a byte can never be followed by its
    // successor, so the mask simply does not match.
    let consecutive = bytes
        .windows(2)
        .all(|pair| pair[0].checked_add(1) == Some(pair[1]));

    if first < 16 && consecutive {
        Some(first)
    } else {
        None
    }
}

/// Matches a shuffle mask in which every byte is the same value below 16,
/// and yields that value as the byte lane to broadcast.
fn shuffle_dup8_from_imm(&mut self, imm: Immediate) -> Option<u8> {
    let mask = self.lower_ctx.get_immediate_data(imm).as_slice();
    match mask[0] {
        lane if lane < 16 && mask.iter().all(|&byte| byte == lane) => Some(lane),
        _ => None,
    }
}
/// Matches a shuffle mask that `shuffle16_from_imm` decodes into eight
/// identical values below 8, and yields that value as the 16-bit lane to
/// broadcast. Propagates `None` when `shuffle16_from_imm` does not match.
fn shuffle_dup16_from_imm(&mut self, imm: Immediate) -> Option<u8> {
    let (lane, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?;
    let uniform = [b, c, d, e, f, g, h].iter().all(|&other| other == lane);
    if uniform && lane < 8 {
        Some(lane)
    } else {
        None
    }
}
/// Matches a shuffle mask that `shuffle32_from_imm` decodes into four
/// identical values below 4, and yields that value as the 32-bit lane to
/// broadcast. Propagates `None` when `shuffle32_from_imm` does not match.
fn shuffle_dup32_from_imm(&mut self, imm: Immediate) -> Option<u8> {
    let (lane, b, c, d) = self.shuffle32_from_imm(imm)?;
    let uniform = [b, c, d].iter().all(|&other| other == lane);
    if uniform && lane < 4 {
        Some(lane)
    } else {
        None
    }
}
/// Matches a shuffle mask that `shuffle64_from_imm` decodes into two
/// identical values below 2, and yields that value as the 64-bit lane to
/// broadcast. Propagates `None` when `shuffle64_from_imm` does not match.
fn shuffle_dup64_from_imm(&mut self, imm: Immediate) -> Option<u8> {
    match self.shuffle64_from_imm(imm)? {
        (lo, hi) if lo == hi && lo < 2 => Some(lo),
        _ => None,
    }
}
}
Loading