Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -852,7 +852,7 @@
(rd WritableReg)
;; Offset in range -2^20 .. 2^20.
(off i32))

;; Compute the address (using a PC-relative offset) of a 4KB page.
(Adrp
(rd WritableReg)
Expand Down Expand Up @@ -1401,6 +1401,8 @@
(Bsl)
;; Floating-point fused multiply-add vectors
(Fmla)
;; Floating-point fused multiply-subtract vectors
(Fmls)
))

;; A Vector miscellaneous operation with two registers.
Expand Down
3 changes: 3 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2906,6 +2906,9 @@ impl MachInstEmit for Inst {
VecALUModOp::Fmla => {
(0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
}
VecALUModOp::Fmls => {
(0b000_01110_10_1 | (size.enc_float_size() << 1), 0b110011)
}
};
sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
}
Expand Down
1 change: 1 addition & 0 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2363,6 +2363,7 @@ impl Inst {
let (op, size) = match alu_op {
VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
VecALUModOp::Fmla => ("fmla", size),
VecALUModOp::Fmls => ("fmls", size),
};
let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
let ri = pretty_print_vreg_vector(ri, size, allocs);
Expand Down
8 changes: 7 additions & 1 deletion cranelift/codegen/src/isa/aarch64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,13 @@
(rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
(vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))

(rule 1 (lower (has_type (ty_scalar_float ty) (fma x y z)))
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fma (fneg x) y z)))
(vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))

(rule 2 (lower (has_type ty @ (multi_lane _ _) (fma x (fneg y) z)))
(vec_rrr_mod (VecALUModOp.Fmls) z x y (vector_size ty)))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose that if both x and y are `fneg` results, this could emit `fmla` instead of `fneg`+`fmls`, right? But I guess that is a rewrite we ought to do in the egraph optimizations instead.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed! The x64 rules actually end up implementing that (their structure allows switching back and forth between the two forms), but it wasn't as obvious how to do the same here — x64 uses a helper that also manages sinking a load, which adds a fair number of rule permutations.

I'll send a follow-up which implements the egraph optimization.


(rule 3 (lower (has_type (ty_scalar_float ty) (fma x y z)))
(fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))

;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Expand Down
159 changes: 159 additions & 0 deletions cranelift/filetests/filetests/isa/aarch64/fma.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
test compile precise-output
target aarch64

function %fma_f32(f32, f32, f32) -> f32 {
block0(v0: f32, v1: f32, v2: f32):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; block0:
; fmadd s0, s0, s1, s2
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmadd s0, s0, s1, s2
; ret

function %fma_f64(f64, f64, f64) -> f64 {
block0(v0: f64, v1: f64, v2: f64):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; block0:
; fmadd d0, d0, d1, d2
; ret
;
; Disassembled:
; block0: ; offset 0x0
; fmadd d0, d0, d1, d2
; ret

function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v0.4s, v5.4s, v1.4s
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.4s, v5.4s, v1.4s
; ret

function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = fma v0, v1, v2
return v3
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.2d, v0.2d, v5.2d, v1.2d
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmla v0.2d, v5.2d, v1.2d
; ret

function %fma_neg_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = fneg v0
v4 = fma v3, v1, v2
return v4
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v0.4s, v5.4s, v1.4s
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v5.4s, v1.4s
; ret

function %fma_neg_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = fneg v0
v4 = fma v3, v1, v2
return v4
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v0.2d, v5.2d, v1.2d
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v5.2d, v1.2d
; ret

function %fma_neg_other_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = fneg v1
v4 = fma v0, v3, v2
return v4
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v0.4s, v5.4s, v1.4s
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.4s, v5.4s, v1.4s
; ret

function %fma_neg_other_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = fneg v1
v4 = fma v0, v3, v2
return v4
}

; VCode:
; block0:
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v0.2d, v5.2d, v1.2d
; ret
;
; Disassembled:
; block0: ; offset 0x0
; mov v5.16b, v0.16b
; mov v0.16b, v2.16b
; fmls v0.2d, v5.2d, v1.2d
; ret