Skip to content

Commit d9dfc44

Browse files
authored
ISLE: port more ops on x64 to lowering patterns. (#3855)
1 parent 90a081a commit d9dfc44

6 files changed

Lines changed: 777 additions & 270 deletions

File tree

cranelift/codegen/src/isa/x64/inst.isle

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1861,6 +1861,86 @@
18611861
(rule (pandn src1 src2)
18621862
(xmm_rm_r $F64X2 (SseOpcode.Pandn) src1 src2))
18631863

1864+
;; Helper for creating `addss` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Addss` opcode; the `$F32` rule for
;; `fadd` in lower.isle calls this helper.
1865+
(decl addss (Xmm XmmMem) Xmm)
1866+
(rule (addss src1 src2)
1867+
(xmm_rm_r $F32 (SseOpcode.Addss) src1 src2))
1868+
1869+
;; Helper for creating `addsd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Addsd` opcode; the `$F64` rule for
;; `fadd` in lower.isle calls this helper.
1870+
(decl addsd (Xmm XmmMem) Xmm)
1871+
(rule (addsd src1 src2)
1872+
(xmm_rm_r $F64 (SseOpcode.Addsd) src1 src2))
1873+
1874+
;; Helper for creating `addps` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Addps` opcode; the `$F32X4` rule for
;; `fadd` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F32X4` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1875+
(decl addps (Xmm XmmMem) Xmm)
1876+
(rule (addps src1 src2)
1877+
(xmm_rm_r $F32X4 (SseOpcode.Addps) src1 src2))
1878+
1879+
;; Helper for creating `addpd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Addpd` opcode; the `$F64X2` rule for
;; `fadd` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F64X2` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1880+
(decl addpd (Xmm XmmMem) Xmm)
1881+
(rule (addpd src1 src2)
1882+
(xmm_rm_r $F64X2 (SseOpcode.Addpd) src1 src2))
1883+
1884+
;; Helper for creating `subss` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Subss` opcode; the `$F32` rule for
;; `fsub` in lower.isle calls this helper.
1885+
(decl subss (Xmm XmmMem) Xmm)
1886+
(rule (subss src1 src2)
1887+
(xmm_rm_r $F32 (SseOpcode.Subss) src1 src2))
1888+
1889+
;; Helper for creating `subsd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Subsd` opcode; the `$F64` rule for
;; `fsub` in lower.isle calls this helper.
1890+
(decl subsd (Xmm XmmMem) Xmm)
1891+
(rule (subsd src1 src2)
1892+
(xmm_rm_r $F64 (SseOpcode.Subsd) src1 src2))
1893+
1894+
;; Helper for creating `subps` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Subps` opcode; the `$F32X4` rule for
;; `fsub` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F32X4` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1895+
(decl subps (Xmm XmmMem) Xmm)
1896+
(rule (subps src1 src2)
1897+
(xmm_rm_r $F32X4 (SseOpcode.Subps) src1 src2))
1898+
1899+
;; Helper for creating `subpd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Subpd` opcode; the `$F64X2` rule for
;; `fsub` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F64X2` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1900+
(decl subpd (Xmm XmmMem) Xmm)
1901+
(rule (subpd src1 src2)
1902+
(xmm_rm_r $F64X2 (SseOpcode.Subpd) src1 src2))
1903+
1904+
;; Helper for creating `mulss` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Mulss` opcode; the `$F32` rule for
;; `fmul` in lower.isle calls this helper.
1905+
(decl mulss (Xmm XmmMem) Xmm)
1906+
(rule (mulss src1 src2)
1907+
(xmm_rm_r $F32 (SseOpcode.Mulss) src1 src2))
1908+
1909+
;; Helper for creating `mulsd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Mulsd` opcode; the `$F64` rule for
;; `fmul` in lower.isle calls this helper.
1910+
(decl mulsd (Xmm XmmMem) Xmm)
1911+
(rule (mulsd src1 src2)
1912+
(xmm_rm_r $F64 (SseOpcode.Mulsd) src1 src2))
1913+
1914+
;; Helper for creating `mulps` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Mulps` opcode; the `$F32X4` rule for
;; `fmul` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F32X4` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1915+
(decl mulps (Xmm XmmMem) Xmm)
1916+
(rule (mulps src1 src2)
1917+
(xmm_rm_r $F32X4 (SseOpcode.Mulps) src1 src2))
1918+
1919+
;; Helper for creating `mulpd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Mulpd` opcode; the `$F64X2` rule for
;; `fmul` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F64X2` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1920+
(decl mulpd (Xmm XmmMem) Xmm)
1921+
(rule (mulpd src1 src2)
1922+
(xmm_rm_r $F64X2 (SseOpcode.Mulpd) src1 src2))
1923+
1924+
;; Helper for creating `divss` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Divss` opcode; the `$F32` rule for
;; `fdiv` in lower.isle calls this helper.
1925+
(decl divss (Xmm XmmMem) Xmm)
1926+
(rule (divss src1 src2)
1927+
(xmm_rm_r $F32 (SseOpcode.Divss) src1 src2))
1928+
1929+
;; Helper for creating `divsd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Divsd` opcode; the `$F64` rule for
;; `fdiv` in lower.isle calls this helper.
1930+
(decl divsd (Xmm XmmMem) Xmm)
1931+
(rule (divsd src1 src2)
1932+
(xmm_rm_r $F64 (SseOpcode.Divsd) src1 src2))
1933+
1934+
;; Helper for creating `divps` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Divps` opcode; the `$F32X4` rule for
;; `fdiv` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F32X4` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1935+
(decl divps (Xmm XmmMem) Xmm)
1936+
(rule (divps src1 src2)
1937+
(xmm_rm_r $F32X4 (SseOpcode.Divps) src1 src2))
1938+
1939+
;; Helper for creating `divpd` instructions.
;;
;; Thin wrapper over `xmm_rm_r` with the `Divpd` opcode; the `$F64X2` rule for
;; `fdiv` in lower.isle calls this helper. The type passed to `xmm_rm_r` is the
;; vector type `$F64X2` (not scalar `$F32`), matching the type the rule lowers
;; and the way `pandn` passes a vector type.
1940+
(decl divpd (Xmm XmmMem) Xmm)
1941+
(rule (divpd src1 src2)
1942+
(xmm_rm_r $F64X2 (SseOpcode.Divpd) src1 src2))
1943+
18641944
(decl sse_blend_op (Type) SseOpcode)
18651945
(rule (sse_blend_op $F32X4) (SseOpcode.Blendvps))
18661946
(rule (sse_blend_op $F64X2) (SseOpcode.Blendvpd))
@@ -2041,6 +2121,16 @@
20412121
lane
20422122
size))
20432123

2124+
;; Helper for creating `pmaddwd` instructions.
;;
;; Allocates a fresh writable XMM temporary, emits an `MInst.XmmRmR` with the
;; `Pmaddwd` opcode reading `src1` and `src2`, and returns the temporary as
;; the result. The `$I32X4` rule for `widening_pairwise_dot_product_s` in
;; lower.isle calls this helper.
;;
;; NOTE(review): unlike the arithmetic helpers above, this builds the
;; `XmmRmR` directly instead of going through `xmm_rm_r` -- confirm whether
;; the two forms are equivalent and could be unified.
2125+
(decl pmaddwd (Xmm XmmMem) Xmm)
2126+
(rule (pmaddwd src1 src2)
2127+
(let ((dst WritableXmm (temp_writable_xmm))
2128+
(_ Unit (emit (MInst.XmmRmR (SseOpcode.Pmaddwd)
2129+
src1
2130+
src2
2131+
dst))))
2132+
dst))
2133+
20442134
;; Helper for creating `insertps` instructions.
20452135
(decl insertps (Xmm XmmMem u8) Xmm)
20462136
(rule (insertps src1 src2 lane)
@@ -2271,6 +2361,11 @@
22712361
(rule (ud2 code)
22722362
(SideEffectNoResult.Inst (MInst.Ud2 code)))
22732363

2364+
;; Helper for creating `hlt` instructions.
;;
;; Takes no operands and wraps `MInst.Hlt` in `SideEffectNoResult.Inst`, the
;; same shape the `ud2` helper uses; the `debugtrap` lowering rule passes the
;; result to `side_effect`.
2365+
(decl hlt () SideEffectNoResult)
2366+
(rule (hlt)
2367+
(SideEffectNoResult.Inst (MInst.Hlt)))
2368+
22742369
;; Helper for creating `lzcnt` instructions.
22752370
(decl lzcnt (Type Gpr) Gpr)
22762371
(rule (lzcnt ty src)

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,3 +1986,76 @@
19861986

19871987
(rule (lower (has_type (fits_in_64 ty) (breduce src)))
19881988
(value_regs_get_gpr src 0))
1989+
1990+
;; Rules for `bint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1991+
1992+
;; Booleans are stored as all-zeroes (0) or all-ones (-1). We AND out
1993+
;; the LSB to give a 0 / 1-valued integer result.
1994+
1995+
;; Result types of at most 64 bits: a single AND with the immediate 1,
;; performed at the result type `ty`.
(rule (lower (has_type (fits_in_64 ty)
1996+
(bint src)))
1997+
(x64_and ty src (RegMemImm.Imm 1)))
1998+
;; `$I128` results: the first register is the 64-bit AND with 1, and the
;; second register is a constant zero.
(rule (lower (has_type $I128
1999+
(bint src)))
2000+
(value_regs
2001+
(x64_and $I64 src (RegMemImm.Imm 1))
2002+
(imm $I64 0)))
2003+
2004+
;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2005+
2006+
;; `debugtrap` produces no value; it lowers to the `hlt` helper from
;; inst.isle, wrapped in `side_effect`.
(rule (lower (debugtrap))
2007+
(side_effect (hlt)))
2008+
2009+
;; Rules for `widening_pairwise_dot_product_s` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2010+
2011+
;; Only the `$I32X4` result type has a rule; it lowers to a single `pmaddwd`
;; of the two operands.
(rule (lower (has_type $I32X4
2012+
(widening_pairwise_dot_product_s x y)))
2013+
(pmaddwd x y))
2014+
2015+
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2016+
2017+
;; N.B.: there are no load-op merging rules here. We can't guarantee
2018+
;; the RHS (if a load) is 128-bit aligned, so we must avoid merging a
2019+
;; load. Likewise for other ops below.
;;
;; NOTE(review): the `add*` helpers in inst.isle declare their second operand
;; as `XmmMem`, and `y` is passed to them directly. Confirm that the
;; conversion applied to `y` always yields a register and can never sink a
;; load, otherwise the guarantee stated above does not hold.
;;
;; One rule per supported result type, each dispatching to the matching
;; `add*` helper.
2020+
2021+
(rule (lower (has_type $F32 (fadd x y)))
2022+
(addss x y))
2023+
(rule (lower (has_type $F64 (fadd x y)))
2024+
(addsd x y))
2025+
(rule (lower (has_type $F32X4 (fadd x y)))
2026+
(addps x y))
2027+
(rule (lower (has_type $F64X2 (fadd x y)))
2028+
(addpd x y))
2029+
2030+
;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2031+
2032+
;; One rule per supported result type (`$F32`, `$F64`, `$F32X4`, `$F64X2`),
;; each dispatching to the matching `sub*` helper from inst.isle.
(rule (lower (has_type $F32 (fsub x y)))
2033+
(subss x y))
2034+
(rule (lower (has_type $F64 (fsub x y)))
2035+
(subsd x y))
2036+
(rule (lower (has_type $F32X4 (fsub x y)))
2037+
(subps x y))
2038+
(rule (lower (has_type $F64X2 (fsub x y)))
2039+
(subpd x y))
2040+
2041+
;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2042+
2043+
;; One rule per supported result type (`$F32`, `$F64`, `$F32X4`, `$F64X2`),
;; each dispatching to the matching `mul*` helper from inst.isle.
(rule (lower (has_type $F32 (fmul x y)))
2044+
(mulss x y))
2045+
(rule (lower (has_type $F64 (fmul x y)))
2046+
(mulsd x y))
2047+
(rule (lower (has_type $F32X4 (fmul x y)))
2048+
(mulps x y))
2049+
(rule (lower (has_type $F64X2 (fmul x y)))
2050+
(mulpd x y))
2051+
2052+
;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2053+
2054+
;; One rule per supported result type (`$F32`, `$F64`, `$F32X4`, `$F64X2`),
;; each dispatching to the matching `div*` helper from inst.isle.
(rule (lower (has_type $F32 (fdiv x y)))
2055+
(divss x y))
2056+
(rule (lower (has_type $F64 (fdiv x y)))
2057+
(divsd x y))
2058+
(rule (lower (has_type $F32X4 (fdiv x y)))
2059+
(divps x y))
2060+
(rule (lower (has_type $F64X2 (fdiv x y)))
2061+
(divpd x y))

cranelift/codegen/src/isa/x64/lower.rs

Lines changed: 8 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -903,33 +903,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
903903
| Opcode::Sextend
904904
| Opcode::Breduce
905905
| Opcode::Bextend
906-
| Opcode::Ireduce => implemented_in_isle(ctx),
907-
908-
Opcode::Bint => {
909-
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
910-
// out the LSB to give a 0 / 1-valued integer result.
911-
let rn = put_input_in_reg(ctx, inputs[0]);
912-
let rd = get_output_reg(ctx, outputs[0]);
913-
let ty = ctx.output_ty(insn, 0);
914-
915-
ctx.emit(Inst::gen_move(rd.regs()[0], rn, types::I64));
916-
ctx.emit(Inst::alu_rmi_r(
917-
OperandSize::Size64,
918-
AluRmiROpcode::And,
919-
RegMemImm::imm(1),
920-
rd.regs()[0],
921-
));
922-
923-
if ty == types::I128 {
924-
let upper = rd.regs()[1];
925-
ctx.emit(Inst::alu_rmi_r(
926-
OperandSize::Size64,
927-
AluRmiROpcode::Xor,
928-
RegMemImm::reg(upper.to_reg()),
929-
upper,
930-
));
931-
}
932-
}
906+
| Opcode::Ireduce
907+
| Opcode::Bint
908+
| Opcode::Debugtrap
909+
| Opcode::WideningPairwiseDotProductS
910+
| Opcode::Fadd
911+
| Opcode::Fsub
912+
| Opcode::Fmul
913+
| Opcode::Fdiv => implemented_in_isle(ctx),
933914

934915
Opcode::Icmp => {
935916
let condcode = ctx.data(insn).cond_code().unwrap();
@@ -1240,10 +1221,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
12401221
abi.emit_stack_post_adjust(ctx);
12411222
}
12421223

1243-
Opcode::Debugtrap => {
1244-
ctx.emit(Inst::Hlt);
1245-
}
1246-
12471224
Opcode::Trapif | Opcode::Trapff => {
12481225
let trap_code = ctx.data(insn).trap_code().unwrap();
12491226

@@ -1301,77 +1278,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
13011278
};
13021279
}
13031280

1304-
Opcode::WideningPairwiseDotProductS => {
1305-
let lhs = put_input_in_reg(ctx, inputs[0]);
1306-
let rhs = input_to_reg_mem(ctx, inputs[1]);
1307-
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1308-
let ty = ty.unwrap();
1309-
1310-
ctx.emit(Inst::gen_move(dst, lhs, ty));
1311-
1312-
if ty == types::I32X4 {
1313-
ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddwd, rhs, dst));
1314-
} else {
1315-
panic!(
1316-
"Opcode::WideningPairwiseDotProductS: unsupported laneage: {:?}",
1317-
ty
1318-
);
1319-
}
1320-
}
1321-
1322-
Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv => {
1323-
let lhs = put_input_in_reg(ctx, inputs[0]);
1324-
// We can't guarantee the RHS (if a load) is 128-bit aligned, so we
1325-
// must avoid merging a load here.
1326-
let rhs = RegMem::reg(put_input_in_reg(ctx, inputs[1]));
1327-
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
1328-
let ty = ty.unwrap();
1329-
1330-
// Move the `lhs` to the same register as `dst`; this may not emit an actual move
1331-
// but ensures that the registers are the same to match x86's read-write operand
1332-
// encoding.
1333-
ctx.emit(Inst::gen_move(dst, lhs, ty));
1334-
1335-
// Note: min and max can't be handled here, because of the way Cranelift defines them:
1336-
// if any operand is a NaN, they must return the NaN operand, while the x86 machine
1337-
// instruction will return the second operand if either operand is a NaN.
1338-
let sse_op = match ty {
1339-
types::F32 => match op {
1340-
Opcode::Fadd => SseOpcode::Addss,
1341-
Opcode::Fsub => SseOpcode::Subss,
1342-
Opcode::Fmul => SseOpcode::Mulss,
1343-
Opcode::Fdiv => SseOpcode::Divss,
1344-
_ => unreachable!(),
1345-
},
1346-
types::F64 => match op {
1347-
Opcode::Fadd => SseOpcode::Addsd,
1348-
Opcode::Fsub => SseOpcode::Subsd,
1349-
Opcode::Fmul => SseOpcode::Mulsd,
1350-
Opcode::Fdiv => SseOpcode::Divsd,
1351-
_ => unreachable!(),
1352-
},
1353-
types::F32X4 => match op {
1354-
Opcode::Fadd => SseOpcode::Addps,
1355-
Opcode::Fsub => SseOpcode::Subps,
1356-
Opcode::Fmul => SseOpcode::Mulps,
1357-
Opcode::Fdiv => SseOpcode::Divps,
1358-
_ => unreachable!(),
1359-
},
1360-
types::F64X2 => match op {
1361-
Opcode::Fadd => SseOpcode::Addpd,
1362-
Opcode::Fsub => SseOpcode::Subpd,
1363-
Opcode::Fmul => SseOpcode::Mulpd,
1364-
Opcode::Fdiv => SseOpcode::Divpd,
1365-
_ => unreachable!(),
1366-
},
1367-
_ => panic!(
1368-
"invalid type: expected one of [F32, F64, F32X4, F64X2], found {}",
1369-
ty
1370-
),
1371-
};
1372-
ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
1373-
}
1374-
13751281
Opcode::Fmin | Opcode::Fmax => {
13761282
let lhs = put_input_in_reg(ctx, inputs[0]);
13771283
let rhs = put_input_in_reg(ctx, inputs[1]);
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
src/clif.isle 9ea75a6f790b5c03
22
src/prelude.isle b2bc986bcbbbb77
3-
src/isa/x64/inst.isle 67eb719e568c2a81
4-
src/isa/x64/lower.isle 2d06b233fb3a1e1c
3+
src/isa/x64/inst.isle 9a8a3babd8257100
4+
src/isa/x64/lower.isle f0f4af691241209e

0 commit comments

Comments
 (0)