Skip to content

Commit f766d21

Browse files
elliotttjameysharp
andcommitted
Reuse the epilogue generation functions for tail call emission
Instead of building and copying the new frame over the old one, make use of the frame shrink/grow pseudo-instructions to move the frame, and then reuse the existing epilogue generation functions to setup the tail call. Co-authored-by: Jamey Sharp <jsharp@fastly.com>
1 parent fe39506 commit f766d21

11 files changed

Lines changed: 258 additions & 456 deletions

File tree

cranelift/codegen/src/isa/aarch64/lower/isle.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
100100
self.lower_ctx.sigs(),
101101
callee_sig,
102102
&callee,
103+
// TODO: this should be Opcode::ReturnCall, once aarch64 has been ported to the new
104+
// tail call strategy.
105+
Opcode::Call,
103106
distance,
104107
caller_conv,
105108
self.backend.flags().clone(),

cranelift/codegen/src/isa/riscv64/lower/isle.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use crate::machinst::{VCodeConstant, VCodeConstantData};
1818
use crate::{
1919
ir::{
2020
immediates::*, types::*, AtomicRmwOp, BlockCall, ExternalName, Inst, InstructionData,
21-
MemFlags, StackSlot, TrapCode, Value, ValueList,
21+
MemFlags, Opcode, StackSlot, TrapCode, Value, ValueList,
2222
},
2323
isa::riscv64::inst::*,
2424
machinst::{ArgPair, InstOutput},
@@ -82,6 +82,9 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
8282
self.lower_ctx.sigs(),
8383
callee_sig,
8484
&callee,
85+
// TODO: this should be Opcode::ReturnCall, once riscv64 has been ported to the new
86+
// tail call strategy.
87+
Opcode::Call,
8588
distance,
8689
caller_conv,
8790
self.backend.flags().clone(),

cranelift/codegen/src/isa/x64/abi.rs

Lines changed: 7 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -926,8 +926,9 @@ impl ABIMachineSpec for X64ABIMachineSpec {
926926

927927
impl X64CallSite {
928928
pub fn emit_return_call(mut self, ctx: &mut Lower<Inst>, args: isle::ValueSlice) {
929-
let (new_stack_arg_size, old_stack_arg_size) =
930-
self.emit_temporary_tail_call_frame(ctx, args);
929+
let new_stack_arg_size =
930+
u32::try_from(self.sig(ctx.sigs()).sized_stack_arg_space()).unwrap();
931+
let old_stack_arg_size = ctx.abi().stack_args_size(ctx.sigs());
931932

932933
match new_stack_arg_size.cmp(&old_stack_arg_size) {
933934
core::cmp::Ordering::Equal => {}
@@ -947,43 +948,15 @@ impl X64CallSite {
947948
}
948949
}
949950

950-
// Make a copy of the frame pointer, since we use it when copying down
951-
// the new stack frame.
952-
let fp = ctx.temp_writable_gpr();
953-
let rbp = PReg::from(regs::rbp().to_real_reg().unwrap());
954-
ctx.emit(Inst::MovFromPReg { src: rbp, dst: fp });
955-
956-
// Load the return address, because copying our new stack frame
957-
// over our current stack frame might overwrite it, and we'll need to
958-
// place it in the correct location after we do that copy.
959-
//
960-
// But we only need to actually move the return address if the size of
961-
// stack arguments changes.
962-
let ret_addr = if new_stack_arg_size != old_stack_arg_size {
963-
let ret_addr = ctx.temp_writable_gpr();
964-
ctx.emit(Inst::Mov64MR {
965-
src: SyntheticAmode::Real(Amode::ImmReg {
966-
simm32: 8,
967-
base: *fp.to_reg(),
968-
flags: MemFlags::trusted(),
969-
}),
970-
dst: ret_addr,
971-
});
972-
Some(ret_addr.to_reg())
973-
} else {
974-
None
975-
};
951+
// Put all arguments in registers and stack slots (within that newly
952+
// allocated stack space).
953+
self.emit_args(ctx, args);
954+
self.emit_stack_ret_arg_for_tail_call(ctx);
976955

977956
// Finally, emit the macro instruction to copy the new stack frame over
978957
// our current one and do the actual tail call!
979-
980958
let dest = self.dest().clone();
981959
let info = Box::new(ReturnCallInfo {
982-
new_stack_arg_size,
983-
old_stack_arg_size,
984-
ret_addr,
985-
fp: fp.to_reg(),
986-
tmp: ctx.temp_writable_gpr(),
987960
uses: self.take_uses(),
988961
});
989962
match dest {

cranelift/codegen/src/isa/x64/inst/emit.rs

Lines changed: 31 additions & 149 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::ir;
22
use crate::ir::immediates::{Ieee32, Ieee64};
3-
use crate::ir::{KnownSymbol, MemFlags};
3+
use crate::ir::KnownSymbol;
44
use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength, RegisterOrAmode};
55
use crate::isa::x64::encoding::rex::{
66
emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
@@ -1628,18 +1628,7 @@ pub(crate) fn emit(
16281628
callee,
16291629
info: call_info,
16301630
} => {
1631-
emit_return_call_common_sequence(
1632-
allocs,
1633-
sink,
1634-
info,
1635-
state,
1636-
call_info.new_stack_arg_size,
1637-
call_info.old_stack_arg_size,
1638-
call_info.ret_addr,
1639-
call_info.fp,
1640-
call_info.tmp,
1641-
&call_info.uses,
1642-
);
1631+
emit_return_call_common_sequence(allocs, sink, info, state, &call_info.uses);
16431632

16441633
// Finally, jump to the callee!
16451634
//
@@ -1660,18 +1649,7 @@ pub(crate) fn emit(
16601649
} => {
16611650
let callee = callee.with_allocs(allocs);
16621651

1663-
emit_return_call_common_sequence(
1664-
allocs,
1665-
sink,
1666-
info,
1667-
state,
1668-
call_info.new_stack_arg_size,
1669-
call_info.old_stack_arg_size,
1670-
call_info.ret_addr,
1671-
call_info.fp,
1672-
call_info.tmp,
1673-
&call_info.uses,
1674-
);
1652+
emit_return_call_common_sequence(allocs, sink, info, state, &call_info.uses);
16751653

16761654
Inst::JmpUnknown { target: callee }.emit(&[], sink, info, state);
16771655
sink.add_call_site(ir::Opcode::ReturnCallIndirect);
@@ -1737,14 +1715,21 @@ pub(crate) fn emit(
17371715
// TODO: this needs to be accounted for by the stack check, before `GrowFrame` is
17381716
// emitted, so that the increment here is expected.
17391717
//
1740-
// Decrement SP by `amount`
1718+
// Decrement SP and FP by `amount`
17411719
Inst::alu_rmi_r(
17421720
OperandSize::Size64,
17431721
AluRmiROpcode::Sub,
17441722
RegMemImm::imm(*amount),
17451723
Writable::from_reg(regs::rsp()),
17461724
)
17471725
.emit(&[], sink, info, state);
1726+
Inst::alu_rmi_r(
1727+
OperandSize::Size64,
1728+
AluRmiROpcode::Sub,
1729+
RegMemImm::imm(*amount),
1730+
Writable::from_reg(regs::rbp()),
1731+
)
1732+
.emit(&[], sink, info, state);
17481733

17491734
// The total size that we're going to copy, including the return address and frame
17501735
// pointer that are pushed on the stack alreadcy.
@@ -1817,6 +1802,15 @@ pub(crate) fn emit(
18171802
Writable::from_reg(regs::rsp()),
18181803
)
18191804
.emit(&[], sink, info, state);
1805+
1806+
// Increment FP by `amount`
1807+
Inst::alu_rmi_r(
1808+
OperandSize::Size64,
1809+
AluRmiROpcode::Add,
1810+
RegMemImm::imm(*amount),
1811+
Writable::from_reg(regs::rbp()),
1812+
)
1813+
.emit(&[], sink, info, state);
18201814
}
18211815

18221816
Inst::Args { .. } => {}
@@ -4349,11 +4343,6 @@ fn emit_return_call_common_sequence(
43494343
sink: &mut MachBuffer<Inst>,
43504344
info: &EmitInfo,
43514345
state: &mut EmitState,
4352-
new_stack_arg_size: u32,
4353-
old_stack_arg_size: u32,
4354-
ret_addr: Option<Gpr>,
4355-
fp: Gpr,
4356-
tmp: WritableGpr,
43574346
uses: &CallArgList,
43584347
) {
43594348
assert!(
@@ -4366,125 +4355,18 @@ fn emit_return_call_common_sequence(
43664355
let _ = allocs.next(u.vreg);
43674356
}
43684357

4369-
let ret_addr = ret_addr.map(|r| Gpr::new(allocs.next(*r)).unwrap());
4370-
4371-
let fp = allocs.next(*fp);
4372-
4373-
let tmp = allocs.next(tmp.to_reg().to_reg());
4374-
let tmp = Gpr::new(tmp).unwrap();
4375-
let tmp_w = WritableGpr::from_reg(tmp);
4376-
4377-
// Copy the new frame (which is `frame_size` bytes above the SP)
4378-
// onto our current frame, using only volatile, non-argument
4379-
// registers.
4380-
//
4381-
//
4382-
// The current stack layout is the following:
4383-
//
4384-
// | ... |
4385-
// +---------------------+
4386-
// | ... |
4387-
// | stack arguments |
4388-
// | ... |
4389-
// current | return address |
4390-
// frame | old FP | <-- FP
4391-
// | callee saves |
4392-
// | ... |
4393-
// | old stack slots |
4394-
// | ... |
4395-
// +---------------------+
4396-
// | ... |
4397-
// new | new stack arguments |
4398-
// frame | ... | <-- SP
4399-
// +---------------------+
4400-
//
4401-
// We need to restore the old FP, copy the new stack arguments over the old
4402-
// stack arguments, write the return address into the correct slot just
4403-
// after the new stack arguments, adjust SP to point to the new return
4404-
// address, and then jump to the callee (which will push the old FP again).
4405-
4406-
// Restore the old FP into `rbp`.
4407-
Inst::Mov64MR {
4408-
src: SyntheticAmode::Real(Amode::ImmReg {
4409-
simm32: 0,
4410-
base: fp,
4411-
flags: MemFlags::trusted(),
4412-
}),
4413-
dst: Writable::from_reg(Gpr::new(regs::rbp()).unwrap()),
4358+
for inst in
4359+
X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
4360+
{
4361+
inst.emit(&[], sink, info, state);
44144362
}
4415-
.emit(&[], sink, info, state);
44164363

4417-
// The new lowest address (top of stack) -- relative to FP -- for
4418-
// our tail callee. We compute this now so that we can move our
4419-
// stack arguments into place.
4420-
let callee_sp_relative_to_fp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size);
4421-
4422-
// Copy over each word, using `tmp` as a temporary register.
4423-
//
4424-
// Note that we have to do this from stack slots with the highest
4425-
// address to lowest address because in the case of when the tail
4426-
// callee has more stack arguments than we do, we might otherwise
4427-
// overwrite some of our stack arguments before they've been copied
4428-
// into place.
4429-
assert_eq!(
4430-
new_stack_arg_size % 8,
4431-
0,
4432-
"stack argument space sizes should always be 8-byte aligned"
4433-
);
4434-
for i in (0..new_stack_arg_size / 8).rev() {
4435-
Inst::Mov64MR {
4436-
src: SyntheticAmode::Real(Amode::ImmReg {
4437-
simm32: (i * 8).try_into().unwrap(),
4438-
base: regs::rsp(),
4439-
flags: MemFlags::trusted(),
4440-
}),
4441-
dst: tmp_w,
4442-
}
4443-
.emit(&[], sink, info, state);
4444-
Inst::MovRM {
4445-
size: OperandSize::Size64,
4446-
src: tmp,
4447-
dst: SyntheticAmode::Real(Amode::ImmReg {
4448-
// Add 2 because we need to skip over the old FP and the
4449-
// return address.
4450-
simm32: (callee_sp_relative_to_fp + i64::from((i + 2) * 8))
4451-
.try_into()
4452-
.unwrap(),
4453-
base: fp,
4454-
flags: MemFlags::trusted(),
4455-
}),
4456-
}
4457-
.emit(&[], sink, info, state);
4458-
}
4459-
4460-
// Initialize SP for the tail callee, deallocating the temporary
4461-
// stack arguments space at the same time.
4462-
Inst::LoadEffectiveAddress {
4463-
size: OperandSize::Size64,
4464-
addr: SyntheticAmode::Real(Amode::ImmReg {
4465-
// NB: We add a word to `callee_sp_relative_to_fp` here because the
4466-
// callee will push FP, not us.
4467-
simm32: callee_sp_relative_to_fp.wrapping_add(8).try_into().unwrap(),
4468-
base: fp,
4469-
flags: MemFlags::trusted(),
4470-
}),
4471-
dst: Writable::from_reg(Gpr::new(regs::rsp()).unwrap()),
4472-
}
4473-
.emit(&[], sink, info, state);
4474-
4475-
state.adjust_virtual_sp_offset(-i64::from(new_stack_arg_size));
4476-
4477-
// Write the return address into the correct stack slot.
4478-
if let Some(ret_addr) = ret_addr {
4479-
Inst::MovRM {
4480-
size: OperandSize::Size64,
4481-
src: ret_addr,
4482-
dst: SyntheticAmode::Real(Amode::ImmReg {
4483-
simm32: 0,
4484-
base: regs::rsp(),
4485-
flags: MemFlags::trusted(),
4486-
}),
4487-
}
4488-
.emit(&[], sink, info, state);
4364+
for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
4365+
CallConv::Tail,
4366+
&info.flags,
4367+
&info.isa_flags,
4368+
state.frame_layout(),
4369+
) {
4370+
inst.emit(&[], sink, info, state);
44894371
}
44904372
}

cranelift/codegen/src/isa/x64/inst/emit_state.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ pub struct EmitState {
1414
/// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
1515
/// optimized away at compiletime. See [cranelift_control].
1616
ctrl_plane: ControlPlane,
17+
18+
/// A copy of the frame layout, used during the emission of `Inst::ReturnCallKnown` and
19+
/// `Inst::ReturnCallUnknown` instructions.
20+
frame_layout: FrameLayout,
1721
}
1822

1923
impl MachInstEmitState<Inst> for EmitState {
@@ -23,6 +27,7 @@ impl MachInstEmitState<Inst> for EmitState {
2327
nominal_sp_to_fp: abi.frame_size() as i64,
2428
stack_map: None,
2529
ctrl_plane,
30+
frame_layout: abi.frame_layout().clone(),
2631
}
2732
}
2833

@@ -62,4 +67,8 @@ impl EmitState {
6267
pub(crate) fn nominal_sp_to_fp(&self) -> i64 {
6368
self.nominal_sp_to_fp
6469
}
70+
71+
pub(crate) fn frame_layout(&self) -> &FrameLayout {
72+
&self.frame_layout
73+
}
6574
}

0 commit comments

Comments
 (0)