Skip to content

Commit 71ead6e

Browse files
committed
x64 backend: implement 128-bit ops and misc fixes.
This implements all of the ops on I128 that are implemented by the legacy x86 backend, and includes all that are required by at least one major use-case (cg_clif rustc backend). The sequences are open-coded where necessary; for the bit operations, for example, this can be somewhat complex, but these sequences have been tested carefully. This PR also includes a drive-by fix of clz/ctz for 8- and 16-bit cases where they were incorrect previously. Also includes ridealong fixes developed while bringing up cg_clif support, because they are difficult to completely separate due to other refactors that occurred in this PR: - fix REX prefix logic for some 8-bit instructions. When using an 8-bit register in 64-bit mode on x86-64, the REX prefix semantics are somewhat subtle: without the REX prefix, register numbers 4--7 correspond to the second-to-lowest byte of the first four registers (AH, CH, DH, BH), whereas with the REX prefix, these register numbers correspond to the usual encoding (SPL, BPL, SIL, DIL). We could always emit a REX byte for instructions with 8-bit cases (this is harmless even if unneeded), but this would unnecessarily inflate code size; instead, the usual approach is to emit it only for these registers. This logic was present in some cases but missing for some other instructions: divide, not, negate, shifts. Fixes #2508. - avoid unaligned SSE loads on some f64 ops. The implementations of several FP ops, such as fabs/fneg, used SSE instructions. This is not a problem per se, except that load-op merging did not take *alignment* into account. Specifically, if an op on an f64 loaded from memory happened to merge that load, and the instruction into which it was merged was an SSE instruction, then the SSE instruction imposes stricter (128-bit) alignment requirements than the load.f64 did. This PR simply forces any instruction lowerings that could use SSE instructions to implement non-SIMD operations to take inputs in registers only, and avoid load-op merging. 
Fixes #2507. - two bugfixes exposed by cg_clif: urem/srem.i8, select.b1. - urem/srem.i8: the 8-bit form of the DIV instruction on x86-64 places the remainder in AH, not RDX, unlike all the other width forms of this instruction. - select.b1: we were not recognizing selects of boolean values as integer-typed operations, so we were generating XMM moves instead (!).
1 parent 705af0a commit 71ead6e

12 files changed

Lines changed: 3198 additions & 660 deletions

File tree

cranelift/codegen/src/isa/x64/abi.rs

Lines changed: 49 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -138,42 +138,62 @@ impl ABIMachineSpec for X64ABIMachineSpec {
138138
),
139139
}
140140

141-
let intreg = in_int_reg(param.value_type);
142-
let vecreg = in_vec_reg(param.value_type);
143-
debug_assert!(intreg || vecreg);
144-
debug_assert!(!(intreg && vecreg));
145-
146-
let (next_reg, candidate) = if intreg {
147-
let candidate = match args_or_rets {
148-
ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr),
149-
ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i),
150-
};
151-
debug_assert!(candidate
152-
.map(|r| r.get_class() == RegClass::I64)
153-
.unwrap_or(true));
154-
(&mut next_gpr, candidate)
155-
} else {
156-
let candidate = match args_or_rets {
157-
ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg),
158-
ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i),
159-
};
160-
debug_assert!(candidate
161-
.map(|r| r.get_class() == RegClass::V128)
162-
.unwrap_or(true));
163-
(&mut next_vreg, candidate)
164-
};
165-
166141
if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
167-
assert!(intreg);
168142
ret.push(param);
169-
} else if let Some(reg) = candidate {
143+
continue;
144+
}
145+
146+
// Find regclass(es) of the register(s) used to store a value of this type.
147+
let (rcs, _) = Inst::rc_for_type(param.value_type)?;
148+
let intreg = rcs[0] == RegClass::I64;
149+
let num_regs = rcs.len();
150+
assert!(num_regs <= 2);
151+
if num_regs == 2 {
152+
assert_eq!(rcs[0], rcs[1]);
153+
}
154+
155+
let mut regs: SmallVec<[RealReg; 2]> = smallvec![];
156+
for j in 0..num_regs {
157+
let nextreg = if intreg {
158+
match args_or_rets {
159+
ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr + j),
160+
ArgsOrRets::Rets => {
161+
get_intreg_for_retval_systemv(&call_conv, next_gpr + j, i + j)
162+
}
163+
}
164+
} else {
165+
match args_or_rets {
166+
ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg + j),
167+
ArgsOrRets::Rets => {
168+
get_fltreg_for_retval_systemv(&call_conv, next_vreg + j, i + j)
169+
}
170+
}
171+
};
172+
if let Some(reg) = nextreg {
173+
regs.push(reg.to_real_reg());
174+
} else {
175+
regs.clear();
176+
break;
177+
}
178+
}
179+
180+
if regs.len() > 0 {
181+
let regs = match num_regs {
182+
1 => ValueRegs::one(regs[0]),
183+
2 => ValueRegs::two(regs[0], regs[1]),
184+
_ => panic!("More than two registers unexpected"),
185+
};
170186
ret.push(ABIArg::Reg(
171-
ValueRegs::one(reg.to_real_reg()),
187+
regs,
172188
param.value_type,
173189
param.extension,
174190
param.purpose,
175191
));
176-
*next_reg += 1;
192+
if intreg {
193+
next_gpr += num_regs;
194+
} else {
195+
next_vreg += num_regs;
196+
}
177197
} else {
178198
// Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
179199
// stack alignment happens separately after all args.)
@@ -658,31 +678,6 @@ impl From<StackAMode> for SyntheticAmode {
658678
}
659679
}
660680

661-
fn in_int_reg(ty: types::Type) -> bool {
662-
match ty {
663-
types::I8
664-
| types::I16
665-
| types::I32
666-
| types::I64
667-
| types::B1
668-
| types::B8
669-
| types::B16
670-
| types::B32
671-
| types::B64
672-
| types::R64 => true,
673-
types::R32 => panic!("unexpected 32-bits refs on x64!"),
674-
_ => false,
675-
}
676-
}
677-
678-
fn in_vec_reg(ty: types::Type) -> bool {
679-
match ty {
680-
types::F32 | types::F64 => true,
681-
_ if ty.is_vector() => true,
682-
_ => false,
683-
}
684-
}
685-
686681
fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
687682
match call_conv {
688683
CallConv::Fast

cranelift/codegen/src/isa/x64/inst/args.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,23 +346,35 @@ impl PrettyPrintSized for RegMem {
346346
#[derive(Copy, Clone, PartialEq)]
347347
pub enum AluRmiROpcode {
348348
Add,
349+
Adc,
349350
Sub,
351+
Sbb,
350352
And,
351353
Or,
352354
Xor,
353355
/// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
354356
Mul,
357+
/// 8-bit form of And. Handled separately as we don't have full 8-bit op
358+
/// support (we just use wider instructions). Used only with some sequences
359+
/// with SETcc.
360+
And8,
361+
/// 8-bit form of Or.
362+
Or8,
355363
}
356364

357365
impl fmt::Debug for AluRmiROpcode {
358366
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
359367
let name = match self {
360368
AluRmiROpcode::Add => "add",
369+
AluRmiROpcode::Adc => "adc",
361370
AluRmiROpcode::Sub => "sub",
371+
AluRmiROpcode::Sbb => "sbb",
362372
AluRmiROpcode::And => "and",
363373
AluRmiROpcode::Or => "or",
364374
AluRmiROpcode::Xor => "xor",
365375
AluRmiROpcode::Mul => "imul",
376+
AluRmiROpcode::And8 => "and",
377+
AluRmiROpcode::Or8 => "or",
366378
};
367379
write!(fmt, "{}", name)
368380
}
@@ -374,6 +386,16 @@ impl fmt::Display for AluRmiROpcode {
374386
}
375387
}
376388

389+
impl AluRmiROpcode {
390+
/// Is this a special-cased 8-bit ALU op?
391+
pub fn is_8bit(self) -> bool {
392+
match self {
393+
AluRmiROpcode::And8 | AluRmiROpcode::Or8 => true,
394+
_ => false,
395+
}
396+
}
397+
}
398+
377399
#[derive(Clone, PartialEq)]
378400
pub enum UnaryRmROpcode {
379401
/// Bit-scan reverse.
@@ -1010,7 +1032,7 @@ impl fmt::Display for ExtMode {
10101032
}
10111033

10121034
/// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right.
1013-
#[derive(Clone)]
1035+
#[derive(Clone, Copy)]
10141036
pub enum ShiftKind {
10151037
ShiftLeft,
10161038
/// Inserts zeros in the most significant bits.

0 commit comments

Comments
 (0)