From c439aadb7f0c6235acfdbdd52378cb78eb706342 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 11 Aug 2022 13:55:43 -0700 Subject: [PATCH 1/6] Add tests for existing behavior --- cranelift/codegen/src/isa/x64/inst/mod.rs | 14 +- .../filetests/filetests/isa/x64/fcvt.clif | 271 ++++++++++++++++++ 2 files changed, 279 insertions(+), 6 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 5a51424a43ca..73aa8165203b 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -1257,7 +1257,7 @@ impl PrettyPrint for Inst { dst_size, tmp_xmm, tmp_gpr, - .. + is_saturating, } => { let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs); @@ -1266,9 +1266,10 @@ impl PrettyPrint for Inst { format!( "{} {}, {}, {}, {}", ljustify(format!( - "cvt_float{}_to_sint{}_seq", + "cvt_float{}_to_sint{}{}_seq", src_size.to_bits(), - dst_size.to_bits() + dst_size.to_bits(), + if *is_saturating { "_sat" } else { "" }, )), src, dst, @@ -1284,7 +1285,7 @@ impl PrettyPrint for Inst { dst_size, tmp_gpr, tmp_xmm, - .. + is_saturating, } => { let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs); @@ -1293,9 +1294,10 @@ impl PrettyPrint for Inst { format!( "{} {}, {}, {}, {}", ljustify(format!( - "cvt_float{}_to_uint{}_seq", + "cvt_float{}_to_uint{}{}_seq", src_size.to_bits(), - dst_size.to_bits() + dst_size.to_bits(), + if *is_saturating { "_sat" } else { "" }, )), src, dst, diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 09d7c80336eb..2b5a98ed9247 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -200,3 +200,274 @@ block0(v0: i32x4): ; popq %rbp ; ret +function %f13(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f14(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f15(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f16(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f17(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f18(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f19(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f20(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_uint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_uint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f21(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f22(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f23(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint32_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f24(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_sint.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint64_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f25(f32) -> i32 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f26(f32) -> i64 { +block0(v0: f32): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float32_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f27(f64) -> i32 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i32 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint32_sat_seq %xmm0, %eax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f28(f64) -> i64 { +block0(v0: f64): + v1 = fcvt_to_sint_sat.i64 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvt_float64_to_sint64_sat_seq %xmm0, %rax, %r10, %xmm6 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f29(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_uint_sat.i32x4 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; pxor %xmm5, %xmm5, %xmm5 +; maxps %xmm0, %xmm5, %xmm0 +; pcmpeqd %xmm5, %xmm5, %xmm5 +; psrld %xmm5, $1, %xmm5 +; cvtdq2ps %xmm5, %xmm5 +; movdqa %xmm0, %xmm11 +; cvttps2dq %xmm0, %xmm0 +; subps %xmm11, %xmm5, %xmm11 +; cmpps $2, %xmm5, %xmm11, %xmm5 +; cvttps2dq %xmm11, %xmm11 +; pxor %xmm11, %xmm5, %xmm11 +; pxor %xmm5, %xmm5, %xmm5 +; pmaxsd %xmm11, %xmm5, %xmm11 +; paddd %xmm0, %xmm11, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f30(f32x4) -> i32x4 { +block0(v0: f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movdqa %xmm0, %xmm5 +; cmpps $0, %xmm5, %xmm5, %xmm5 +; andps %xmm0, %xmm5, %xmm0 +; pxor %xmm5, %xmm0, %xmm5 +; cvttps2dq %xmm0, %xmm0 +; pand %xmm5, %xmm0, %xmm5 +; psrad %xmm5, $31, %xmm5 +; pxor %xmm0, %xmm5, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret From 45a718394a02f1f35b2a6d03b660f7d7af062291 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 11 Aug 2022 14:22:39 -0700 Subject: [PATCH 2/6] Lower fcvt_to_{u,s}int{,_sat} --- cranelift/codegen/src/isa/x64/inst.isle | 28 ++++++++++++ cranelift/codegen/src/isa/x64/inst/mod.rs | 52 ----------------------- cranelift/codegen/src/isa/x64/lower.isle | 14 ++++++ cranelift/codegen/src/isa/x64/lower.rs | 40 +++-------------- 4 files changed, 47 insertions(+), 87 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index c2708175649b..f64a29d02c63 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -3058,6 +3058,34 @@ (_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2)))) dst)) +(decl cvt_float_to_uint_seq (Type Value bool) Gpr) +(rule (cvt_float_to_uint_seq out_ty src @ (value_type src_ty) is_saturating) + (let ((out_size OperandSize (raw_operand_size_of_type out_ty)) + (src_size OperandSize (raw_operand_size_of_type src_ty)) + + (tmp WritableXmm (temp_writable_xmm)) + (_ Unit (emit (gen_move src_ty tmp src))) + + (dst WritableGpr (temp_writable_gpr)) + (tmp_xmm WritableXmm (temp_writable_xmm)) + (tmp_gpr WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm)))) + dst)) + +(decl cvt_float_to_sint_seq (Type Value bool) Gpr) +(rule (cvt_float_to_sint_seq out_ty src @ (value_type src_ty) is_saturating) + (let ((out_size OperandSize (raw_operand_size_of_type out_ty)) + (src_size OperandSize (raw_operand_size_of_type src_ty)) + + (tmp WritableXmm (temp_writable_xmm)) + (_ Unit (emit (gen_move src_ty tmp src))) + + (dst WritableGpr (temp_writable_gpr)) + (tmp_xmm WritableXmm (temp_writable_xmm)) + (tmp_gpr WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm)))) + dst)) + (decl fcvt_uint_mask_const () VCodeConstant) (extern constructor fcvt_uint_mask_const fcvt_uint_mask_const) diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 73aa8165203b..354be76df2c7 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -408,58 +408,6 @@ impl Inst { Inst::XmmCmpRmR { op, src, dst } } - pub(crate) fn cvt_float_to_sint_seq( - src_size: OperandSize, - dst_size: OperandSize, - is_saturating: bool, - src: Writable, - dst: Writable, - tmp_gpr: Writable, - tmp_xmm: Writable, - ) -> Inst { - debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(src.to_reg().class() == RegClass::Float); - debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float); - debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int); - debug_assert!(dst.to_reg().class() == RegClass::Int); - Inst::CvtFloatToSintSeq { - src_size, - dst_size, - is_saturating, - src: WritableXmm::from_writable_reg(src).unwrap(), - dst: WritableGpr::from_writable_reg(dst).unwrap(), - tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(), - tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(), - } - } - - pub(crate) fn cvt_float_to_uint_seq( - src_size: OperandSize, - dst_size: OperandSize, - is_saturating: bool, - src: Writable, - dst: Writable, - tmp_gpr: Writable, - tmp_xmm: Writable, - ) -> Inst { - debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64])); - debug_assert!(src.to_reg().class() == RegClass::Float); - debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float); - debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int); - debug_assert!(dst.to_reg().class() == RegClass::Int); - Inst::CvtFloatToUintSeq { - src_size, - dst_size, - is_saturating, - src: WritableXmm::from_writable_reg(src).unwrap(), - dst: WritableGpr::from_writable_reg(dst).unwrap(), - tmp_gpr: WritableGpr::from_writable_reg(tmp_gpr).unwrap(), - tmp_xmm: WritableXmm::from_writable_reg(tmp_xmm).unwrap(), - } - } - #[allow(dead_code)] pub(crate) fn xmm_min_max_seq( size: OperandSize, diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 8f840a086fdb..f2c46126745b 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3062,3 +3062,17 @@ ;; add together the two converted values (x64_addps a_hi a_lo))) + +;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_uint_seq out_ty val $false)) + +(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_uint_seq out_ty val $true)) + +(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_sint_seq out_ty val $false)) + +(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _))))) + (cvt_float_to_sint_seq out_ty val $true)) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 4da2fe1d1d60..c3a0ed84ec57 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -557,49 +557,19 @@ fn lower_insn_to_regs( | Opcode::SelectifSpectreGuard | Opcode::FcvtFromSint | Opcode::FcvtLowFromSint - | Opcode::FcvtFromUint => { + | Opcode::FcvtFromUint + | Opcode::FcvtToUint + | Opcode::FcvtToSint => { implemented_in_isle(ctx); } - Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { + Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { let src = put_input_in_reg(ctx, inputs[0]); let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let input_ty = ctx.input_ty(insn, 0); if !input_ty.is_vector() { - let src_size = if input_ty == types::F32 { - OperandSize::Size32 - } else { - assert_eq!(input_ty, types::F64); - OperandSize::Size64 - }; - - let output_ty = ty.unwrap(); - let dst_size = if output_ty == types::I32 { - OperandSize::Size32 - } else { - assert_eq!(output_ty, types::I64); - OperandSize::Size64 - }; - - let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; - let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; - - let src_copy = ctx.alloc_tmp(input_ty).only_reg().unwrap(); - ctx.emit(Inst::gen_move(src_copy, src, input_ty)); - - let tmp_xmm = ctx.alloc_tmp(input_ty).only_reg().unwrap(); - let tmp_gpr = ctx.alloc_tmp(output_ty).only_reg().unwrap(); - - if to_signed { - ctx.emit(Inst::cvt_float_to_sint_seq( - src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, - )); - } else { - ctx.emit(Inst::cvt_float_to_uint_seq( - src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, - )); - } + implemented_in_isle(ctx); } else { if op == Opcode::FcvtToSintSat { // Sets destination to zero if float is NaN From 8cbe0c33c19eda26ffe7e98a71910ec4e2a8858d Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 11 Aug 2022 17:39:51 -0700 Subject: [PATCH 3/6] Lower fcvt_to_sint_sat for f32x4 --- cranelift/codegen/src/isa/x64/inst.isle | 4 ++ cranelift/codegen/src/isa/x64/lower.isle | 26 +++++++++ cranelift/codegen/src/isa/x64/lower.rs | 56 +------------------ .../filetests/filetests/isa/x64/fcvt.clif | 12 ++-- 4 files changed, 38 insertions(+), 60 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index f64a29d02c63..8043a1059bd9 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -3047,6 +3047,10 @@ (_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size)))) dst)) +(decl x64_cvttps2dq (Type XmmMem) Xmm) +(rule (x64_cvttps2dq ty x) + (xmm_unary_rm_r (SseOpcode.Cvttps2dq) x)) + (decl cvt_u64_to_float_seq (Type Gpr) Xmm) (rule (cvt_u64_to_float_seq ty src) (let ((size OperandSize (raw_operand_size_of_type ty)) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index f2c46126745b..fd90bf929883 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3076,3 +3076,29 @@ (rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _))))) (cvt_float_to_sint_seq out_ty val $true)) + +;; The x64 backend currently only supports these two type combinations. +(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4)))) + (let (;; Sets tmp to zero if float is NaN + (tmp Xmm (x64_cmpps val val (FcmpImm.Equal))) + (dst Xmm (x64_andps val tmp)) + + ;; Sets top bit of tmp if float is positive + ;; Setting up to set top bit on negative float values + (tmp Xmm (x64_pxor tmp dst)) + + ;; Convert the packed float to packed doubleword. + (dst Xmm (x64_cvttps2dq $F32X4 dst)) + + ;; TODO: regalloc2 introduces a useless move here, is that + ;; acceptable? + + ;; Set top bit only if < 0 + (tmp Xmm (x64_pand dst tmp)) + (tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31)))) + + ;; On overflow 0x80000000 is returned to a lane. + ;; Below sets positive overflow lanes to 0x7FFFFFFF + ;; Keeps negative overflow lanes as is. + (x64_pxor tmp dst))) + diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index c3a0ed84ec57..408aca664be5 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -572,61 +572,7 @@ fn lower_insn_to_regs( implemented_in_isle(ctx); } else { if op == Opcode::FcvtToSintSat { - // Sets destination to zero if float is NaN - assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Movapd, - RegMem::reg(src), - tmp, - )); - ctx.emit(Inst::gen_move(dst, src, input_ty)); - let cond = FcmpImm::from(FloatCC::Equal); - ctx.emit(Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::reg(tmp.to_reg()), - tmp, - cond.encode(), - OperandSize::Size32, - )); - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Andps, - RegMem::reg(tmp.to_reg()), - dst, - )); - - // Sets top bit of tmp if float is positive - // Setting up to set top bit on negative float values - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pxor, - RegMem::reg(dst.to_reg()), - tmp, - )); - - // Convert the packed float to packed doubleword. - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::reg(dst.to_reg()), - dst, - )); - - // Set top bit only if < 0 - // Saturate lane with sign (top) bit. - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pand, - RegMem::reg(dst.to_reg()), - tmp, - )); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrad, RegMemImm::imm(31), tmp)); - - // On overflow 0x80000000 is returned to a lane. - // Below sets positive overflow lanes to 0x7FFFFFFF - // Keeps negative overflow lanes as is. - ctx.emit(Inst::xmm_rm_r( - SseOpcode::Pxor, - RegMem::reg(tmp.to_reg()), - dst, - )); + implemented_in_isle(ctx); } else if op == Opcode::FcvtToUintSat { // The algorithm for converting floats to unsigned ints is a little tricky. The // complication arises because we are converting from a signed 64-bit int with a positive diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index 2b5a98ed9247..c999a1f3180c 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -461,13 +461,15 @@ block0(v0: f32x4): ; movq %rsp, %rbp ; block0: ; movdqa %xmm0, %xmm5 -; cmpps $0, %xmm5, %xmm5, %xmm5 +; cmpps $0, %xmm5, %xmm0, %xmm5 ; andps %xmm0, %xmm5, %xmm0 ; pxor %xmm5, %xmm0, %xmm5 -; cvttps2dq %xmm0, %xmm0 -; pand %xmm5, %xmm0, %xmm5 -; psrad %xmm5, $31, %xmm5 -; pxor %xmm0, %xmm5, %xmm0 +; cvttps2dq %xmm0, %xmm9 +; movdqa %xmm9, %xmm0 +; pand %xmm0, %xmm5, %xmm0 +; psrad %xmm0, $31, %xmm0 +; pxor %xmm0, %xmm9, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret + From 65e5e94488433913c2d08a65776820e3648167e9 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Thu, 11 Aug 2022 22:39:46 -0700 Subject: [PATCH 4/6] Lower `fcvt_to_uint_sat` in ISLE --- cranelift/codegen/src/isa/x64/lower.isle | 90 +++++++++++ cranelift/codegen/src/isa/x64/lower.rs | 140 +----------------- .../filetests/filetests/isa/x64/fcvt.clif | 25 ++-- 3 files changed, 105 insertions(+), 150 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index fd90bf929883..cebfd382a4ab 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3102,3 +3102,93 @@ ;; Keeps negative overflow lanes as is. (x64_pxor tmp dst))) +;; The algorithm for converting floats to unsigned ints is a little tricky. The +;; complication arises because we are converting from a signed 64-bit int with a positive +;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended +;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX +;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our +;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but +;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than +;; MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes +;; precisely INT_MAX values we can correctly account for and convert every value in this range +;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction +;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). +;; After the conversion we add INT_MAX+1 back to this converted value, noting again that +;; values we are trying to account for were already set to INT_MAX+1 during the original conversion. +;; We simply have to create a mask and make sure we are adding together only the lanes that need +;; to be accounted for. Digesting it all the steps then are: +;; +;; Step 1 - Account for NaN and negative floats by setting these src values to zero. +;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for +;; reasons described above. +;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX +;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those +;; values that were originally in the range (0..INT_MAX). This will come in handy during +;; step 7 when we zero negative lanes. +;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than +;; UINT_MAX that are now less than INT_MAX thanks to the subtraction. +;; Step 6 - Convert the second set of values (tmp1) +;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been +;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF +;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000 +;; Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less +;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and +;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally +;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). +;; +;; +;; The table below illustrates the result after each step where it matters for the converted set. +;; Note the original value range (original src set) is the final dst in Step 8: +;; +;; Original src set: +;; | Original Value Range | Step 1 | Step 3 | Step 8 | +;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | +;; +;; Copied src set (tmp1): +;; | Step 2 | Step 4 | +;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | +;; +;; | Step 6 | Step 7 | +;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | +(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4)))) + (let (;; Converting to unsigned int so if float src is negative or NaN + ;; will first set to zero. + (tmp2 Xmm (x64_pxor val val)) ;; make a zero + (dst Xmm (x64_maxps val tmp2)) + + ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks + ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because + ;; single precision IEEE-754 floats can only accurately represent contingous + ;; integers up to 2^23 and outside of this range it rounds to the closest + ;; integer that it can represent. In the case of INT_MAX, this value gets + ;; represented as 0x4f000000 which is the integer value (INT_MAX+1). + (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) + (tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1))) + (tmp2 Xmm (x64_cvtdq2ps tmp2)) + + ;; Make a copy of these lanes and then do the first conversion. + ;; Overflow lanes greater than the maximum allowed signed value will + ;; set to 0x80000000. Negative and NaN lanes will be 0x0 + (tmp1 Xmm dst) + (dst Xmm (x64_cvttps2dq $F32X4 dst)) + + ;; Set lanes to src - max_signed_int + (tmp1 Xmm (x64_subps tmp1 tmp2)) + + ;; Create mask for all positive lanes to saturate (i.e. greater than + ;; or equal to the maxmimum allowable unsigned int). + (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) + + ;; Convert those set of lanes that have the max_signed_int factored out. + (tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1)) + + ;; Prepare converted lanes by zeroing negative lanes and prepping lanes + ;; that have positive overflow (based on the mask) by setting these lanes + ;; to 0x7FFFFFFF + (tmp1 Xmm (x64_pxor tmp1 tmp2)) + (tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero + (tmp1 Xmm (x64_pmaxsd tmp1 tmp2))) + + ;; Add this second set of converted lanes to the original to properly handle + ;; values greater than max signed int. + (x64_paddd tmp1 dst))) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 408aca664be5..3e4decfda905 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -559,146 +559,12 @@ fn lower_insn_to_regs( | Opcode::FcvtLowFromSint | Opcode::FcvtFromUint | Opcode::FcvtToUint - | Opcode::FcvtToSint => { + | Opcode::FcvtToSint + | Opcode::FcvtToUintSat + | Opcode::FcvtToSintSat => { implemented_in_isle(ctx); } - Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { - let src = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - let input_ty = ctx.input_ty(insn, 0); - if !input_ty.is_vector() { - implemented_in_isle(ctx); - } else { - if op == Opcode::FcvtToSintSat { - implemented_in_isle(ctx); - } else if op == Opcode::FcvtToUintSat { - // The algorithm for converting floats to unsigned ints is a little tricky. The - // complication arises because we are converting from a signed 64-bit int with a positive - // integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended - // range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX - // (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our - // conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but - // which conveniently setting underflows and overflows (smaller than MIN_INT or larger than - // MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes - // precisely INT_MAX values we can correctly account for and convert every value in this range - // if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction - // every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). - // After the conversion we add INT_MAX+1 back to this converted value, noting again that - // values we are trying to account for were already set to INT_MAX+1 during the original conversion. - // We simply have to create a mask and make sure we are adding together only the lanes that need - // to be accounted for. Digesting it all the steps then are: - // - // Step 1 - Account for NaN and negative floats by setting these src values to zero. - // Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for - // reasons described above. - // Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX - // Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those - // values that were originally in the range (0..INT_MAX). This will come in handy during - // step 7 when we zero negative lanes. - // Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than - // UINT_MAX that are now less than INT_MAX thanks to the subtraction. - // Step 6 - Convert the second set of values (tmp1) - // Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been - // converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF - // as this will allow us to properly saturate overflow lanes when adding to 0x80000000 - // Step 8 - Add the orginal converted src and the converted tmp1 where float values originally less - // than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and - // UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally - // greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). - // - // - // The table below illustrates the result after each step where it matters for the converted set. - // Note the original value range (original src set) is the final dst in Step 8: - // - // Original src set: - // | Original Value Range | Step 1 | Step 3 | Step 8 | - // | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | - // - // Copied src set (tmp1): - // | Step 2 | Step 4 | - // | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | - // - // | Step 6 | Step 7 | - // | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | - - // Create temporaries - assert_eq!(types::F32X4, ctx.input_ty(insn, 0)); - let tmp1 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(types::I32X4).only_reg().unwrap(); - - // Converting to unsigned int so if float src is negative or NaN - // will first set to zero. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::gen_move(dst, src, input_ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Maxps, RegMem::from(tmp2), dst)); - - // Set tmp2 to INT_MAX+1. It is important to note here that after it looks - // like we are only converting INT_MAX (0x7FFFFFFF) but in fact because - // single precision IEEE-754 floats can only accurately represent contingous - // integers up to 2^23 and outside of this range it rounds to the closest - // integer that it can represent. In the case of INT_MAX, this value gets - // represented as 0x4f000000 which is the integer value (INT_MAX+1). - - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pcmpeqd, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rmi_reg(SseOpcode::Psrld, RegMemImm::imm(1), tmp2)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvtdq2ps, - RegMem::from(tmp2), - tmp2, - )); - - // Make a copy of these lanes and then do the first conversion. - // Overflow lanes greater than the maximum allowed signed value will - // set to 0x80000000. Negative and NaN lanes will be 0x0 - ctx.emit(Inst::xmm_mov(SseOpcode::Movaps, RegMem::from(dst), tmp1)); - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(dst), - dst, - )); - - // Set lanes to src - max_signed_int - ctx.emit(Inst::xmm_rm_r(SseOpcode::Subps, RegMem::from(tmp2), tmp1)); - - // Create mask for all positive lanes to saturate (i.e. greater than - // or equal to the maxmimum allowable unsigned int). - let cond = FcmpImm::from(FloatCC::LessThanOrEqual); - ctx.emit(Inst::xmm_rm_r_imm( - SseOpcode::Cmpps, - RegMem::from(tmp1), - tmp2, - cond.encode(), - OperandSize::Size32, - )); - - // Convert those set of lanes that have the max_signed_int factored out. - ctx.emit(Inst::xmm_unary_rm_r( - SseOpcode::Cvttps2dq, - RegMem::from(tmp1), - tmp1, - )); - - // Prepare converted lanes by zeroing negative lanes and prepping lanes - // that have positive overflow (based on the mask) by setting these lanes - // to 0x7FFFFFFF - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp1)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp2), tmp2)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::from(tmp2), tmp1)); - - // Add this second set of converted lanes to the original to properly handle - // values greater than max signed int. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::from(tmp1), dst)); - } else { - // Since this branch is also guarded by a check for vector types - // neither Opcode::FcvtToUint nor Opcode::FcvtToSint can reach here - // due to vector varients not existing. The first two branches will - // cover all reachable cases. - unreachable!(); - } - } - } Opcode::IaddPairwise => { if let (Some(swiden_low), Some(swiden_high)) = ( matches_input(ctx, inputs[0], Opcode::SwidenLow), diff --git a/cranelift/filetests/filetests/isa/x64/fcvt.clif b/cranelift/filetests/filetests/isa/x64/fcvt.clif index c999a1f3180c..09c6093c543e 100644 --- a/cranelift/filetests/filetests/isa/x64/fcvt.clif +++ b/cranelift/filetests/filetests/isa/x64/fcvt.clif @@ -433,20 +433,19 @@ block0(v0: f32x4): ; pushq %rbp ; movq %rsp, %rbp ; block0: -; pxor %xmm5, %xmm5, %xmm5 -; maxps %xmm0, %xmm5, %xmm0 -; pcmpeqd %xmm5, %xmm5, %xmm5 -; psrld %xmm5, $1, %xmm5 -; cvtdq2ps %xmm5, %xmm5 -; movdqa %xmm0, %xmm11 +; pxor %xmm3, %xmm3, %xmm3 +; maxps %xmm0, %xmm3, %xmm0 +; pcmpeqd %xmm8, %xmm8, %xmm8 +; psrld %xmm8, $1, %xmm8 +; cvtdq2ps %xmm8, %xmm14 +; cvttps2dq %xmm0, %xmm13 +; subps %xmm0, %xmm14, %xmm0 +; cmpps $2, %xmm14, %xmm0, %xmm14 ; cvttps2dq %xmm0, %xmm0 -; subps %xmm11, %xmm5, %xmm11 -; cmpps $2, %xmm5, %xmm11, %xmm5 -; cvttps2dq %xmm11, %xmm11 -; pxor %xmm11, %xmm5, %xmm11 -; pxor %xmm5, %xmm5, %xmm5 -; pmaxsd %xmm11, %xmm5, %xmm11 -; paddd %xmm0, %xmm11, %xmm0 +; pxor %xmm0, %xmm14, %xmm0 +; pxor %xmm7, %xmm7, %xmm7 +; pmaxsd %xmm0, %xmm7, %xmm0 +; paddd %xmm0, %xmm13, %xmm0 ; movq %rbp, %rsp ; popq %rbp ; ret From adb44ba948dbeb6e79e02c13acbd00095b149d76 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Mon, 15 Aug 2022 14:25:42 -0700 Subject: [PATCH 5/6] Add some NaN run tests for fcvt_to_{s,u}int_sat --- cranelift/filetests/filetests/runtests/simd-conversion.clif | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cranelift/filetests/filetests/runtests/simd-conversion.clif b/cranelift/filetests/filetests/runtests/simd-conversion.clif index da8bced238af..9648acda8c09 100644 --- a/cranelift/filetests/filetests/runtests/simd-conversion.clif +++ b/cranelift/filetests/filetests/runtests/simd-conversion.clif @@ -28,6 +28,7 @@ block0(v0:f32x4): } ; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF] ; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000] +; run: %fcvt_to_sint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0] function %fcvt_to_uint_sat(f32x4) -> i32x4 { block0(v0:f32x4): @@ -37,3 +38,4 @@ block0(v0:f32x4): ; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF] ; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0] ; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0] +; run: %fcvt_to_uint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0] From f4512602a11acdc7add293241485b58ff6c042fb Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Mon, 15 Aug 2022 14:58:56 -0700 Subject: [PATCH 6/6] Remove TODO about extra move --- cranelift/codegen/src/isa/x64/lower.isle | 3 --- 1 file changed, 3 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index cebfd382a4ab..759a956ec6f9 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3090,9 +3090,6 @@ ;; Convert the packed float to packed doubleword. (dst Xmm (x64_cvttps2dq $F32X4 dst)) - ;; TODO: regalloc2 introduces a useless move here, is that - ;; acceptable? - ;; Set top bit only if < 0 (tmp Xmm (x64_pand dst tmp)) (tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))