Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3047,6 +3047,10 @@
(_ Unit (emit (MInst.GprToXmm (SseOpcode.Cvtsi2sd) x dst size))))
dst))

;; Helper for creating `cvttps2dq` instructions via `xmm_unary_rm_r`.
;; The `Type` argument is accepted but is not used by the rule body.
(decl x64_cvttps2dq (Type XmmMem) Xmm)
(rule (x64_cvttps2dq ty x)
(xmm_unary_rm_r (SseOpcode.Cvttps2dq) x))

(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
Expand All @@ -3058,6 +3062,34 @@
(_ Unit (emit (MInst.CvtUint64ToFloatSeq size src_copy dst tmp_gpr1 tmp_gpr2))))
dst))

;; Emits an `MInst.CvtFloatToUintSeq` pseudo-instruction that converts the float
;; value `src` to an unsigned integer of type `out_ty`, and returns the
;; destination GPR. `is_saturating` is forwarded unchanged to the instruction.
(decl cvt_float_to_uint_seq (Type Value bool) Gpr)
(rule (cvt_float_to_uint_seq out_ty src @ (value_type src_ty) is_saturating)
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
(src_size OperandSize (raw_operand_size_of_type src_ty))

;; Copy the source into a fresh writable XMM temporary; the copy, not the
;; original value, is what gets passed to the sequence.
(tmp WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move src_ty tmp src)))

;; Fresh destination plus the XMM/GPR scratch registers handed to the sequence.
(dst WritableGpr (temp_writable_gpr))
(tmp_xmm WritableXmm (temp_writable_xmm))
(tmp_gpr WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.CvtFloatToUintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
dst))

;; Emits an `MInst.CvtFloatToSintSeq` pseudo-instruction that converts the float
;; value `src` to a signed integer of type `out_ty`, and returns the
;; destination GPR. `is_saturating` is forwarded unchanged to the instruction.
(decl cvt_float_to_sint_seq (Type Value bool) Gpr)
(rule (cvt_float_to_sint_seq out_ty src @ (value_type src_ty) is_saturating)
(let ((out_size OperandSize (raw_operand_size_of_type out_ty))
(src_size OperandSize (raw_operand_size_of_type src_ty))

;; Copy the source into a fresh writable XMM temporary; the copy, not the
;; original value, is what gets passed to the sequence.
(tmp WritableXmm (temp_writable_xmm))
(_ Unit (emit (gen_move src_ty tmp src)))

;; Fresh destination plus the XMM/GPR scratch registers handed to the sequence.
(dst WritableGpr (temp_writable_gpr))
(tmp_xmm WritableXmm (temp_writable_xmm))
(tmp_gpr WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.CvtFloatToSintSeq out_size src_size is_saturating tmp dst tmp_gpr tmp_xmm))))
dst))

;; Zero-argument constructor returning a `VCodeConstant`. It has no ISLE rules;
;; it is bound to an external constructor of the same name.
(decl fcvt_uint_mask_const () VCodeConstant)
(extern constructor fcvt_uint_mask_const fcvt_uint_mask_const)

Expand Down
66 changes: 8 additions & 58 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -408,58 +408,6 @@ impl Inst {
Inst::XmmCmpRmR { op, src, dst }
}

/// Builds an `Inst::CvtFloatToSintSeq` from untyped writable registers.
///
/// Debug builds assert that both operand sizes are `Size32` or `Size64`, that `src` and
/// `tmp_xmm` belong to the float register class, and that `tmp_gpr` and `dst` belong to the
/// integer register class. Each register is then converted to its typed wrapper and the
/// conversion result is unwrapped.
pub(crate) fn cvt_float_to_sint_seq(
    src_size: OperandSize,
    dst_size: OperandSize,
    is_saturating: bool,
    src: Writable<Reg>,
    dst: Writable<Reg>,
    tmp_gpr: Writable<Reg>,
    tmp_xmm: Writable<Reg>,
) -> Inst {
    // Operand sizes: only 32- and 64-bit conversions are accepted.
    debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
    debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));

    // Register classes: float source and float scratch, integer scratch and destination.
    debug_assert!(src.to_reg().class() == RegClass::Float);
    debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
    debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
    debug_assert!(dst.to_reg().class() == RegClass::Int);

    // Wrap each register in its typed form, in the same order as the variant's fields.
    let src = WritableXmm::from_writable_reg(src).unwrap();
    let dst = WritableGpr::from_writable_reg(dst).unwrap();
    let tmp_gpr = WritableGpr::from_writable_reg(tmp_gpr).unwrap();
    let tmp_xmm = WritableXmm::from_writable_reg(tmp_xmm).unwrap();

    Inst::CvtFloatToSintSeq {
        src_size,
        dst_size,
        is_saturating,
        src,
        dst,
        tmp_gpr,
        tmp_xmm,
    }
}

/// Builds an `Inst::CvtFloatToUintSeq` from untyped writable registers.
///
/// Debug builds assert that both operand sizes are `Size32` or `Size64`, that `src` and
/// `tmp_xmm` belong to the float register class, and that `tmp_gpr` and `dst` belong to the
/// integer register class. Each register is then converted to its typed wrapper and the
/// conversion result is unwrapped.
pub(crate) fn cvt_float_to_uint_seq(
    src_size: OperandSize,
    dst_size: OperandSize,
    is_saturating: bool,
    src: Writable<Reg>,
    dst: Writable<Reg>,
    tmp_gpr: Writable<Reg>,
    tmp_xmm: Writable<Reg>,
) -> Inst {
    // Operand sizes: only 32- and 64-bit conversions are accepted.
    debug_assert!(src_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));
    debug_assert!(dst_size.is_one_of(&[OperandSize::Size32, OperandSize::Size64]));

    // Register classes: float source and float scratch, integer scratch and destination.
    debug_assert!(src.to_reg().class() == RegClass::Float);
    debug_assert!(tmp_xmm.to_reg().class() == RegClass::Float);
    debug_assert!(tmp_gpr.to_reg().class() == RegClass::Int);
    debug_assert!(dst.to_reg().class() == RegClass::Int);

    // Wrap each register in its typed form, in the same order as the variant's fields.
    let src = WritableXmm::from_writable_reg(src).unwrap();
    let dst = WritableGpr::from_writable_reg(dst).unwrap();
    let tmp_gpr = WritableGpr::from_writable_reg(tmp_gpr).unwrap();
    let tmp_xmm = WritableXmm::from_writable_reg(tmp_xmm).unwrap();

    Inst::CvtFloatToUintSeq {
        src_size,
        dst_size,
        is_saturating,
        src,
        dst,
        tmp_gpr,
        tmp_xmm,
    }
}

#[allow(dead_code)]
pub(crate) fn xmm_min_max_seq(
size: OperandSize,
Expand Down Expand Up @@ -1257,7 +1205,7 @@ impl PrettyPrint for Inst {
dst_size,
tmp_xmm,
tmp_gpr,
..
is_saturating,
} => {
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
Expand All @@ -1266,9 +1214,10 @@ impl PrettyPrint for Inst {
format!(
"{} {}, {}, {}, {}",
ljustify(format!(
"cvt_float{}_to_sint{}_seq",
"cvt_float{}_to_sint{}{}_seq",
src_size.to_bits(),
dst_size.to_bits()
dst_size.to_bits(),
if *is_saturating { "_sat" } else { "" },
)),
src,
dst,
Expand All @@ -1284,7 +1233,7 @@ impl PrettyPrint for Inst {
dst_size,
tmp_gpr,
tmp_xmm,
..
is_saturating,
} => {
let src = pretty_print_reg(src.to_reg().to_reg(), src_size.to_bytes(), allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), dst_size.to_bytes(), allocs);
Expand All @@ -1293,9 +1242,10 @@ impl PrettyPrint for Inst {
format!(
"{} {}, {}, {}, {}",
ljustify(format!(
"cvt_float{}_to_uint{}_seq",
"cvt_float{}_to_uint{}{}_seq",
src_size.to_bits(),
dst_size.to_bits()
dst_size.to_bits(),
if *is_saturating { "_sat" } else { "" },
)),
src,
dst,
Expand Down
127 changes: 127 additions & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -3062,3 +3062,130 @@

;; add together the two converted values
(x64_addps a_hi a_lo)))

;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar float sources: each opcode is lowered to the matching
;; `cvt_float_to_{uint,sint}_seq` helper. The trailing boolean is the helper's
;; `is_saturating` argument: `$false` for the plain opcodes, `$true` for `_sat`.

;; Unsigned, non-saturating.
(rule (lower (has_type out_ty (fcvt_to_uint val @ (value_type (ty_scalar_float _)))))
(cvt_float_to_uint_seq out_ty val $false))

;; Unsigned, saturating.
(rule (lower (has_type out_ty (fcvt_to_uint_sat val @ (value_type (ty_scalar_float _)))))
(cvt_float_to_uint_seq out_ty val $true))

;; Signed, non-saturating.
(rule (lower (has_type out_ty (fcvt_to_sint val @ (value_type (ty_scalar_float _)))))
(cvt_float_to_sint_seq out_ty val $false))

;; Signed, saturating.
(rule (lower (has_type out_ty (fcvt_to_sint_sat val @ (value_type (ty_scalar_float _)))))
(cvt_float_to_sint_seq out_ty val $true))

;; The x64 backend currently only supports these two type combinations.
;;
;; Saturating F32X4 -> I32X4 signed conversion. NaN lanes are zeroed before the
;; conversion, and lanes that overflowed in the positive direction are fixed up
;; to 0x7FFFFFFF afterwards.
(rule (lower (has_type $I32X4 (fcvt_to_sint_sat val @ (value_type $F32X4))))
(let (;; Sets tmp to zero if float is NaN
(tmp Xmm (x64_cmpps val val (FcmpImm.Equal)))
;; Zero out the NaN lanes of the input.
(dst Xmm (x64_andps val tmp))

;; Sets top bit of tmp if float is positive
;; (this prepares the mask used below to detect positive overflow).
(tmp Xmm (x64_pxor tmp dst))

;; Convert the packed float to packed doubleword.
(dst Xmm (x64_cvttps2dq $F32X4 dst))

;; Keep the top bit of tmp only in lanes where the input was positive but
;; the converted result has its top bit set, then broadcast that bit
;; across the whole lane with an arithmetic right shift.
(tmp Xmm (x64_pand dst tmp))
(tmp Xmm (x64_psrad tmp (RegMemImm.Imm 31))))

;; On overflow 0x80000000 is returned to a lane.
;; Below sets positive overflow lanes to 0x7FFFFFFF
;; Keeps negative overflow lanes as is.
(x64_pxor tmp dst)))

;; The algorithm for converting floats to unsigned ints is a little tricky. The
;; complication arises because we are converting from a signed 32-bit int with a positive
;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), while
;; conveniently setting underflows and overflows (smaller than MIN_INT or larger than
;; MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
;; precisely INT_MAX+1 values, we can correctly account for and convert every value in this range
;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
;; every value originally in (INT_MAX+1)..UINT_MAX is now in the range (0..INT_MAX).
;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
;; We simply have to create a mask and make sure we are adding together only the lanes that need
;; to be accounted for. Digesting it all, the steps then are:
;;
;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
;; reasons described above.
;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
;; values that were originally in the range (0..INT_MAX). This will come in handy during
;; step 7 when we zero negative lanes.
;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
;; UINT_MAX that are now less than INT_MAX thanks to the subtraction.
;; Step 6 - Convert the second set of values (tmp1)
;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000
;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less
;; than or equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
;;
;;
;; The table below illustrates the result after each step where it matters for the converted set.
;; Note the original value range (original src set) is the final dst in Step 8:
;;
;; Original src set:
;; | Original Value Range | Step 1 | Step 3 | Step 8 |
;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
;;
;; Copied src set (tmp1):
;; | Step 2 | Step 4 |
;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
;;
;; | Step 6 | Step 7 |
;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
;; Saturating F32X4 -> I32X4 unsigned conversion, built from two `cvttps2dq`
;; conversions (one of the clamped input, one of the input minus INT_MAX+1)
;; whose results are added together at the end.
(rule (lower (has_type $I32X4 (fcvt_to_uint_sat val @ (value_type $F32X4))))
(let (;; Converting to unsigned int so if float src is negative or NaN
;; will first set to zero.
(tmp2 Xmm (x64_pxor val val)) ;; make a zero
(dst Xmm (x64_maxps val tmp2))

;; Set tmp2 to INT_MAX+1. Note that although it looks like we are only
;; converting INT_MAX (0x7FFFFFFF), single precision IEEE-754 floats can
;; only accurately represent contiguous integers up to 2^24, and outside
;; of this range a value rounds to the closest integer that can be
;; represented. In the case of INT_MAX, this value gets represented as
;; 0x4f000000 which is the integer value (INT_MAX+1).
(tmp2 Xmm (x64_pcmpeqd tmp2 tmp2))
(tmp2 Xmm (x64_psrld tmp2 (RegMemImm.Imm 1)))
(tmp2 Xmm (x64_cvtdq2ps tmp2))

;; Make a copy of these lanes and then do the first conversion.
;; Overflow lanes greater than the maximum allowed signed value will
;; set to 0x80000000. Negative and NaN lanes will be 0x0
(tmp1 Xmm dst)
(dst Xmm (x64_cvttps2dq $F32X4 dst))

;; Set lanes to src - max_signed_int
(tmp1 Xmm (x64_subps tmp1 tmp2))

;; Create mask for all positive lanes to saturate (i.e. greater than
;; or equal to the maximum allowable unsigned int).
(tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))

;; Convert those set of lanes that have the max_signed_int factored out.
(tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1))

;; Prepare converted lanes by zeroing negative lanes and prepping lanes
;; that have positive overflow (based on the mask) by setting these lanes
;; to 0x7FFFFFFF
(tmp1 Xmm (x64_pxor tmp1 tmp2))
(tmp2 Xmm (x64_pxor tmp2 tmp2)) ;; make another zero
(tmp1 Xmm (x64_pmaxsd tmp1 tmp2)))

;; Add this second set of converted lanes to the original to properly handle
;; values greater than max signed int.
(x64_paddd tmp1 dst)))
Loading