3 changes: 3 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -508,6 +508,7 @@ impl VecAluOpRRImm5 {
VecAluOpRRImm5::VorVI => 0b001010,
VecAluOpRRImm5::VxorVI => 0b001011,
VecAluOpRRImm5::VslidedownVI => 0b001111,
VecAluOpRRImm5::VssrlVI => 0b101010,
VecAluOpRRImm5::VmergeVIM => 0b010111,
VecAluOpRRImm5::VsadduVI => 0b100000,
VecAluOpRRImm5::VsaddVI => 0b100001,
@@ -526,6 +527,7 @@ impl VecAluOpRRImm5 {
| VecAluOpRRImm5::VandVI
| VecAluOpRRImm5::VorVI
| VecAluOpRRImm5::VxorVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VmergeVIM
| VecAluOpRRImm5::VsadduVI
@@ -539,6 +541,7 @@ impl VecAluOpRRImm5 {
match self {
VecAluOpRRImm5::VsllVI
| VecAluOpRRImm5::VsrlVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VsraVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VrgatherVI
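As a sanity check on the new funct6 value, here is a minimal Rust sketch (helper names are ours, not Cranelift's) that hand-assembles the 32-bit OPIVI encoding and reproduces the `vssrl.vi v10,v8,1` byte sequence from the filetest below:

```rust
/// Hand-roll the RVV OP-V integer-immediate (OPIVI) encoding to check the
/// funct6 value added above. Field layout per the RVV 1.0 spec:
/// funct6[31:26] | vm[25] | vs2[24:20] | imm5[19:15] | funct3[14:12] | vd[11:7] | opcode[6:0]
fn encode_opivi(funct6: u32, vm: u32, vs2: u32, imm5: u32, vd: u32) -> u32 {
    const OPCODE_OP_V: u32 = 0x57; // major opcode shared by all vector instructions
    const FUNCT3_OPIVI: u32 = 0b011; // integer op with a 5-bit immediate operand
    (funct6 << 26)
        | (vm << 25)
        | (vs2 << 20)
        | (imm5 << 15)
        | (FUNCT3_OPIVI << 12)
        | (vd << 7)
        | OPCODE_OP_V
}

fn main() {
    // vssrl.vi v10, v8, 1 (unmasked, vm=1) with the new funct6 0b101010.
    let word = encode_opivi(0b101010, 1, 8, 1, 10);
    // The little-endian bytes match `.byte 0x57, 0xb5, 0x80, 0xaa` in the filetest.
    assert_eq!(word.to_le_bytes(), [0x57, 0xb5, 0x80, 0xaa]);
}
```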
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -188,6 +188,7 @@
(VandVI)
(VorVI)
(VxorVI)
(VssrlVI)
(VslidedownVI)
(VmergeVIM)
(VrgatherVI)
@@ -663,6 +664,15 @@
(rule (rv_vxor_vi vs2 imm mask vstate)
(vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate))

;; Helper for emitting the `vssrl.vi` instruction.
;;
;; vd[i] = (unsigned(vs2[i]) >> imm) + r
;;
;; `r` is the rounding increment (0 or 1) computed from the bits shifted
;; out, according to the fixed-point rounding mode currently selected in
;; `vxrm`.
(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg)
(rule (rv_vssrl_vi vs2 imm mask vstate)
(vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate))
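The rounding behaviour can be modelled in a few lines of Rust; this is a sketch of the spec's unsigned round-off under the default `rnu` mode only (the helper name is ours):

```rust
/// Model of `vssrl` under the default `rnu` (round-to-nearest-up) mode of
/// `vxrm`: shift right, then add back the most significant bit shifted out.
fn vssrl_rnu(v: u64, shamt: u32) -> u64 {
    if shamt == 0 {
        v // nothing is shifted out, so there is no rounding increment
    } else {
        (v >> shamt) + ((v >> (shamt - 1)) & 1)
    }
}

fn main() {
    assert_eq!(vssrl_rnu(5, 1), 3); // 5 / 2 = 2.5 rounds up to 3
    assert_eq!(vssrl_rnu(4, 1), 2); // exact result, no increment
}
```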

;; Helper for emitting the `vnot.v` instruction.
;; This is just a mnemonic for `vxor.vi vd, vs, -1`
(decl rv_vnot_v (VReg VecOpMasking VState) VReg)
Expand Down
25 changes: 25 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1817,3 +1817,28 @@
(rhs_hi VReg (rv_vcompress_vm y even_mask ty))
(rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2
;;
;; See Section 2–5, "Average of Two Integers", of Hacker's Delight.
;;
;; The floor average of two integers can be computed without overflow as:
;; t = (x & y) + ((x ^ y) >> 1)
;;
;; The right shift must be a logical shift, since the integers here are
;; unsigned.
;;
;; We, however, want the ceiling average, (x + y + 1) >> 1. We get it by
;; rounding the right shift up instead of truncating it.
;;
;; For the right shift we use `vssrl`, a Scaling Shift Right Logical
;; instruction that rounds according to the `vxrm` fixed-point rounding
;; mode. The default mode is `rnu` (round-to-nearest-up, i.e. add +0.5 LSB),
;; which is exactly the rounding we want for `avg_round`.
(rule (lower (has_type (ty_vec_fits_in_register ty) (avg_round x y)))
(if-let one (u64_to_uimm5 1))
(let ((lhs VReg (rv_vand_vv x y (unmasked) ty))
(xor VReg (rv_vxor_vv x y (unmasked) ty))
(rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))
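The identity behind this lowering is easy to verify exhaustively for the `u8` case. The following standalone sketch (ours, not part of the PR) stands in for `vssrl` with `rnu` using an explicit round-up shift and compares against a widened reference computation:

```rust
/// Check that (x & y) + roundup((x ^ y) >> 1) equals the rounded average
/// (x + y + 1) >> 1 for all u8 pairs, computing the reference in u16 so
/// it never overflows.
fn main() {
    for x in 0..=255u8 {
        for y in 0..=255u8 {
            let xor = x ^ y;
            // `vssrl` with `rnu` and shift 1: shift right and add back the
            // shifted-out bit, i.e. round half up.
            let half = (xor >> 1) + (xor & 1);
            let avg = (x & y) + half; // never exceeds 255, so no overflow
            let reference = ((x as u16 + y as u16 + 1) >> 1) as u8;
            assert_eq!(avg, reference);
        }
    }
    println!("avg_round identity holds for all u8 pairs");
}
```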
194 changes: 194 additions & 0 deletions cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif
@@ -0,0 +1,194 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v

function %avg_round_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=16, #vtype=(e8, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=8, #vtype=(e16, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=4, #vtype=(e32, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

@@ -1,6 +1,7 @@
; the interpreter does not currently support SIMD `avg_round`.
test run
target aarch64
target riscv64 has_v
; x86_64 and s390x do not currently support 64-bit vectors, or
; `avg_round` on `i64x2` values.
; x86_64 also does not currently support `avg_round.i32x4`.
@@ -4,6 +4,7 @@ target s390x
set enable_simd
target x86_64
target x86_64 skylake
target riscv64 has_v

function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):