Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use rustc_codegen_ssa::common::TypeKind;
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
use rustc_middle::bug;
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};
use rustc_middle::ty::offload_meta::{DynamicSize, MappingFlags, OffloadMetadata, OffloadSize};

use crate::builder::Builder;
use crate::common::CodegenCx;
Expand Down Expand Up @@ -448,14 +448,18 @@ pub(crate) fn gen_define_handling<'ll>(
transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect();
let transfer_from: Vec<u64> =
transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect();
let valid_kernel_mappings = MappingFlags::LITERAL | MappingFlags::IMPLICIT;
// FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
let transfer_kernel: Vec<u64> = transfer
.iter()
.map(|m| (m.intersection(valid_kernel_mappings) | MappingFlags::TARGET_PARAM).bits())
.collect();

let actual_sizes = sizes
.iter()
.map(|s| match s {
OffloadSize::Static(sz) => *sz,
OffloadSize::Dynamic => 0,
OffloadSize::Dynamic(_) => 0,
})
.collect::<Vec<_>>();
let offload_sizes =
Expand Down Expand Up @@ -542,12 +546,20 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
}

/// Computes the runtime byte size of a dynamically-sized offload argument,
/// emitting the IR that performs the computation.
///
/// For a slice, the length is passed in the argument slot immediately after
/// the data pointer (`args[index + 1]`); the size is
/// `length * element_size`, built as a `mul` of the widened length and the
/// constant element size.
///
/// # Panics
/// Calls `bug!` if `meta` describes a statically-sized payload — callers
/// must only invoke this for `OffloadSize::Dynamic(_)`.
fn get_runtime_size<'ll, 'tcx>(
    builder: &mut Builder<'_, 'll, 'tcx>,
    args: &[&'ll Value],
    index: usize,
    meta: &OffloadMetadata,
) -> &'ll Value {
    match meta.payload_size {
        OffloadSize::Dynamic(DynamicSize::Slice { element_size }) => {
            // A slice is decomposed into (ptr, len); the length lives in the
            // slot right after the pointer argument.
            let length_idx = index + 1;
            let length = args[length_idx];
            // Widen the length to i64 (unsigned) so the multiply matches the
            // i64 size slots the offload runtime expects.
            let length_i64 = builder.intcast(length, builder.cx.type_i64(), false);
            builder.mul(length_i64, builder.cx.get_const_i64(element_size))
        }
        OffloadSize::Static(_) => bug!("expected dynamic size"),
    }
}

// For each kernel *call*, we now use some of our previous declared globals to move data to and from
Expand Down Expand Up @@ -588,7 +600,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
offload_dims;

let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic(_)));

let tgt_decl = offload_globals.launcher_fn;
let tgt_target_kernel_ty = offload_globals.launcher_ty;
Expand Down Expand Up @@ -683,9 +695,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
builder.store(geps[i as usize], gep2, Align::EIGHT);

if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic(_)) {
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]);
builder.store(size_val, gep3, Align::EIGHT);
}
}
Expand Down
15 changes: 13 additions & 2 deletions compiler/rustc_codegen_llvm/src/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1437,9 +1437,20 @@ fn codegen_offload<'ll, 'tcx>(
let sig = tcx.instantiate_bound_regions_with_erased(sig);
let inputs = sig.inputs();

let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
let fn_abi = cx.fn_abi_of_instance(fn_target, ty::List::empty());

let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
let mut metadata = Vec::new();
let mut types = Vec::new();

for (i, arg_abi) in fn_abi.args.iter().enumerate() {
let ty = inputs[i];
let decomposed = OffloadMetadata::handle_abi(cx, tcx, ty, arg_abi);

for (meta, entry_ty) in decomposed {
metadata.push(meta);
types.push(bx.cx.layout_of(entry_ty).llvm_type(bx.cx));
}
}

let offload_globals_ref = cx.offload_globals.borrow();
let offload_globals = match offload_globals_ref.as_ref() {
Expand Down
39 changes: 37 additions & 2 deletions compiler/rustc_middle/src/ty/offload_meta.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
use bitflags::bitflags;
use rustc_abi::{BackendRepr, TyAbiInterface};
use rustc_target::callconv::ArgAbi;

use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};

/// Per-argument metadata describing how a kernel argument is transferred
/// to and from the offload device.
#[derive(Debug, Copy, Clone)]
pub struct OffloadMetadata {
/// Size of the data behind this argument — known statically, or computed
/// at runtime (e.g. for slices).
pub payload_size: OffloadSize,
/// OpenMP-style mapping flags controlling the transfer (to/from/literal…).
pub mode: MappingFlags,
}

#[derive(Debug, Copy, Clone)]
pub enum OffloadSize {
Dynamic,
Static(u64),
Dynamic(DynamicSize),
}

/// Describes how a runtime-computed payload size is derived.
#[derive(Debug, Copy, Clone)]
pub enum DynamicSize {
/// A slice payload: total size is the runtime length multiplied by
/// `element_size` bytes.
Slice { element_size: u64 },
}

bitflags! {
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(transparent)]
pub struct MappingFlags: u64 {
/// No flags.
Expand Down Expand Up @@ -62,11 +70,38 @@ impl OffloadMetadata {
mode: MappingFlags::from_ty(tcx, ty),
}
}

/// Decomposes an argument into one or more offload entries based on its
/// ABI representation.
///
/// A `ScalarPair` argument (e.g. a slice's pointer/length pair) is split
/// into its two component fields, each paired with its own metadata; any
/// other representation yields a single entry for the type itself.
///
/// Returns one `(metadata, type)` pair per entry, in field order.
pub fn handle_abi<'tcx, C>(
cx: &C,
tcx: TyCtxt<'tcx>,
ty: Ty<'tcx>,
arg_abi: &ArgAbi<'tcx, Ty<'tcx>>,
) -> Vec<(Self, Ty<'tcx>)>
where
Ty<'tcx>: TyAbiInterface<'tcx, C>,
{
match arg_abi.layout.backend_repr {
// Scalar pairs have exactly two fields (indices 0 and 1); emit an
// entry for each so the runtime maps both components.
BackendRepr::ScalarPair(_, _) => (0..2)
.map(|i| {
let ty = arg_abi.layout.field(cx, i).ty;
(OffloadMetadata::from_ty(tcx, ty), ty)
})
.collect(),
_ => vec![(OffloadMetadata::from_ty(tcx, ty), ty)],
}
}
}

// FIXME(Sa4dUs): implement a solid logic to determine the payload size
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
match ty.kind() {
ty::Slice(elem_ty) => {
let layout = tcx.layout_of(PseudoCanonicalInput {
typing_env: TypingEnv::fully_monomorphized(),
value: *elem_ty,
});
OffloadSize::Dynamic(DynamicSize::Slice { element_size: layout.unwrap().size.bytes() })
}
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
_ => OffloadSize::Static(
tcx.layout_of(PseudoCanonicalInput {
Expand Down
27 changes: 27 additions & 0 deletions tests/codegen-llvm/gpu_offload/slice_device.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//@ add-minicore
//@ revisions: amdgpu nvptx
//@[nvptx] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target nvptx64-nvidia-cuda --crate-type=rlib
//@[nvptx] needs-llvm-components: nvptx
//@[amdgpu] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 --crate-type=rlib
//@[amdgpu] needs-llvm-components: amdgpu
//@ no-prefer-dynamic
//@ needs-offload

#![feature(abi_gpu_kernel, rustc_attrs, no_core)]
#![no_core]

extern crate minicore;

// CHECK: ; Function Attrs
// nvptx-NEXT: define ptx_kernel void @foo
// amdgpu-NEXT: define amdgpu_kernel void @foo
// CHECK-SAME: ptr readnone captures(none) %dyn_ptr
// nvptx-SAME: [2 x i64] %0
// amdgpu-SAME: ptr noalias {{.*}} %0, i64 {{.*}} %1
// CHECK-NEXT: entry:
// CHECK-NEXT: ret void
// CHECK-NEXT: }

#[unsafe(no_mangle)]
#[rustc_offload_kernel]
pub unsafe extern "gpu-kernel" fn foo(x: &[f32]) {}
35 changes: 35 additions & 0 deletions tests/codegen-llvm/gpu_offload/slice_host.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//@ compile-flags: -Zoffload=Test -Zunstable-options -C opt-level=1 -Clto=fat
//@ no-prefer-dynamic
//@ needs-offload

// This test verifies that offload handles slices correctly, passing them to the device properly

#![feature(abi_gpu_kernel)]
#![feature(rustc_attrs)]
#![feature(core_intrinsics)]
#![no_main]

// CHECK: @anon.[[ID:.*]].0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1

// CHECK-DAG: @.offload_sizes.[[K:[^ ]*foo]] = private unnamed_addr constant [2 x i64] [i64 0, i64 8]
// CHECK-DAG: @.offload_maptypes.[[K]].begin = private unnamed_addr constant [2 x i64] [i64 1, i64 768]
// CHECK-DAG: @.offload_maptypes.[[K]].kernel = private unnamed_addr constant [2 x i64] [i64 32, i64 800]
// CHECK-DAG: @.offload_maptypes.[[K]].end = private unnamed_addr constant [2 x i64] [i64 2, i64 0]

// CHECK: define{{( dso_local)?}} void @main()
// CHECK: %.offload_sizes = alloca [2 x i64], align 8
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}} %.offload_sizes, ptr {{.*}} @.offload_sizes.foo, i64 16, i1 false)
// CHECK: store i64 16, ptr %.offload_sizes, align 8
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
// CHECK: %11 = call i32 @__tgt_target_kernel(ptr nonnull @anon.[[ID]].1, i64 -1, i32 1, i32 1, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)

#[unsafe(no_mangle)]
fn main() {
let mut x = [0.0, 0.0, 0.0, 0.0];
core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], ((&mut x) as &mut [f64],));
}

unsafe extern "C" {
pub fn foo(x: &mut [f32]);
}
Loading