Skip to content

Commit e8b7b6a

Browse files
committed
Add offload slice support
1 parent 1f77372 commit e8b7b6a

File tree

5 files changed

+135
-15
lines changed

5 files changed

+135
-15
lines changed

compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use rustc_codegen_ssa::common::TypeKind;
88
use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
99
use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
1010
use rustc_middle::bug;
11-
use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};
11+
use rustc_middle::ty::offload_meta::{DynamicSize, MappingFlags, OffloadMetadata, OffloadSize};
1212

1313
use crate::builder::Builder;
1414
use crate::common::CodegenCx;
@@ -448,14 +448,18 @@ pub(crate) fn gen_define_handling<'ll>(
448448
transfer.iter().map(|m| m.intersection(valid_begin_mappings).bits()).collect();
449449
let transfer_from: Vec<u64> =
450450
transfer.iter().map(|m| m.intersection(MappingFlags::FROM).bits()).collect();
451+
let valid_kernel_mappings = MappingFlags::LITERAL | MappingFlags::IMPLICIT;
451452
// FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
452-
let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
453+
let transfer_kernel: Vec<u64> = transfer
454+
.iter()
455+
.map(|m| (m.intersection(valid_kernel_mappings) | MappingFlags::TARGET_PARAM).bits())
456+
.collect();
453457

454458
let actual_sizes = sizes
455459
.iter()
456460
.map(|s| match s {
457461
OffloadSize::Static(sz) => *sz,
458-
OffloadSize::Dynamic => 0,
462+
OffloadSize::Dynamic(_) => 0,
459463
})
460464
.collect::<Vec<_>>();
461465
let offload_sizes =
@@ -542,12 +546,20 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
542546
}
543547

544548
fn get_runtime_size<'ll, 'tcx>(
545-
_cx: &CodegenCx<'ll, 'tcx>,
546-
_val: &'ll Value,
547-
_meta: &OffloadMetadata,
549+
builder: &mut Builder<'_, 'll, 'tcx>,
550+
args: &[&'ll Value],
551+
index: usize,
552+
meta: &OffloadMetadata,
548553
) -> &'ll Value {
549-
// FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
550-
bug!("offload does not support dynamic sizes yet");
554+
match meta.payload_size {
555+
OffloadSize::Dynamic(DynamicSize::Slice { element_size }) => {
556+
let length_idx = index + 1;
557+
let length = args[length_idx];
558+
let length_i64 = builder.intcast(length, builder.cx.type_i64(), false);
559+
builder.mul(length_i64, builder.cx.get_const_i64(element_size))
560+
}
561+
OffloadSize::Static(_) => bug!("expected dynamic size"),
562+
}
551563
}
552564

553565
// For each kernel *call*, we now use some of our previously declared globals to move data to and from
@@ -588,7 +600,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
588600
let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
589601
offload_dims;
590602

591-
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
603+
let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic(_)));
592604

593605
let tgt_decl = offload_globals.launcher_fn;
594606
let tgt_target_kernel_ty = offload_globals.launcher_ty;
@@ -683,9 +695,9 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
683695
let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
684696
builder.store(geps[i as usize], gep2, Align::EIGHT);
685697

686-
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
698+
if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic(_)) {
687699
let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
688-
let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
700+
let size_val = get_runtime_size(builder, args, i as usize, &metadata[i as usize]);
689701
builder.store(size_val, gep3, Align::EIGHT);
690702
}
691703
}

compiler/rustc_codegen_llvm/src/intrinsic.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,9 +1392,20 @@ fn codegen_offload<'ll, 'tcx>(
13921392
let sig = tcx.instantiate_bound_regions_with_erased(sig);
13931393
let inputs = sig.inputs();
13941394

1395-
let metadata = inputs.iter().map(|ty| OffloadMetadata::from_ty(tcx, *ty)).collect::<Vec<_>>();
1395+
let fn_abi = cx.fn_abi_of_instance(fn_target, ty::List::empty());
13961396

1397-
let types = inputs.iter().map(|ty| cx.layout_of(*ty).llvm_type(cx)).collect::<Vec<_>>();
1397+
let mut metadata = Vec::new();
1398+
let mut types = Vec::new();
1399+
1400+
for (i, arg_abi) in fn_abi.args.iter().enumerate() {
1401+
let ty = inputs[i];
1402+
let decomposed = OffloadMetadata::handle_abi(cx, tcx, ty, arg_abi);
1403+
1404+
for (meta, entry_ty) in decomposed {
1405+
metadata.push(meta);
1406+
types.push(bx.cx.layout_of(entry_ty).llvm_type(bx.cx));
1407+
}
1408+
}
13981409

13991410
let offload_globals_ref = cx.offload_globals.borrow();
14001411
let offload_globals = match offload_globals_ref.as_ref() {

compiler/rustc_middle/src/ty/offload_meta.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,29 @@
11
use bitflags::bitflags;
2+
use rustc_abi::{BackendRepr, TyAbiInterface};
3+
use rustc_target::callconv::ArgAbi;
24

35
use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
46

7+
#[derive(Debug, Copy, Clone)]
58
pub struct OffloadMetadata {
69
pub payload_size: OffloadSize,
710
pub mode: MappingFlags,
811
}
912

1013
#[derive(Debug, Copy, Clone)]
1114
pub enum OffloadSize {
12-
Dynamic,
1315
Static(u64),
16+
Dynamic(DynamicSize),
17+
}
18+
19+
#[derive(Debug, Copy, Clone)]
20+
pub enum DynamicSize {
21+
Slice { element_size: u64 },
1422
}
1523

1624
bitflags! {
1725
/// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
18-
#[derive(Debug, Copy, Clone)]
26+
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
1927
#[repr(transparent)]
2028
pub struct MappingFlags: u64 {
2129
/// No flags.
@@ -62,11 +70,38 @@ impl OffloadMetadata {
6270
mode: MappingFlags::from_ty(tcx, ty),
6371
}
6472
}
73+
74+
pub fn handle_abi<'tcx, C>(
75+
cx: &C,
76+
tcx: TyCtxt<'tcx>,
77+
ty: Ty<'tcx>,
78+
arg_abi: &ArgAbi<'tcx, Ty<'tcx>>,
79+
) -> Vec<(Self, Ty<'tcx>)>
80+
where
81+
Ty<'tcx>: TyAbiInterface<'tcx, C>,
82+
{
83+
match arg_abi.layout.backend_repr {
84+
BackendRepr::ScalarPair(_, _) => (0..2)
85+
.map(|i| {
86+
let ty = arg_abi.layout.field(cx, i).ty;
87+
(OffloadMetadata::from_ty(tcx, ty), ty)
88+
})
89+
.collect(),
90+
_ => vec![(OffloadMetadata::from_ty(tcx, ty), ty)],
91+
}
92+
}
6593
}
6694

6795
// FIXME(Sa4dUs): implement solid logic to determine the payload size
6896
fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
6997
match ty.kind() {
98+
ty::Slice(elem_ty) => {
99+
let layout = tcx.layout_of(PseudoCanonicalInput {
100+
typing_env: TypingEnv::fully_monomorphized(),
101+
value: *elem_ty,
102+
});
103+
OffloadSize::Dynamic(DynamicSize::Slice { element_size: layout.unwrap().size.bytes() })
104+
}
70105
ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
71106
_ => OffloadSize::Static(
72107
tcx.layout_of(PseudoCanonicalInput {
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//@ add-minicore
2+
//@ revisions: amdgpu nvptx
3+
//@[nvptx] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target nvptx64-nvidia-cuda --crate-type=rlib
4+
//@[nvptx] needs-llvm-components: nvptx
5+
//@[amdgpu] compile-flags: -Copt-level=3 -Zunstable-options -Zoffload=Device --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 --crate-type=rlib
6+
//@[amdgpu] needs-llvm-components: amdgpu
7+
//@ no-prefer-dynamic
8+
//@ needs-offload
9+
10+
#![feature(abi_gpu_kernel, rustc_attrs, no_core)]
11+
#![no_core]
12+
13+
extern crate minicore;
14+
15+
// CHECK: ; Function Attrs
16+
// nvptx-NEXT: define ptx_kernel void @foo
17+
// amdgpu-NEXT: define amdgpu_kernel void @foo
18+
// CHECK-SAME: ptr readnone captures(none) %dyn_ptr
19+
// nvptx-SAME: [2 x i64] %0
20+
// amdgpu-SAME: ptr noalias {{.*}} %0, i64 {{.*}} %1
21+
// CHECK-NEXT: entry:
22+
// CHECK-NEXT: ret void
23+
// CHECK-NEXT: }
24+
25+
#[unsafe(no_mangle)]
26+
#[rustc_offload_kernel]
27+
pub unsafe extern "gpu-kernel" fn foo(x: &[f32]) {}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
//@ compile-flags: -Zoffload=Test -Zunstable-options -C opt-level=1 -Clto=fat
2+
//@ no-prefer-dynamic
3+
//@ needs-offload
4+
5+
// This test verifies that offload handles slices correctly and passes them properly to the device.
6+
7+
#![feature(abi_gpu_kernel)]
8+
#![feature(rustc_attrs)]
9+
#![feature(core_intrinsics)]
10+
#![no_main]
11+
12+
// CHECK: @anon.[[ID:.*]].0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
13+
14+
// CHECK-DAG: @.offload_sizes.[[K:[^ ]*foo]] = private unnamed_addr constant [2 x i64] [i64 0, i64 8]
15+
// CHECK-DAG: @.offload_maptypes.[[K]].begin = private unnamed_addr constant [2 x i64] [i64 1, i64 768]
16+
// CHECK-DAG: @.offload_maptypes.[[K]].kernel = private unnamed_addr constant [2 x i64] [i64 32, i64 800]
17+
// CHECK-DAG: @.offload_maptypes.[[K]].end = private unnamed_addr constant [2 x i64] [i64 2, i64 0]
18+
19+
// CHECK: define{{( dso_local)?}} void @main()
20+
// CHECK: %.offload_sizes = alloca [2 x i64], align 8
21+
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}} %.offload_sizes, ptr {{.*}} @.offload_sizes.foo, i64 16, i1 false)
22+
// CHECK: store i64 16, ptr %.offload_sizes, align 8
23+
// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
24+
// CHECK: %11 = call i32 @__tgt_target_kernel(ptr nonnull @anon.[[ID]].1, i64 -1, i32 1, i32 1, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
25+
// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.[[ID]].1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
26+
27+
#[unsafe(no_mangle)]
28+
fn main() {
29+
let mut x = [0.0, 0.0, 0.0, 0.0];
30+
core::intrinsics::offload::<_, _, ()>(foo, [1, 1, 1], [1, 1, 1], ((&mut x) as &mut [f64],));
31+
}
32+
33+
unsafe extern "C" {
34+
pub fn foo(x: &mut [f32]);
35+
}

0 commit comments

Comments
 (0)