From 3b779b6d32466b3b97514eace0a43bd58619216c Mon Sep 17 00:00:00 2001
From: Flakebi <flakebi@t-online.de>
Date: Wed, 18 Feb 2026 06:22:27 +0100
Subject: [PATCH 1/4] Add intrinsic for launch-sized workgroup memory on GPUs

Workgroup memory is a memory region that is shared between all
threads in a workgroup on GPUs. Workgroup memory can be allocated
statically or after compilation, when launching a gpu-kernel.
The intrinsic added here returns the pointer to the memory that is
allocated at launch-time.

# Interface

With this change, workgroup memory can be accessed in Rust by
calling the new `gpu_launch_sized_workgroup_mem<T>() -> *mut T`
intrinsic.

It returns the pointer to workgroup memory, guaranteeing that it is
aligned to at least the alignment of `T`.
The pointer is dereferenceable for the size specified when launching the
current gpu-kernel (which may be the size of `T` but can also be larger
or smaller or zero).

All calls to this intrinsic return a pointer to the same address.

See the intrinsic documentation for more details.

## Alternative Interfaces

It was also considered to expose dynamic workgroup memory as extern
static variables in Rust, like they are represented in LLVM IR.
However, due to the pointer not being guaranteed to be dereferenceable
(that depends on the allocated size at runtime), such a global must be
zero-sized, which makes global variables a bad fit.

# Implementation Details

Workgroup memory in amdgpu and nvptx lives in address space 3.
Workgroup memory from a launch is implemented by creating an
external global variable in address space 3. The global is declared with
size 0, as the actual size is only known at runtime. It is defined
behavior in LLVM to access an external global outside the defined size.

There is no similar way to get the allocated size of launch-sized
workgroup memory on amdgpu and nvptx, so users have to pass this
out-of-band or rely on target-specific ways for now.
---
 compiler/rustc_abi/src/lib.rs                 |  3 ++
 compiler/rustc_codegen_llvm/src/declare.rs    | 23 ++++++++++
 compiler/rustc_codegen_llvm/src/intrinsic.rs  | 43 ++++++++++++++++++-
 compiler/rustc_codegen_llvm/src/llvm/ffi.rs   |  7 +++
 .../rustc_codegen_ssa/src/mir/intrinsic.rs    |  1 +
 .../rustc_hir_analysis/src/check/intrinsic.rs |  2 +
 .../rustc_llvm/llvm-wrapper/RustWrapper.cpp   | 26 ++++++++---
 compiler/rustc_span/src/symbol.rs             |  1 +
 library/core/src/intrinsics/gpu.rs            | 40 +++++++++++++++++
 src/tools/tidy/src/style.rs                   |  4 ++
 .../gpu-launch-sized-workgroup-memory.rs      | 32 ++++++++++++++
 11 files changed, 175 insertions(+), 7 deletions(-)
 create mode 100644 tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs
diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs
index 253dff6f8e75c..7bf28640818b5 100644
--- a/compiler/rustc_abi/src/lib.rs
+++ b/compiler/rustc_abi/src/lib.rs
@@ -1700,6 +1700,9 @@ pub struct AddressSpace(pub u32);
 impl AddressSpace {
     /// LLVM's `0` address space.
     pub const ZERO: Self = AddressSpace(0);
+    /// The address space for workgroup memory on nvptx and amdgpu.
+    /// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details.
+    pub const GPU_WORKGROUP: Self = AddressSpace(3);
 }
 
 /// The way we represent values to the backend
diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs
index 8f69f176138cf..8c5fcd36fa69b 100644
--- a/compiler/rustc_codegen_llvm/src/declare.rs
+++ b/compiler/rustc_codegen_llvm/src/declare.rs
@@ -14,6 +14,7 @@
 use std::borrow::Borrow;
 
 use itertools::Itertools;
+use rustc_abi::AddressSpace;
 use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods;
 use rustc_data_structures::fx::FxIndexSet;
 use rustc_middle::ty::{Instance, Ty};
@@ -97,6 +98,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
             )
         }
     }
+
+    /// Declare a global value in a specific address space.
+    ///
+    /// If there’s a value with the same name already declared, the function will
+    /// return its Value instead.
+    pub(crate) fn declare_global_in_addrspace(
+        &self,
+        name: &str,
+        ty: &'ll Type,
+        addr_space: AddressSpace,
+    ) -> &'ll Value {
+        debug!("declare_global_in_addrspace(name={name:?}, addrspace={addr_space:?})");
+        unsafe {
+            llvm::LLVMRustGetOrInsertGlobalInAddrspace(
+                (**self).borrow().llmod,
+                name.as_c_char_ptr(),
+                name.len(),
+                ty,
+                addr_space.0,
+            )
+        }
+    }
 }
 
 impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index cc6ecee60b0e4..182dd162958ca 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -3,7 +3,8 @@ use std::ffi::c_uint;
 use std::{assert_matches, ptr};
 
 use rustc_abi::{
-    Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, WrappingRange,
+    AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
+    WrappingRange,
 };
 use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
 use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
@@ -23,7 +24,7 @@ use rustc_session::config::CrateType;
 use rustc_span::{Span, Symbol, sym};
 use rustc_symbol_mangling::{mangle_internal_symbol, symbol_name_for_instance_in_crate};
 use rustc_target::callconv::PassMode;
-use rustc_target::spec::Os;
+use rustc_target::spec::{Arch, Os};
 use tracing::debug;
 
 use crate::abi::FnAbiLlvmExt;
@@ -590,6 +591,44 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                 return Ok(());
             }
 
+            sym::gpu_launch_sized_workgroup_mem => {
+                // Generate an anonymous global per call, with these properties:
+                // 1. The global is in the address space for workgroup memory
+                // 2. It is an `external` global
+                // 3. It is correctly aligned for the pointee `T`
+                // All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend.
+                // The name is irrelevant.
+                // See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
+                // FIXME: Work around an nvptx backend issue: extern globals must have a name.
+                let name = if tcx.sess.target.arch == Arch::Nvptx64 {
+                    "gpu_launch_sized_workgroup_mem"
+                } else {
+                    ""
+                };
+                let global = self.declare_global_in_addrspace(
+                    name,
+                    self.type_array(self.type_i8(), 0),
+                    AddressSpace::GPU_WORKGROUP,
+                );
+                let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
+                // The alignment of the global is used to specify the *minimum* alignment that
+                // must be obeyed by the GPU runtime.
+                // When multiple of these global variables are used by a kernel, the maximum alignment is taken.
+                // See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
+                let alignment = self.align_of(*inner_ty).bytes() as u32;
+                unsafe {
+                    // FIXME Workaround the above issue by taking maximum alignment if the global existed
+                    if tcx.sess.target.arch == Arch::Nvptx64 {
+                        if alignment > llvm::LLVMGetAlignment(global) {
+                            llvm::LLVMSetAlignment(global, alignment);
+                        }
+                    } else {
+                        llvm::LLVMSetAlignment(global, alignment);
+                    }
+                }
+                self.cx().const_pointercast(global, self.type_ptr())
+            }
+
             sym::amdgpu_dispatch_ptr => {
                 let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
                 // Relying on `LLVMBuildPointerCast` to produce an addrspacecast
diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
index 77438472644fc..70653e3a85d34 100644
--- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
+++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -1986,6 +1986,13 @@ unsafe extern "C" {
         NameLen: size_t,
         T: &'a Type,
     ) -> &'a Value;
+    pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
+        M: &'a Module,
+        Name: *const c_char,
+        NameLen: size_t,
+        T: &'a Type,
+        AddressSpace: c_uint,
+    ) -> &'a Value;
     pub(crate) fn LLVMRustGetNamedValue(
         M: &Module,
         Name: *const c_char,
diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
index fd0c7c656ac21..f4a5e8baa2a5f 100644
--- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
@@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                 sym::abort
                 | sym::unreachable
                 | sym::cold_path
+                | sym::gpu_launch_sized_workgroup_mem
                 | sym::breakpoint
                 | sym::amdgpu_dispatch_ptr
                 | sym::assert_zero_valid
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 47420997a509a..86dc3f1d85420 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -134,6 +134,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
         | sym::forget
         | sym::frem_algebraic
         | sym::fsub_algebraic
+        | sym::gpu_launch_sized_workgroup_mem
         | sym::is_val_statically_known
         | sym::log2f16
         | sym::log2f32
@@ -301,6 +302,7 @@ pub(crate) fn check_intrinsic_type(
         sym::field_offset => (1, 0, vec![], tcx.types.usize),
         sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
         sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
+        sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
         sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
             (1, 0, vec![], tcx.types.unit)
         }
diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
index 63ff0b2a0a0df..298a26ad9b91c 100644
--- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
+++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@@ -294,10 +294,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
                   .getCallee());
 }
 
-extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
-                                                  const char *Name,
-                                                  size_t NameLen,
-                                                  LLVMTypeRef Ty) {
+// Get the global variable with the given name if it exists or create a new
+// external global in the given address space.
+extern "C" LLVMValueRef
+LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
+                                     size_t NameLen, LLVMTypeRef Ty,
+                                     unsigned int AddressSpace) {
   Module *Mod = unwrap(M);
   auto NameRef = StringRef(Name, NameLen);
 
@@ -308,10 +310,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
   GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
   if (!GV)
     GV = new GlobalVariable(*Mod, unwrap(Ty), false,
-                            GlobalValue::ExternalLinkage, nullptr, NameRef);
+                            GlobalValue::ExternalLinkage, nullptr, NameRef,
+                            nullptr, GlobalValue::NotThreadLocal, AddressSpace);
   return wrap(GV);
 }
 
+// Get the global variable with the given name if it exists or create a new
+// external global in the data layout's default globals address space.
+extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
+                                                  const char *Name,
+                                                  size_t NameLen,
+                                                  LLVMTypeRef Ty) {
+  Module *Mod = unwrap(M);
+  unsigned int AddressSpace =
+      Mod->getDataLayout().getDefaultGlobalsAddressSpace();
+  return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty,
+                                              AddressSpace);
+}
+
 // Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
 enum class LLVMRustAttributeKind {
   AlwaysInline = 0,
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index 286fea7d90505..1ac1f98f3c3df 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -1026,6 +1026,7 @@ symbols! {
         global_asm,
         global_registration,
         globs,
+        gpu_launch_sized_workgroup_mem,
         gt,
         guard_patterns,
         half_open_range_patterns,
diff --git a/library/core/src/intrinsics/gpu.rs b/library/core/src/intrinsics/gpu.rs
index 9e7624841d0c6..ce73008d1ab14 100644
--- a/library/core/src/intrinsics/gpu.rs
+++ b/library/core/src/intrinsics/gpu.rs
@@ -5,6 +5,46 @@
 
 #![unstable(feature = "gpu_intrinsics", issue = "none")]
 
+/// Returns the pointer to workgroup memory allocated at launch-time on GPUs.
+///
+/// Workgroup memory is a memory region that is shared between all threads in
+/// the same workgroup. It is faster to access than other memory but pointers do not
+/// work outside the workgroup where they were obtained.
+/// Workgroup memory can be allocated statically or after compilation, when
+/// launching a gpu-kernel. `gpu_launch_sized_workgroup_mem` returns the pointer to
+/// the memory that is allocated at launch-time.
+/// The size of this memory can differ between launches of a gpu-kernel, depending on
+/// what is specified at launch-time.
+/// However, the alignment is fixed by the kernel itself, at compile-time.
+///
+/// The returned pointer is the start of the workgroup memory region that is
+/// allocated at launch-time.
+/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, independent of the
+/// generic type, return the same address, so alias the same memory.
+/// The returned pointer is aligned to at least the alignment of `T`.
+///
+/// # Safety
+///
+/// The pointer is safe to dereference from the start (the returned pointer) up to the
+/// size of workgroup memory that was specified when launching the current gpu-kernel.
+/// This allocated size is not related in any way to `T`.
+///
+/// The user must take care of synchronizing access to workgroup memory between
+/// threads in a workgroup. The usual data race requirements apply.
+///
+/// # Other APIs
+///
+/// CUDA and HIP call this dynamic shared memory, shared between threads in a block.
+/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
+/// GLSL calls this shared memory, shared between invocations in a work group.
+/// DirectX calls this groupshared memory, shared between threads in a thread-group.
+#[must_use = "returns a pointer that does nothing unless used"]
+#[rustc_intrinsic]
+#[rustc_nounwind]
+#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")]
+#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
+pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;
+
 /// Returns a pointer to the HSA kernel dispatch packet.
 ///
 /// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet.
diff --git a/src/tools/tidy/src/style.rs b/src/tools/tidy/src/style.rs
index 77c15672e0022..26e1f3d0d94fc 100644
--- a/src/tools/tidy/src/style.rs
+++ b/src/tools/tidy/src/style.rs
@@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool {
         || static_regex!(
             "\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*"
         ).is_match(line)
+        // Matching for FileCheck checks
+        || static_regex!(
+            "\\s*// [a-zA-Z0-9-_]*:\\s.*"
+        ).is_match(line)
 }
 
 /// Returns `true` if `line` is allowed to be longer than the normal limit.
diff --git a/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs
new file mode 100644
index 0000000000000..8f198d0b26ab3
--- /dev/null
+++ b/tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs
@@ -0,0 +1,32 @@
+// Checks that the GPU intrinsic to get launch-sized workgroup memory works
+// and correctly aligns the `external addrspace(...) global`s over multiple calls.
+
+//@ revisions: amdgpu nvptx
+//@ compile-flags: --crate-type=rlib -Copt-level=1
+//
+//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
+//@ [amdgpu] needs-llvm-components: amdgpu
+//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda
+//@ [nvptx] needs-llvm-components: nvptx
+//@ add-minicore
+#![feature(intrinsics, no_core, rustc_attrs)]
+#![no_core]
+
+extern crate minicore;
+
+#[rustc_intrinsic]
+#[rustc_nounwind]
+fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;
+
+// amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4
+// amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
+// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }
+
+// nvptx: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
+// nvptx: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }
+#[unsafe(no_mangle)]
+pub fn fun() -> (*mut i32, *mut f64) {
+    let small = gpu_launch_sized_workgroup_mem::<i32>();
+    let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8
+    (small, big)
+}

From 27a7052eca5d988068bdbe988e13bbffa3ecfb4f Mon Sep 17 00:00:00 2001
From: Manuel Drehwald <git@manuel.drehwald.info>
Date: Sat, 4 Apr 2026 16:39:41 -0700
Subject: [PATCH 2/4] test allocating 128bytes of shared memory on each kernel
 launch

---
 compiler/rustc_codegen_llvm/src/builder.rs    | 25 ++++++++++++
 .../src/builder/gpu_offload.rs                | 39 +++++++++++--------
 2 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 2d91caf40f3c9..d4ad88aabdd73 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -152,6 +152,31 @@ impl<'a, 'll, CX: Borrow<SCx<'ll>>> GenericBuilder<'a, 'll, CX> {
         val
     }
 
+    /// Builds an `inbounds` GEP and labels the resulting value `name` unless `name` is empty.
+    pub(crate) fn named_inbounds_gep(
+        &mut self,
+        ty: &'ll Type,
+        ptr: &'ll Value,
+        indices: &[&'ll Value],
+        name: &str,
+    ) -> &'ll Value {
+        let val = unsafe {
+            llvm::LLVMBuildGEPWithNoWrapFlags(
+                self.llbuilder,
+                ty,
+                ptr,
+                indices.as_ptr(),
+                indices.len() as c_uint,
+                UNNAMED,
+                GEPNoWrapFlags::InBounds,
+            )
+        };
+        if !name.is_empty() {
+            llvm::set_value_name(val, name.as_bytes());
+        }
+        val
+    }
+
     pub(crate) fn inbounds_gep(
         &mut self,
         ty: &'ll Type,
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index 18a95e810bee4..014b138497d02 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -319,25 +319,25 @@ impl KernelArgsTy {
         geps: [&'ll Value; 3],
         workgroup_dims: &'ll Value,
         thread_dims: &'ll Value,
-    ) -> [(Align, &'ll Value); 13] {
+    ) -> [(Align, &'ll str, &'ll Value); 13] {
         let four = Align::from_bytes(4).expect("4 Byte alignment should work");
         let eight = Align::EIGHT;
 
         [
-            (four, cx.get_const_i32(KernelArgsTy::OFFLOAD_VERSION)),
-            (four, cx.get_const_i32(num_args)),
-            (eight, geps[0]),
-            (eight, geps[1]),
-            (eight, geps[2]),
-            (eight, memtransfer_types),
-            // The next two are debug infos. FIXME(offload): set them
-            (eight, cx.const_null(cx.type_ptr())), // dbg
-            (eight, cx.const_null(cx.type_ptr())), // dbg
-            (eight, cx.get_const_i64(KernelArgsTy::TRIPCOUNT)),
-            (eight, cx.get_const_i64(KernelArgsTy::FLAGS)),
-            (four, workgroup_dims),
-            (four, thread_dims),
-            (four, cx.get_const_i32(0)),
+            (four, "Version", cx.get_const_i32(KernelArgsTy::OFFLOAD_VERSION)),
+            (four, "NumArgs", cx.get_const_i32(num_args)),
+            (eight, "ArgBasePtrs", geps[0]),
+            (eight, "ArgPtrs", geps[1]),
+            (eight, "ArgSizes", geps[2]),
+            (eight, "ArgTypes", memtransfer_types),
+            // The next two are debug infos. FIXME(offload): set them
+            (eight, "ArgNames", cx.const_null(cx.type_ptr())), // dbg
+            (eight, "ArgMappers", cx.const_null(cx.type_ptr())), // dbg
+            (eight, "Tripcount", cx.get_const_i64(KernelArgsTy::TRIPCOUNT)),
+            (eight, "Flags", cx.get_const_i64(KernelArgsTy::FLAGS)),
+            (four, "NumTeams", workgroup_dims),
+            (four, "ThreadLimit", thread_dims),
+            (four, "DynCGroupMem", cx.get_const_i32(128)),
         ]
     }
 }
@@ -746,8 +746,13 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     // Step 3)
     // Here we fill the KernelArgsTy, see the documentation above
     for (i, value) in values.iter().enumerate() {
-        let ptr = builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]);
-        builder.store(value.1, ptr, value.0);
+        let ptr = builder.named_inbounds_gep(
+            tgt_kernel_decl,
+            a5,
+            &[i32_0, cx.get_const_i32(i as u64)],
+            value.1,
+        );
+        builder.store(value.2, ptr, value.0);
     }
 
     let args = vec![

From 4705da81ca33a0067dd798552abd7ced80bb69cf Mon Sep 17 00:00:00 2001
From: Manuel Drehwald <git@manuel.drehwald.info>
Date: Sun, 5 Apr 2026 12:56:02 -0700
Subject: [PATCH 3/4] wip

---
 .../src/builder/gpu_offload.rs                  | 15 ++++++++++++---
 compiler/rustc_codegen_llvm/src/intrinsic.rs    | 17 +++++++++++++++--
 .../rustc_hir_analysis/src/check/intrinsic.rs   |  1 +
 library/core/src/intrinsics/mod.rs              |  1 +
 4 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index 014b138497d02..0772d32f612e0 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -319,6 +319,7 @@ impl KernelArgsTy {
         geps: [&'ll Value; 3],
         workgroup_dims: &'ll Value,
         thread_dims: &'ll Value,
+        dyn_cache: &'ll Value,
     ) -> [(Align, &'ll str, &'ll Value); 13] {
         let four = Align::from_bytes(4).expect("4 Byte alignment should work");
         let eight = Align::EIGHT;
@@ -337,7 +338,7 @@ impl KernelArgsTy {
             (eight, "Flags", cx.get_const_i64(KernelArgsTy::FLAGS)),
             (four, "NumTeams", workgroup_dims),
             (four, "ThreadLimit", thread_dims),
-            (four, "DynCGroupMem", cx.get_const_i32(128)),
+            (four, "DynCGroupMem", dyn_cache),
         ]
     }
 }
@@ -576,6 +577,7 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     metadata: &[OffloadMetadata],
     offload_globals: &OffloadGlobals<'ll>,
     offload_dims: &OffloadKernelDims<'ll>,
+    dyn_cache: &'ll Value,
 ) {
     let cx = builder.cx;
     let OffloadKernelGlobals {
@@ -740,8 +742,15 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
         num_args,
         s_ident_t,
     );
-    let values =
-        KernelArgsTy::new(&cx, num_args, memtransfer_kernel, geps, workgroup_dims, thread_dims);
+    let values = KernelArgsTy::new(
+        &cx,
+        num_args,
+        memtransfer_kernel,
+        geps,
+        workgroup_dims,
+        thread_dims,
+        dyn_cache,
+    );
 
     // Step 3)
     // Here we fill the KernelArgsTy, see the documentation above
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 182dd162958ca..6c6ac584fce8c 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -1462,7 +1462,11 @@ fn codegen_offload<'ll, 'tcx>(
     };
 
     let offload_dims = OffloadKernelDims::from_operands(bx, &args[1], &args[2]);
-    let args = get_args_from_tuple(bx, args[3], fn_target);
+    let args = get_args_from_tuple(bx, args[4], fn_target);
+    //match tuple_op.val {
+    //    OperandValue::Immediate(val) => vec![val],
+    //let dyn_cache = args[3];
+    let dyn_cache = bx.const_i32(512);
     let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
 
     let sig = tcx.fn_sig(fn_target.def_id()).skip_binder();
@@ -1483,7 +1487,16 @@ fn codegen_offload<'ll, 'tcx>(
     };
     register_offload(cx);
     let offload_data = gen_define_handling(&cx, &metadata, target_symbol, offload_globals);
-    gen_call_handling(bx, &offload_data, &args, &types, &metadata, offload_globals, &offload_dims);
+    gen_call_handling(
+        bx,
+        &offload_data,
+        &args,
+        &types,
+        &metadata,
+        offload_globals,
+        &offload_dims,
+        &dyn_cache,
+    );
 }
 
 fn get_args_from_tuple<'ll, 'tcx>(
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 86dc3f1d85420..619460f03e214 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -359,6 +359,7 @@ pub(crate) fn check_intrinsic_type(
                 param(0),
                 Ty::new_array_with_const_len(tcx, tcx.types.u32, Const::from_target_usize(tcx, 3)),
                 Ty::new_array_with_const_len(tcx, tcx.types.u32, Const::from_target_usize(tcx, 3)),
+                tcx.types.u32,
                 param(1),
             ],
             param(2),
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 68e4f1c2aa787..37dcc61f63eb8 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -3530,6 +3530,7 @@ pub const fn offload<F, T: crate::marker::Tuple, R>(
     f: F,
     workgroup_dim: [u32; 3],
     thread_dim: [u32; 3],
+    dyn_cache: u32,
     args: T,
 ) -> R;
 

From 1c7ea71723dfaa662e17db838e6f96b7dd3b376e Mon Sep 17 00:00:00 2001
From: Manuel Drehwald <git@manuel.drehwald.info>
Date: Sun, 5 Apr 2026 13:19:41 -0700
Subject: [PATCH 4/4] wip

---
 compiler/rustc_codegen_llvm/src/intrinsic.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index 6c6ac584fce8c..33c8ce7d5efc9 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -1462,11 +1462,15 @@ fn codegen_offload<'ll, 'tcx>(
     };
 
     let offload_dims = OffloadKernelDims::from_operands(bx, &args[1], &args[2]);
+    let dyn_cache = match args[3].val {
+        OperandValue::Immediate(val) => val,
+        _ => panic!("unparsable"),
+    };
+    //let dyn_cache = args[3]; //bx.const_i32(512);
+    dbg!(&dyn_cache);
     let args = get_args_from_tuple(bx, args[4], fn_target);
-    //match tuple_op.val {
-    //    OperandValue::Immediate(val) => vec![val],
     //let dyn_cache = args[3];
-    let dyn_cache = bx.const_i32(512);
+    //llvm::Dump(&dyn_cache);
     let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
 
     let sig = tcx.fn_sig(fn_target.def_id()).skip_binder();