Skip to content

Commit b60ff30

Browse files
committed
Remove the need to call clang for std::offload usages
1 parent 864339a commit b60ff30

File tree

7 files changed

+165
-40
lines changed

7 files changed

+165
-40
lines changed

compiler/rustc_codegen_llvm/src/back/write.rs

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -707,11 +707,9 @@ pub(crate) unsafe fn llvm_optimize(
707707
llvm::set_value_name(new_fn, &name);
708708
}
709709

710-
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
710+
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
711711
let cx =
712712
SimpleCx::new(module.module_llvm.llmod(), module.module_llvm.llcx, cgcx.pointer_size);
713-
// For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
714-
// introducing a proper offload intrinsic to solve this limitation.
715713
for func in cx.get_functions() {
716714
let offload_kernel = "offload-kernel";
717715
if attributes::has_string_attr(func, offload_kernel) {
@@ -773,12 +771,79 @@ pub(crate) unsafe fn llvm_optimize(
773771
)
774772
};
775773

776-
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
774+
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Device) {
775+
let device_path = cgcx.output_filenames.path(OutputType::Object);
776+
let device_dir = device_path.parent().unwrap();
777+
let device_out = device_dir.join("host.out");
778+
let device_out_c = path_to_c_string(device_out.as_path());
777779
unsafe {
778-
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
780+
// 1) Bundle device module into offload image host.out (device TM)
781+
let ok = llvm::LLVMRustBundleImages(
782+
module.module_llvm.llmod(),
783+
module.module_llvm.tm.raw(),
784+
device_out_c.as_ptr(),
785+
);
786+
assert!(ok, "LLVMRustBundleImages (device -> host.out) failed");
787+
if !device_out.exists() {
788+
panic!("BundleImages failed, `host.out` was not created!");
789+
}
779790
}
780791
}
781792

793+
// This assumes that we previously compiled our kernels for a gpu target, which created a
794+
// `host.out` artifact. The user is supposed to provide us with a path to this artifact, we
795+
// don't need any other artifacts from the previous run. We will embed this artifact into our
796+
// LLVM-IR host module, to create a `host.o` ObjectFile, which we will write to disk.
797+
// The last, not yet automated step uses the `clang-linker-wrapper` to process `host.o`.
798+
if !cgcx.target_is_like_gpu {
799+
if let Some(device_path) = config
800+
.offload
801+
.iter()
802+
.find_map(|o| if let config::Offload::Host(path) = o { Some(path) } else { None })
803+
{
804+
let device_pathbuf = PathBuf::from(device_path);
805+
if device_pathbuf.is_relative() {
806+
panic!("Absolute path is needed");
807+
} else if device_pathbuf
808+
.file_name()
809+
.and_then(|n| n.to_str())
810+
.is_some_and(|n| n != "host.out")
811+
{
812+
panic!("Need path to the host.out file");
813+
}
814+
assert!(device_pathbuf.exists());
815+
let host_path = cgcx.output_filenames.path(OutputType::Object);
816+
let host_dir = host_path.parent().unwrap();
817+
let out_obj = host_dir.join("host.o");
818+
let host_out_c = path_to_c_string(host_path.as_path());
819+
820+
// 2) Finalize host: lib.bc + host.out -> host.o (host TM)
821+
// We create a full clone of our LLVM host module, since we will embed the device IR
822+
// into it, and this might break caching or incremental compilation otherwise.
823+
let llmod2 = llvm::LLVMCloneModule(module.module_llvm.llmod());
824+
let ok =
825+
unsafe { llvm::LLVMRustOffloadEmbedBufferInModule(llmod2, host_out_c.as_ptr()) };
826+
assert!(ok, "LLVMRustOffloadEmbedBufferInModule failed");
827+
write_output_file(
828+
dcx,
829+
module.module_llvm.tm.raw(),
830+
config.no_builtins,
831+
llmod2,
832+
&out_obj,
833+
None,
834+
llvm::FileType::ObjectFile,
835+
&cgcx.prof,
836+
true,
837+
);
838+
if !out_obj.exists() {
839+
dbg!("{:?} does not exist!", out_obj);
840+
panic!("FinalizeOffload failed!");
841+
}
842+
// We ignore cgcx.save_temps here and unconditionally keep our `host.out` artifact.
843+
// Otherwise, recompiling the host code would fail since we deleted that device artifact
844+
// in the previous host compilation, which would be confusing at best.
845+
}
846+
}
782847
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
783848
}
784849

compiler/rustc_codegen_llvm/src/intrinsic.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -202,13 +202,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
202202
return Ok(());
203203
}
204204
sym::offload => {
205-
if !tcx
206-
.sess
207-
.opts
208-
.unstable_opts
209-
.offload
210-
.contains(&rustc_session::config::Offload::Enable)
211-
{
205+
if tcx.sess.opts.unstable_opts.offload.is_empty() {
212206
let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
213207
}
214208

compiler/rustc_codegen_llvm/src/llvm/ffi.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1722,7 +1722,15 @@ mod Offload {
17221722
use super::*;
17231723
unsafe extern "C" {
17241724
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
1725-
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
1725+
pub(crate) fn LLVMRustBundleImages<'a>(
1726+
M: &'a Module,
1727+
TM: &'a TargetMachine,
1728+
host_out: *const c_char,
1729+
) -> bool;
1730+
pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>(
1731+
_M: &'a Module,
1732+
_host_out: *const c_char,
1733+
) -> bool;
17261734
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
17271735
}
17281736
}
@@ -1736,7 +1744,17 @@ mod Offload_fallback {
17361744
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
17371745
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
17381746
#[allow(unused_unsafe)]
1739-
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
1747+
pub(crate) unsafe fn LLVMRustBundleImages<'a>(
1748+
_M: &'a Module,
1749+
_TM: &'a TargetMachine,
1750+
_host_out: *const c_char,
1751+
) -> bool {
1752+
unimplemented!("This rustc version was not built with LLVM Offload support!");
1753+
}
1754+
pub(crate) unsafe fn LLVMRustOffloadEmbedBufferInModule<'a>(
1755+
_M: &'a Module,
1756+
_host_out: *const c_char,
1757+
) -> bool {
17401758
unimplemented!("This rustc version was not built with LLVM Offload support!");
17411759
}
17421760
#[allow(unused_unsafe)]

compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,10 @@
4343
// available. As such, we only try to build it in the first place, if
4444
// llvm.offload is enabled.
4545
#ifdef OFFLOAD
46+
#include "llvm/Bitcode/BitcodeReader.h"
4647
#include "llvm/Object/OffloadBinary.h"
4748
#include "llvm/Target/TargetMachine.h"
49+
#include "llvm/Transforms/Utils/ModuleUtils.h"
4850
#endif
4951

5052
// for raw `write` in the bad-alloc handler
@@ -174,12 +176,13 @@ static Error writeFile(StringRef Filename, StringRef Data) {
174176
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
175177
// The input module is the rust code compiled for a gpu target like amdgpu.
176178
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
177-
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
179+
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM,
180+
const char *HostOutPath) {
178181
std::string Storage;
179182
llvm::raw_string_ostream OS1(Storage);
180183
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
181184
OS1.flush();
182-
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
185+
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "device.bc");
183186

184187
SmallVector<char, 1024> BinaryData;
185188
raw_svector_ostream OS2(BinaryData);
@@ -188,19 +191,48 @@ extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
188191
ImageBinary.TheImageKind = object::IMG_Bitcode;
189192
ImageBinary.Image = std::move(MB);
190193
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
191-
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
192-
ImageBinary.StringData["arch"] = TM.getTargetCPU();
194+
195+
std::string TripleStr = TM.getTargetTriple().str();
196+
llvm::StringRef CPURef = TM.getTargetCPU();
197+
ImageBinary.StringData["triple"] = TripleStr;
198+
ImageBinary.StringData["arch"] = CPURef;
193199
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
194200
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
195201
// Offload binary has invalid size alignment
196202
return false;
197203
OS2 << Buffer;
198-
if (Error E = writeFile("host.out",
204+
if (Error E = writeFile(HostOutPath,
199205
StringRef(BinaryData.begin(), BinaryData.size())))
200206
return false;
201207
return true;
202208
}
203209

210+
Expected<std::unique_ptr<Module>>
211+
loadHostModuleFromBitcode(LLVMContext &Ctx, StringRef LibBCPath) {
212+
auto MBOrErr = MemoryBuffer::getFile(LibBCPath);
213+
if (!MBOrErr)
214+
return errorCodeToError(MBOrErr.getError());
215+
216+
MemoryBufferRef Ref = (*MBOrErr)->getMemBufferRef();
217+
return parseBitcodeFile(Ref, Ctx);
218+
}
219+
220+
extern "C" bool LLVMRustOffloadEmbedBufferInModule(LLVMModuleRef HostM,
221+
const char *HostOutPath) {
222+
auto MBOrErr = MemoryBuffer::getFile(HostOutPath);
223+
if (!MBOrErr) {
224+
auto E = MBOrErr.getError();
225+
auto _B = errorCodeToError(E);
226+
return false;
227+
}
228+
MemoryBufferRef Buf = (*MBOrErr)->getMemBufferRef();
229+
Module *M = unwrap(HostM);
230+
StringRef SectionName = ".llvm.offloading";
231+
Align Alignment = Align(8);
232+
llvm::embedBufferInModule(*M, Buf, SectionName, Alignment);
233+
return true;
234+
}
235+
204236
extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
205237
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
206238
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);

compiler/rustc_session/src/config.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,10 +193,12 @@ pub enum CoverageLevel {
193193
}
194194

195195
// The different settings that the `-Z offload` flag can have.
196-
#[derive(Clone, Copy, PartialEq, Hash, Debug)]
196+
#[derive(Clone, PartialEq, Hash, Debug)]
197197
pub enum Offload {
198-
/// Enable the llvm offload pipeline
199-
Enable,
198+
/// Entry point for `std::offload`, enables kernel compilation for a gpu device
199+
Device,
200+
/// Second step in the offload pipeline, generates the host code to call kernels.
201+
Host(String),
200202
}
201203

202204
/// The different settings that the `-Z autodiff` flag can have.
@@ -2631,9 +2633,7 @@ pub fn build_session_options(early_dcx: &mut EarlyDiagCtxt, matches: &getopts::M
26312633
)
26322634
}
26332635

2634-
if !nightly_options::is_unstable_enabled(matches)
2635-
&& unstable_opts.offload.contains(&Offload::Enable)
2636-
{
2636+
if !nightly_options::is_unstable_enabled(matches) && !unstable_opts.offload.is_empty() {
26372637
early_dcx.early_fatal(
26382638
"`-Zoffload=Enable` also requires `-Zunstable-options` \
26392639
and a nightly compiler",

compiler/rustc_session/src/options.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,8 +1459,27 @@ pub mod parse {
14591459
let mut v: Vec<&str> = v.split(",").collect();
14601460
v.sort_unstable();
14611461
for &val in v.iter() {
1462-
let variant = match val {
1463-
"Enable" => Offload::Enable,
1462+
// Split each entry on '=' if it has an argument
1463+
let (key, arg) = match val.split_once('=') {
1464+
Some((k, a)) => (k, Some(a)),
1465+
None => (val, None),
1466+
};
1467+
1468+
let variant = match key {
1469+
"Host" => {
1470+
if let Some(p) = arg {
1471+
Offload::Host(p.to_string())
1472+
} else {
1473+
return false;
1474+
}
1475+
}
1476+
"Device" => {
1477+
if let Some(_) = arg {
1478+
// Device does not accept a value
1479+
return false;
1480+
}
1481+
Offload::Device
1482+
}
14641483
_ => {
14651484
// FIXME(ZuseZ4): print an error saying which value is not recognized
14661485
return false;

src/doc/rustc-dev-guide/src/offload/usage.md

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -77,28 +77,25 @@ pub extern "gpu-kernel" fn kernel_1(x: *mut [f64; 256]) {
7777
## Compile instructions
7878
It is important to use a clang compiler built on the same LLVM as rustc. Just calling clang without the full path will likely use your system clang, which will probably be incompatible. So either substitute the clang/lld invocations below with absolute paths, or set your `PATH` accordingly.
7979

80-
First we generate the host (cpu) code. The first build is just to compile libc, take note of the hashed path. Then we call rustc directly to build our host code, while providing the libc artifact to rustc.
80+
First we generate the device (gpu) code. Replace the target-cpu with the right code for your gpu.
8181
```
82-
cargo +offload build -r -v
83-
rustc +offload --edition 2024 src/lib.rs -g --crate-type cdylib -C opt-level=3 -C panic=abort -C lto=fat -L dependency=/absolute_path_to/target/release/deps --extern libc=/absolute_path_to/target/release/deps/liblibc-<HASH>.rlib --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options
82+
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Device -Csave-temps -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
8483
```
84+
You might afterwards need to copy your target/release/deps/<lib_name>.bc to lib.bc for now, before the next step.
8585

86-
Now we generate the device code. Replace the target-cpu with the right code for your gpu.
86+
Now we generate the host (cpu) code.
8787
```
88-
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
88+
RUSTFLAGS="--emit=llvm-bc,llvm-ir -Csave-temps -Zoffload=Host=/absolute_path_to/target/amdgcn-amd-amdhsa/release/deps/host.out -Zunstable-options" cargo +offload build -r
8989
```
90-
90+
This call also does a lot of work and generates multiple intermediate files for llvm offload.
91+
While most offload steps are now integrated into rustc, one binary invocation still remains:
9192

9293
```
93-
"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"
94-
95-
"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"
96-
97-
"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOlUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
94+
"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOlUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "target/<GPU_DIR>/release/host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
9895
```
9996

100-
Especially for the last three commands I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the invidual steps.
101-
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
97+
You can try to find the paths to those files on your system. However, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
98+
It will show multiple steps; just look for the clang-linker-wrapper invocation. Make sure to still include the path to the `host.o` file, and not whatever temporary file you got when compiling your C++ example with the following call.
10299
```
103100
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
104101
```

0 commit comments

Comments
 (0)