@@ -707,11 +707,9 @@ pub(crate) unsafe fn llvm_optimize(
707707 llvm:: set_value_name ( new_fn, & name) ;
708708 }
709709
710- if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Enable ) {
710+ if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Device ) {
711711 let cx =
712712 SimpleCx :: new ( module. module_llvm . llmod ( ) , module. module_llvm . llcx , cgcx. pointer_size ) ;
713- // For now we only support up to 10 kernels named kernel_0 ... kernel_9, a follow-up PR is
714- // introducing a proper offload intrinsic to solve this limitation.
715713 for func in cx. get_functions ( ) {
716714 let offload_kernel = "offload-kernel" ;
717715 if attributes:: has_string_attr ( func, offload_kernel) {
@@ -773,12 +771,79 @@ pub(crate) unsafe fn llvm_optimize(
773771 )
774772 } ;
775773
776- if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Enable ) {
774+ if cgcx. target_is_like_gpu && config. offload . contains ( & config:: Offload :: Device ) {
775+ let device_path = cgcx. output_filenames . path ( OutputType :: Object ) ;
776+ let device_dir = device_path. parent ( ) . unwrap ( ) ;
777+ let device_out = device_dir. join ( "host.out" ) ;
778+ let device_out_c = path_to_c_string ( device_out. as_path ( ) ) ;
777779 unsafe {
778- llvm:: LLVMRustBundleImages ( module. module_llvm . llmod ( ) , module. module_llvm . tm . raw ( ) ) ;
780+ // 1) Bundle device module into offload image host.out (device TM)
781+ let ok = llvm:: LLVMRustBundleImages (
782+ module. module_llvm . llmod ( ) ,
783+ module. module_llvm . tm . raw ( ) ,
784+ device_out_c. as_ptr ( ) ,
785+ ) ;
786+ assert ! ( ok, "LLVMRustBundleImages (device -> host.out) failed" ) ;
787+ if !device_out. exists ( ) {
788+ panic ! ( "BundleImages failed, `host.out` was not created!" ) ;
789+ }
779790 }
780791 }
781792
793+ // This assumes that we previously compiled our kernels for a gpu target, which created a
794+ // `host.out` artifact. The user is supposed to provide us with a path to this artifact; we
795+ // don't need any other artifacts from the previous run. We will embed this artifact into our
796+ // LLVM-IR host module, to create a `host.o` ObjectFile, which we will write to disk.
797+ // The last, not yet automated step uses the `clang-linker-wrapper` to process `host.o`.
798+ if !cgcx. target_is_like_gpu {
799+ if let Some ( device_path) = config
800+ . offload
801+ . iter ( )
802+ . find_map ( |o| if let config:: Offload :: Host ( path) = o { Some ( path) } else { None } )
803+ {
804+ let device_pathbuf = PathBuf :: from ( device_path) ;
805+ if device_pathbuf. is_relative ( ) {
806+ panic ! ( "Absolute path is needed" ) ;
807+ } else if device_pathbuf
808+ . file_name ( )
809+ . and_then ( |n| n. to_str ( ) )
810+ . is_some_and ( |n| n != "host.out" )
811+ {
812+ panic ! ( "Need path to the host.out file" ) ;
813+ }
814+ assert ! ( device_pathbuf. exists( ) ) ;
815+ let host_path = cgcx. output_filenames . path ( OutputType :: Object ) ;
816+ let host_dir = host_path. parent ( ) . unwrap ( ) ;
817+ let out_obj = host_dir. join ( "host.o" ) ;
818+ let host_out_c = path_to_c_string ( device_pathbuf. as_path ( ) ) ;
819+
820+ // 2) Finalize host: lib.bc + host.out -> host.o (host TM)
821+ // We create a full clone of our LLVM host module, since we will embed the device IR
822+ // into it, and this might break caching or incremental compilation otherwise.
823+ let llmod2 = llvm:: LLVMCloneModule ( module. module_llvm . llmod ( ) ) ;
824+ let ok =
825+ unsafe { llvm:: LLVMRustOffloadEmbedBufferInModule ( llmod2, host_out_c. as_ptr ( ) ) } ;
826+ assert ! ( ok, "LLVMRustOffloadEmbedBufferInModule failed" ) ;
827+ write_output_file (
828+ dcx,
829+ module. module_llvm . tm . raw ( ) ,
830+ config. no_builtins ,
831+ llmod2,
832+ & out_obj,
833+ None ,
834+ llvm:: FileType :: ObjectFile ,
835+ & cgcx. prof ,
836+ true ,
837+ ) ;
838+ if !out_obj. exists ( ) {
839+ dbg ! ( "{:?} does not exist!" , out_obj) ;
840+ panic ! ( "FinalizeOffload failed!" ) ;
841+ }
842+ // We ignore cgcx.save_temps here and unconditionally always keep our `host.out` artifact.
843+ // Otherwise, recompiling the host code would fail since we deleted that device artifact
844+ // in the previous host compilation, which would be confusing at best.
845+ }
846+ }
782847 result. into_result ( ) . unwrap_or_else ( |( ) | llvm_err ( dcx, LlvmError :: RunLlvmPasses ) )
783848}
784849
0 commit comments