From ddea915d6772fdf836106850951f615389ef7e24 Mon Sep 17 00:00:00 2001 From: Rafal Rudnicki Date: Fri, 5 Dec 2025 10:30:33 +0100 Subject: [PATCH 1/5] bump default SM for CUDA to 75 --- clang/include/clang/Basic/OffloadArch.h | 2 +- clang/lib/Driver/Driver.cpp | 6 +++--- libdevice/cmake/modules/SYCLLibdevice.cmake | 2 +- sycl-jit/jit-compiler/lib/translation/Translation.cpp | 5 +++-- sycl/doc/GetStartedGuide.md | 4 ++-- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/OffloadArch.h b/clang/include/clang/Basic/OffloadArch.h index 1d528863e9f29..7bee6af3f3a6d 100644 --- a/clang/include/clang/Basic/OffloadArch.h +++ b/clang/include/clang/Basic/OffloadArch.h @@ -167,7 +167,7 @@ enum class OffloadArch { LNL_M, LAST, - CudaDefault = OffloadArch::SM_52, + CudaDefault = OffloadArch::SM_75, HIPDefault = OffloadArch::GFX906, }; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 475c1fddb0363..7df946f80b486 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5927,11 +5927,11 @@ class OffloadingActionBuilder final { // Handle defaults architectures for (auto &Triple : SYCLTripleList) { - // For NVIDIA use SM_50 as a default + // For NVIDIA use SM_75 as a default if (Triple.isNVPTX() && llvm::none_of(GpuArchList, [&](auto &P) { return P.first.isNVPTX(); })) { - const char *DefaultArch = OffloadArchToString(OffloadArch::SM_50); + const char *DefaultArch = OffloadArchToString(OffloadArch::SM_75); GpuArchList.emplace_back(Triple, DefaultArch); } @@ -7650,7 +7650,7 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args, // The default arch is set for NVPTX if not provided. For AMDGPU, emit // an error as the user is responsible to set the arch. if (TC.getTriple().isNVPTX()) - Archs.insert(OffloadArchToString(OffloadArch::SM_50)); + Archs.insert(OffloadArchToString(OffloadArch::SM_75)); else if (TC.getTriple().isAMDGPU()) C.getDriver().Diag(clang::diag::err_drv_sycl_missing_amdgpu_arch) << 1 << TC.getTriple().str(); diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index fa33ef7b1664d..bfb572fa94d03 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -97,7 +97,7 @@ set(imf_build_archs) if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) list(APPEND full_build_archs nvptx64-nvidia-cuda) set(compile_opts_nvptx64-nvidia-cuda "-fsycl-targets=nvptx64-nvidia-cuda" - "-Xsycl-target-backend" "--cuda-gpu-arch=sm_50" "-nocudalib" "-fno-sycl-libspirv" "-Wno-unsafe-libspirv-not-linked") + "-Xsycl-target-backend" "--cuda-gpu-arch=sm_75" "-fno-sycl-libspirv" "-Wno-unsafe-libspirv-not-linked") set(opt_flags_nvptx64-nvidia-cuda "-O3" "--nvvm-reflect-enable=false") endif() if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp index ab49001a588b3..43e8090ad1a82 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp +++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp @@ -197,13 +197,14 @@ std::pair Translator::getTargetCPUAndFeatureAttrs( if (CPU.empty()) { // Set to the lowest tested target according to the GetStartedGuide, section // "Build DPC++ toolchain with support for HIP AMD" - CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50"; + CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_75"; if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); } } if (Features.empty()) { - Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : ""; + // Turing architecture + PTX 6.3 + Features = Format == BinaryFormat::PTX ? "+sm_75,+ptx63" : ""; if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) .getValueAsString(); diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 85d3db3256a1e..506c58eb6284f 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -198,8 +198,8 @@ extensions that require sm_80 and later architectures also require at least CUDA 11.0. The CUDA backend should work on Windows or Linux operating systems with any GPU -with compute capability (SM version) sm_50 or above. The default SM version for -the NVIDIA CUDA backend is sm_50. Users of sm_3X devices can attempt to specify +with compute capability (SM version) sm_75 or above. The default SM version for +the NVIDIA CUDA backend is sm_75. Users of sm_3X devices can attempt to specify the target architecture [ahead of time](#aot-target-architectures), provided that they use a 11.X or earlier CUDA toolkit version, but some features may not be supported. The CUDA backend has been tested with different Ubuntu Linux From 20f4f5611c1fe40c8d1c38a93a285a8969f31427 Mon Sep 17 00:00:00 2001 From: Rafal Rudnicki Date: Fri, 5 Dec 2025 10:31:15 +0100 Subject: [PATCH 2/5] make CUDA 10.0 the default toolkit for tests --- .../CUDA/v10.0/bin/.keep | 0 .../CUDA/v10.0/bin/version.txt | 2 ++ .../CUDA/v10.0/include/.keep | 0 .../CUDA/v10.0/lib/.keep | 0 .../CUDA/v10.0/nvvm/libdevice/libdevice.10.bc | 0 .../nvvm/libdevice/libdevice.compute_30.10.bc | 0 .../nvvm/libdevice/libdevice.compute_35.10.bc | 0 .../nvvm/libdevice/libdevice.compute_50.10.bc | 0 .../CUDA/v10.0/version.txt | 2 ++ .../Inputs/CUDA/usr/local/cuda/bin/fatbinary | 0 .../Inputs/CUDA/usr/local/cuda/include/cuda.h | 7 +++++++ .../local/cuda/nvvm/libdevice/libdevice.10.bc | 0 .../nvvm/libdevice/libdevice.compute_20.10.bc | 0 .../nvvm/libdevice/libdevice.compute_50.10.bc | 0 .../Inputs/CUDA_100/usr/local/cuda/bin/.keep | 0 .../Inputs/CUDA_100/usr/local/cuda/include/.keep | 0 .../CUDA_100/usr/local/cuda/include/cuda.h | 7 +++++++ .../Inputs/CUDA_100/usr/local/cuda/lib/.keep | 0 .../Inputs/CUDA_100/usr/local/cuda/lib64/.keep | 0 .../local/cuda/nvvm/libdevice/libdevice.10.bc | 0 .../nvvm/libdevice/libdevice.compute_20.10.bc | 0 .../nvvm/libdevice/libdevice.compute_30.10.bc | 0 .../nvvm/libdevice/libdevice.compute_35.10.bc | 0 .../nvvm/libdevice/libdevice.compute_50.10.bc | 0 .../Inputs/CUDA_70/usr/local/cuda/bin/.keep | 0 .../Inputs/CUDA_70/usr/local/cuda/include/.keep | 0 .../Inputs/CUDA_70/usr/local/cuda/include/cuda.h | 7 +++++++ .../Inputs/CUDA_70/usr/local/cuda/lib/.keep | 0 .../Inputs/CUDA_70/usr/local/cuda/lib64/.keep | 0 .../nvvm/libdevice/libdevice.compute_20.10.bc | 0 .../nvvm/libdevice/libdevice.compute_30.10.bc | 0 .../nvvm/libdevice/libdevice.compute_35.10.bc | 0 .../nvvm/libdevice/libdevice.compute_50.10.bc | 0 clang/test/Driver/Inputs/SYCL/objnvptx64-sm_75.o | Bin 0 -> 3736 bytes 34 files changed, 25 insertions(+) create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/.keep create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/version.txt create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/include/.keep create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/lib/.keep create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_30.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_35.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_50.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/version.txt create mode 100755 clang/test/Driver/Inputs/CUDA/usr/local/cuda/bin/fatbinary create mode 100644 clang/test/Driver/Inputs/CUDA/usr/local/cuda/include/cuda.h create mode 100644 clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/bin/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/cuda.h create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib64/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/bin/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/cuda.h create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib64/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc create mode 100644 clang/test/Driver/Inputs/SYCL/objnvptx64-sm_75.o diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/.keep b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/version.txt b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/version.txt new file mode 100644 index 0000000000000..16a5a2337f83f --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/version.txt @@ -0,0 +1,2 @@ +CUDA Version 10.0.130 + diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/include/.keep b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/lib/.keep b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/lib/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.10.bc b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_30.10.bc b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_30.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_35.10.bc b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_35.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_50.10.bc b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/nvvm/libdevice/libdevice.compute_50.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/version.txt b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/version.txt new file mode 100644 index 0000000000000..16a5a2337f83f --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/version.txt @@ -0,0 +1,2 @@ +CUDA Version 10.0.130 + diff --git a/clang/test/Driver/Inputs/CUDA/usr/local/cuda/bin/fatbinary b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/bin/fatbinary new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA/usr/local/cuda/include/cuda.h b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/include/cuda.h new file mode 100644 index 0000000000000..c576bebd470dc --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/include/cuda.h @@ -0,0 +1,7 @@ +// +// Placeholder file for testing CUDA version detection +// + +#define CUDA_VERSION 10000 + +// diff --git a/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.10.bc b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc b/clang/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/bin/.keep b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/bin/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/.keep b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/cuda.h b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/cuda.h new file mode 100644 index 0000000000000..c576bebd470dc --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/include/cuda.h @@ -0,0 +1,7 @@ +// +// Placeholder file for testing CUDA version detection +// + +#define CUDA_VERSION 10000 + +// diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib/.keep b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib64/.keep b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/lib64/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.10.bc b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc b/clang/test/Driver/Inputs/CUDA_100/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/bin/.keep b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/bin/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/.keep b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/cuda.h b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/cuda.h new file mode 100644 index 0000000000000..558f2e2d02093 --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/include/cuda.h @@ -0,0 +1,7 @@ +// +// Placeholder file for testing CUDA version detection +// + +#define CUDA_VERSION 7000 + +// diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib/.keep b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib64/.keep b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/lib64/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc b/clang/test/Driver/Inputs/CUDA_70/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/SYCL/objnvptx64-sm_75.o b/clang/test/Driver/Inputs/SYCL/objnvptx64-sm_75.o new file mode 100644 index 0000000000000000000000000000000000000000..bfac6d6de63cc8fc8da5cc948350dcae84d15ec6 GIT binary patch literal 3736 zcmcf^ZA@F&^%;YoN!({Ap)GD~_c^eLwlSDTLTqd%e>qsX{G6c4G*sXhPGhs0 z^!LC~;8$W_e*36xK34G`?3aPFa}&RK7VyZDB+__D%Vslo8eckMJ8H3X9yK1anNGcQ zsPnMRhFqr#Yg%os`&H-sXZ?KxepRp6e=e*#<3EQrGVU;gA&>doeptBNH26D+G43E^ z3;HzbMkh^UKD*bCHTwG)cjJ+lPN{;7-FX&kM1Hl|T`ma68LTnj?zK0%du*P7-RH&{ zgG_+2I{***5dJ>vigEUbXeJEi`9ofp*RFD&bJiQRd z^0&8r|Gav1ce?%%kE+6yB8U*UVS9zL=#4!fhsWxEX-rrCNFbrK7kI;>A5;mYiprlm zqN2kUrVHCmE(ukm5!Sx2jiN?6CR|GM`H{*aGRjonpBC<@f5vb=A$fsExrT)oWMrLr zWEqo6O_kDT8=g%b>muv^VmL37a`*?2OQ}Sn9+y&%ecE(#M_qd72qABC9XgVh>@iy^ zq)%aQl3*9uv&T~@`IBkreaUf{lKs=xj&0R@&9hY|YOhfARvi|9zhZpbM}|omuL0Y& zcVGe!r|?%N_-#G>feDr&Sw_gV*0K?jr57gf<*Z=68;_{*Knf3%xGzbB40xm)XR?CZ z5d*y7K0X2&kF7onTQ1S)Q;(bHS*D?>rehw|GU?3 z`DB>`=Q;M|a71B9mfz zN`7Np=22G_#p1g&{PwiDJ;86!gcA&nzaqp3J6M{=BQ!r0W@(a@W_A#3Ng}Tn`_%_X1Xpzjwmz1&vrF`11@TlL^U61RsX6;qJ{(4+L%-3bsYeL=lD?RwZ9yS1Tl%qfaIIP}4g?(clv1SfjW%hIRW03By~@NhTF)Ur&9 zmEJfZzJrPHS&2fDxM>jQNn#CdK{m)2rxeph#k^9s7*j0fWQDx^bDqLedrUK7Zo3%Q zWvun3i*&OCVLUu`tn+?Sj69(DnPSc^TUE-JrsS{j6v$wueBCZM`9@rm;dB$`c5ujp zX??-08v&9_yt@r<$_{`%LOjxd2gYJuHzmZ{B(ZEIzEams%^MYk{F4iL`4Sk=A=OR9 z+porV4|AG?S)1YX*QT^-4(_PWbX@Hh;$T%jIPf$p{dAUC>lS~J6ReZsWwm&j6n||c z{1W1(ngH^voZxnjC|HSa)ODY&*=5r)`9l85HKlw-DW6fwA%d&Qr#T#BGE4b_!16c>LgH9PVNpgH%BtX0i~<*sxKX zKg^U`Py%tm4<3Su}uLvLm z;X4qEmYvWFXsGhb_+SHsZ0f2}Gr`wg=QIy0+OKnMqhMQdv0IT-$_rD9#b^zr!IDui z9aGHb<%>`by?tu&dXl);E%syE^BcCol}Yg>D*@$m4=OQ9+|3bRVb~yvVZBhh7Ze?& zc|;_wcDd5;ld(i4WwBUx#Z0N>mosUh)^tAM7+Muz*fy8y#WvB0`~hc=JzxuZBW`~v z zcztxQn>0W79C^}YG-6nt(5N1g24DO6?~ zJ>rS|H@k^jR%kuq0%}0wTUO%!L^Oty40XmZ3ssH!!fE;u%mf3rkJ+Nvg5M!Mlsu`c;fqzWKa)LV*AKOQB{4Yff8DPM#(`*>!5=`e^2vu z`x>DB=izJc%KAuxVUSW7p3z^5>n=hg<+rEedT6Ri4z{s=a^v+w5t2QXKqGL&Ti|KH zJNcvQh~~pUGzFs*2GTnX0|CN_rV#;f48|@PE*LvuAUy=-eY8gMWhn0>{4Id6eq-ll zut5n+u;FXOvHi4JW3vTmZy;prbN1Tk03@5odaA{Ef_lLj`R)n(-#$$r@l Date: Thu, 4 Dec 2025 09:56:05 +0100 Subject: [PATCH 3/5] update tests to use CUDA sm_75 / ptx63 as default --- clang/test/Driver/cuda-detect.cu | 35 ++++--- .../Driver/cuda-flush-denormals-to-zero.cu | 8 +- clang/test/Driver/cuda-march.cu | 6 +- clang/test/Driver/cuda-options.cu | 6 +- clang/test/Driver/cuda-ptxas-path.cu | 4 +- clang/test/Driver/cuda-short-ptr.cu | 2 +- clang/test/Driver/cuda-version-check.cu | 20 ++-- clang/test/Driver/cuda-windows.cu | 4 +- clang/test/Driver/lto.cu | 2 +- clang/test/Driver/sycl-offload-new-driver.cpp | 33 ++++--- clang/test/Driver/sycl-offload-nvptx.cpp | 40 ++++---- clang/test/Driver/sycl-offload-old-model.c | 92 +++++++++---------- .../sycl-offload-static-lib-2-old-model.cpp | 56 +++++------ clang/test/Driver/sycl-offload.c | 20 ++-- .../Driver/sycl-target-mismatch-nvptx.cpp | 2 +- 15 files changed, 169 insertions(+), 161 deletions(-) diff --git a/clang/test/Driver/cuda-detect.cu b/clang/test/Driver/cuda-detect.cu index 23b6ba2fcc09d..66e1a25e70eda 100644 --- a/clang/test/Driver/cuda-detect.cu +++ b/clang/test/Driver/cuda-detect.cu @@ -60,7 +60,7 @@ // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30 +// RUN: -check-prefixes PTX63,LIBDEVICE,LIBDEVICE10 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ @@ -96,14 +96,14 @@ // Verify that -nocudainc prevents adding include path to CUDA headers. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_75 \ // RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \ -// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35 -// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: -check-prefixes PTX63,LIBDEVICE,LIBDEVICE10 +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_75 \ // RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \ -// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35 +// RUN: -check-prefixes PTX63,LIBDEVICE,LIBDEVICE10 // We should not add any CUDA include paths if there's no valid CUDA installation // RUN: not %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ @@ -123,10 +123,10 @@ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE // Verify that -nocudalib prevents linking libdevice bitcode in. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_75 \ // RUN: -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_75 \ // RUN: -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON @@ -152,10 +152,10 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix CUDA80 -// Verify that if no version file is found, we report the default of 7.0. +// Verify that if no version file is found, we report the default of 10.0. // RUN: %clang -### -v --target=x86_64-linux-gnu --cuda-gpu-arch=sm_50 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ -// RUN: | FileCheck %s -check-prefix CUDA70 +// RUN: | FileCheck %s -check-prefix CUDA100 // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NO-LIBDEVICE: Found CUDA installation: {{.*}}/Inputs/CUDA-nolibdevice/usr/local/cuda @@ -174,6 +174,7 @@ // LIBDEVICE50-SAME: libdevice.compute_50.10.bc // PTX42-SAME: "-target-feature" "+ptx42" // PTX60-SAME: "-target-feature" "+ptx60" +// PTX63-SAME: "-target-feature" "+ptx63" // CUDAINC-SAME: "-include" "__clang_cuda_runtime_wrapper.h" // NOCUDAINC-NOT: "-include" "__clang_cuda_runtime_wrapper.h" // CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA{{[_0-9]+}}/usr/local/cuda/include" @@ -188,14 +189,20 @@ // CHECK-CXXINCLUDE-SAME: {{.*}}"-internal-isystem" "{{.+}}/include/c++/4.8" // CHECK-CXXINCLUDE: ld{{.*}}" +// CUDA70: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA70-SAME: -target-sdk-version=7.0 +// CUDA70: "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CUDA70-SAME: -target-sdk-version=7.0 +// CUDA70: ld{{.*}}" + // CUDA80: "-cc1" "-triple" "nvptx64-nvidia-cuda" // CUDA80-SAME: -target-sdk-version=8.0 // CUDA80: "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CUDA80-SAME: -target-sdk-version=8.0 // CUDA80: ld{{.*}}" -// CUDA70: "-cc1" "-triple" "nvptx64-nvidia-cuda" -// CUDA70-SAME: -target-sdk-version=7.0 -// CUDA70: "-cc1" "-triple" "x86_64-unknown-linux-gnu" -// CUDA70-SAME: -target-sdk-version=7.0 -// CUDA70: ld{{.*}}" +// CUDA100: "-cc1" "-triple" "nvptx64-nvidia-cuda" +// CUDA100-SAME: -target-sdk-version=10.0 +// CUDA100: "-cc1" "-triple" "x86_64-unknown-linux-gnu" +// CUDA100-SAME: -target-sdk-version=10.0 +// CUDA100: ld{{.*}}" diff --git a/clang/test/Driver/cuda-flush-denormals-to-zero.cu b/clang/test/Driver/cuda-flush-denormals-to-zero.cu index ea808f2302fbb..adad6dfe632d3 100644 --- a/clang/test/Driver/cuda-flush-denormals-to-zero.cu +++ b/clang/test/Driver/cuda-flush-denormals-to-zero.cu @@ -2,14 +2,14 @@ // -fgpu-flush-denormals-to-zero. This should be translated to // -fdenormal-fp-math-f32=preserve-sign -// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fgpu-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=FTZ %s -// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fno-gpu-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=NOFTZ %s +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fgpu-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fno-gpu-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=NOFTZ %s // RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fgpu-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s // RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-gpu-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s // Test alias options -f[no-]cuda-flush-denormals-to-zero -// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=FTZ %s -// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=NOFTZ %s +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=FTZ %s +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda %s 2>&1 | FileCheck -check-prefix=NOFTZ %s // Test explicit argument, with CUDA offload kind // RUN: %clang -x hip -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fgpu-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s diff --git a/clang/test/Driver/cuda-march.cu b/clang/test/Driver/cuda-march.cu index 2dbb9cdf6f589..7684b1df0d685 100644 --- a/clang/test/Driver/cuda-march.cu +++ b/clang/test/Driver/cuda-march.cu @@ -5,12 +5,12 @@ // RUN: %clang -### --target=x86_64-linux-gnu -c \ // RUN: -nogpulib -nogpuinc -march=haswell %s 2>&1 | FileCheck %s // RUN: %clang -### --target=x86_64-linux-gnu -c \ -// RUN: -nogpulib -nogpuinc -march=haswell --cuda-gpu-arch=sm_52 %s 2>&1 | FileCheck %s +// RUN: -nogpulib -nogpuinc -march=haswell --cuda-gpu-arch=sm_75 %s 2>&1 | FileCheck %s // CHECK: "-cc1"{{.*}} "-triple" "nvptx -// CHECK-SAME: "-target-cpu" "sm_52" +// CHECK-SAME: "-target-cpu" "sm_75" // CHECK: ptxas -// CHECK-SAME: "--gpu-name" "sm_52" +// CHECK-SAME: "--gpu-name" "sm_75" // CHECK: "-cc1"{{.*}} "-target-cpu" "haswell" diff --git a/clang/test/Driver/cuda-options.cu b/clang/test/Driver/cuda-options.cu index fc8e83a2bb279..312556707ef19 100644 --- a/clang/test/Driver/cuda-options.cu +++ b/clang/test/Driver/cuda-options.cu @@ -104,12 +104,12 @@ // RUN: | FileCheck -check-prefixes ARCH-SM52,NOARCH-SM60,NOARCH-SM70 %s // c) if --no-cuda-gpu-arch=X negates all preceding --cuda-gpu-arch=X -// we default to sm_52 -- same as if no --cuda-gpu-arch were passed. +// we default to sm_75 -- same as if no --cuda-gpu-arch were passed. // RUN: %clang -### --target=x86_64-linux-gnu --cuda-device-only \ // RUN: -nogpulib -nogpuinc --cuda-gpu-arch=sm_70 --cuda-gpu-arch=sm_60 \ // RUN: --no-cuda-gpu-arch=sm_70 --no-cuda-gpu-arch=sm_60 \ // RUN: -c %s 2>&1 \ -// RUN: | FileCheck -check-prefixes ARCH-SM52,NOARCH-SM60,NOARCH-SM70 %s +// RUN: | FileCheck -check-prefixes ARCH-SM75,NOARCH-SM60,NOARCH-SM70 %s // d) --no-cuda-gpu-arch=X is a no-op if there's no preceding --cuda-gpu-arch=X // RUN: %clang -### --target=x86_64-linux-gnu --cuda-device-only \ @@ -193,6 +193,8 @@ // NOARCH-SM60-NOT: "-cc1"{{.*}}"-target-cpu" "sm_60" // ARCH-SM70: "-cc1"{{.*}}"-target-cpu" "sm_70" // NOARCH-SM70-NOT: "-cc1"{{.*}}"-target-cpu" "sm_70" +// ARCH-SM75: "-cc1"{{.*}}"-target-cpu" "sm_75" +// NOARCH-SM75-NOT: "-cc1"{{.*}}"-target-cpu" "sm_75" // ARCHALLERROR: error: unsupported CUDA gpu architecture: all // Match device-side preprocessor and compiler phases with -save-temps. diff --git a/clang/test/Driver/cuda-ptxas-path.cu b/clang/test/Driver/cuda-ptxas-path.cu index f36dcc94558f1..7027984d07b2e 100644 --- a/clang/test/Driver/cuda-ptxas-path.cu +++ b/clang/test/Driver/cuda-ptxas-path.cu @@ -1,8 +1,8 @@ // RUN: %clang -### --target=i386-unknown-linux \ -// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda \ // RUN: --ptxas-path=/some/path/to/ptxas %s 2>&1 \ // RUN: | FileCheck %s // CHECK-NOT: "ptxas" // CHECK: "/some/path/to/ptxas" -// CHECK-SAME: "--gpu-name" "sm_52" +// CHECK-SAME: "--gpu-name" "sm_75" diff --git a/clang/test/Driver/cuda-short-ptr.cu b/clang/test/Driver/cuda-short-ptr.cu index e0ae4505e0b56..bf3c1c168b922 100644 --- a/clang/test/Driver/cuda-short-ptr.cu +++ b/clang/test/Driver/cuda-short-ptr.cu @@ -1,6 +1,6 @@ // Checks that cuda compilation does the right thing when passed -fcuda-short-ptr -// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fcuda-short-ptr -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_20 -fcuda-short-ptr -nocudainc -nocudalib --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda %s 2>&1 | FileCheck %s // CHECK: "-mllvm" "--nvptx-short-ptr" // CHECK-SAME: "-fcuda-short-ptr" diff --git a/clang/test/Driver/cuda-version-check.cu b/clang/test/Driver/cuda-version-check.cu index 9eceb928ffabd..4b43012b39483 100644 --- a/clang/test/Driver/cuda-version-check.cu +++ b/clang/test/Driver/cuda-version-check.cu @@ -1,4 +1,4 @@ -// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_20 --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK @@ -15,33 +15,33 @@ // RUN: --cuda-path=%S/Inputs/CUDA-unknown/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION_CXX -// The installation at Inputs/CUDA is CUDA 7.0, which doesn't support sm_60. -// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// The installation at Inputs/CUDA_70 is CUDA 7.0, which doesn't support sm_60. +// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // This should only complain about sm_60, not sm_35. // RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \ -// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// RUN: --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 --check-prefix=OK_SM35 // We should get two errors here, one for sm_60 and one for sm_61. // RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \ -// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// RUN: --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 --check-prefix=ERR_SM61 // We should still get an error if we pass -nocudainc, because this compilation // would invoke ptxas, and we do a version check on that, too. -// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 -nocudainc --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // If with -nocudainc and -E, we don't touch the CUDA install, so we // shouldn't get an error. // RUN: %clang --target=x86_64-linux -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \ -// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 %s | \ +// RUN: --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK // --no-cuda-version-check should suppress all of these errors. -// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda 2>&1 \ // RUN: --no-cuda-version-check %s | \ // RUN: FileCheck %s --check-prefix=OK @@ -49,9 +49,9 @@ // therefore we should not get an error in host-only mode. We use the -S here // to avoid the error being produced in case by the assembler tool, which does // the same check. -// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda -S 2>&1 %s | \ +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-host-only --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda -S 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK -// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda -S 2>&1 %s | \ +// RUN: not %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-device-only --cuda-path=%S/Inputs/CUDA_70/usr/local/cuda -S 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=ERR_SM60 // OK-NOT: error: GPU arch diff --git a/clang/test/Driver/cuda-windows.cu b/clang/test/Driver/cuda-windows.cu index 4459e809072d9..4b28117b62524 100644 --- a/clang/test/Driver/cuda-windows.cu +++ b/clang/test/Driver/cuda-windows.cu @@ -1,6 +1,6 @@ -// RUN: %clang -v --target=i386-pc-windows-msvc \ +// RUN: %clang -v --target=i386-pc-windows-msvc --cuda-gpu-arch=sm_50 \ // RUN: --sysroot=%S/Inputs/CUDA-windows 2>&1 %s -### | FileCheck %s -// RUN: %clang -v --target=i386-pc-windows-mingw32 \ +// RUN: %clang -v --target=i386-pc-windows-mingw32 --cuda-gpu-arch=sm_50 \ // RUN: --sysroot=%S/Inputs/CUDA-windows 2>&1 %s -### | FileCheck %s // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA-windows/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0 diff --git a/clang/test/Driver/lto.cu b/clang/test/Driver/lto.cu index 596e6cfe07379..e4a773b487c6c 100644 --- a/clang/test/Driver/lto.cu +++ b/clang/test/Driver/lto.cu @@ -26,7 +26,7 @@ // llvm-bc and llvm-ll outputs need to match regular suffixes // (unfortunately). -// RUN: %clangxx %s --target=x86_64-unknown-linux-gnu --no-offload-new-driver -nocudainc -nocudalib -flto -save-temps --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda -### 2> %t +// RUN: %clangxx %s --target=x86_64-unknown-linux-gnu --no-offload-new-driver -nocudainc -nocudalib -flto -save-temps --cuda-path=%S/Inputs/CUDA/usr/local/cuda -### 2> %t // RUN: FileCheck -check-prefix=CHECK-COMPILELINK-SUFFIXES < %t %s // // CHECK-COMPILELINK-SUFFIXES: "-o" "[[CPP:.*lto-host.*\.cui]]" "-x" "cuda" "{{.*}}lto.cu" diff --git a/clang/test/Driver/sycl-offload-new-driver.cpp b/clang/test/Driver/sycl-offload-new-driver.cpp index f449b3839602d..295cbf65f11e5 100644 --- a/clang/test/Driver/sycl-offload-new-driver.cpp +++ b/clang/test/Driver/sycl-offload-new-driver.cpp @@ -5,23 +5,22 @@ // OFFLOAD-NEW-DRIVER: 0: input, "[[INPUT:.+\.cpp]]", c++, (host-sycl) // OFFLOAD-NEW_DRIVER: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) // OFFLOAD-NEW_DRIVER: 2: compiler, {1}, ir, (host-sycl) -// OFFLOAD-NEW_DRIVER: 3: input, "[[INPUT]]", c++, (device-sycl) -// OFFLOAD-NEW_DRIVER: 4: preprocessor, {3}, c++-cpp-output, (device-sycl) -// OFFLOAD-NEW_DRIVER: 5: compiler, {4}, ir, (device-sycl) -// OFFLOAD-NEW_DRIVER: 6: backend, {5}, assembler, (device-sycl) -// OFFLOAD-NEW_DRIVER: 7: assembler, {6}, object, (device-sycl) -// OFFLOAD-NEW_DRIVER: 8: offload, "device-sycl (nvptx64-nvidia-cuda)" {7}, object +// OFFLOAD-NEW_DRIVER: 3: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// OFFLOAD-NEW_DRIVER: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_75) +// OFFLOAD-NEW_DRIVER: 5: compiler, {4}, ir, (device-sycl, sm_75) +// OFFLOAD-NEW_DRIVER: 6: backend, {5}, ir, (device-sycl, sm_75) +// OFFLOAD-NEW_DRIVER: 7: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {6}, ir +// OFFLOAD-NEW_DRIVER: 8: input, "[[INPUT]]", c++, (device-sycl) // OFFLOAD-NEW_DRIVER: 9: input, "[[INPUT]]", c++, (device-sycl) // OFFLOAD-NEW_DRIVER: 10: preprocessor, {9}, c++-cpp-output, (device-sycl) // OFFLOAD-NEW_DRIVER: 11: compiler, {10}, ir, (device-sycl) -// OFFLOAD-NEW_DRIVER: 12: backend, {11}, assembler, (device-sycl) -// OFFLOAD-NEW_DRIVER: 13: assembler, {12}, object, (device-sycl) -// OFFLOAD-NEW_DRIVER: 14: offload, "device-sycl (spir64-unknown-unknown)" {13}, object -// OFFLOAD-NEW_DRIVER: 15: llvm-offload-binary, {8, 14}, image, (device-sycl) -// OFFLOAD-NEW_DRIVER: 16: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (x86_64-unknown-linux-gnu)" {15}, ir -// OFFLOAD-NEW_DRIVER: 17: backend, {16}, assembler, (host-sycl) -// OFFLOAD-NEW_DRIVER: 18: assembler, {17}, object, (host-sycl) -// OFFLOAD-NEW_DRIVER: 19: clang-linker-wrapper, {18}, image, (host-sycl) +// OFFLOAD-NEW_DRIVER: 12: backend, {11}, ir, (device-sycl) +// OFFLOAD-NEW_DRIVER: 13: offload, "device-sycl (spir64-unknown-unknown)" {12}, ir +// OFFLOAD-NEW_DRIVER: 14: llvm-offload-binary, {7, 13}, image, (device-sycl) +// OFFLOAD-NEW_DRIVER: 15: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (x86_64-unknown-linux-gnu)" {14}, ir +// OFFLOAD-NEW_DRIVER: 16: backend, {15}, assembler, (host-sycl) +// OFFLOAD-NEW_DRIVER: 17: assembler, {16}, object, (host-sycl) +// OFFLOAD-NEW_DRIVER: 18: clang-linker-wrapper, {17}, image, (host-sycl) /// Check the toolflow for SYCL compilation using new offload model // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64 --offload-new-driver %s 2>&1 | FileCheck -check-prefix=CHK-FLOW %s @@ -96,10 +95,10 @@ // RUN: | FileCheck -check-prefix=CHK_ARCH \ // RUN: -DTRIPLE=amdgcn-amd-amdhsa -DARCH=gfx900 %s // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \ -// RUN: -fno-sycl-libspirv -fsycl-targets=nvidia_gpu_sm_50 \ +// RUN: -fno-sycl-libspirv -fsycl-targets=nvidia_gpu_sm_75 \ // RUN: -nogpulib --offload-new-driver %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK_ARCH \ -// RUN: -DTRIPLE=nvptx64-nvidia-cuda -DARCH=sm_50 %s +// RUN: -DTRIPLE=nvptx64-nvidia-cuda -DARCH=sm_75 %s // CHK_ARCH: clang{{.*}} "-triple" "[[TRIPLE]]" // CHK_ARCH-SAME: "-fsycl-is-device" {{.*}} "--offload-new-driver"{{.*}} "-o" "[[CC1DEVOUT:.+\.bc]]" // CHK_ARCH-NEXT: llvm-offload-binary{{.*}} "--image=file=[[CC1DEVOUT]],triple=[[TRIPLE]],arch=[[ARCH]]{{.*}},kind=sycl{{.*}}" @@ -174,7 +173,7 @@ // RUN: %clangxx -fsycl -### -fsycl-targets=nvptx64-nvidia-cuda \ // RUN: -fno-sycl-libspirv -nocudalib --offload-new-driver %s 2>&1 \ // RUN: | FileCheck -check-prefix NVPTX_DEF_ARCH %s -// NVPTX_DEF_ARCH: llvm-offload-binary{{.*}} "--image=file={{.*}},triple=nvptx64-nvidia-cuda,arch=sm_50,kind=sycl" +// NVPTX_DEF_ARCH: llvm-offload-binary{{.*}} "--image=file={{.*}},triple=nvptx64-nvidia-cuda,arch=sm_75,kind=sycl" /// check for -sycl-embed-ir transmission to clang-linker-wrapper tool // RUN: %clangxx -fsycl -### -fsycl-targets=nvptx64-nvidia-cuda \ diff --git a/clang/test/Driver/sycl-offload-nvptx.cpp b/clang/test/Driver/sycl-offload-nvptx.cpp index d2dcbada56def..bec5c9c1b2246 100644 --- a/clang/test/Driver/sycl-offload-nvptx.cpp +++ b/clang/test/Driver/sycl-offload-nvptx.cpp @@ -13,7 +13,7 @@ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS-WIN %s -// CHK-ACTIONS: "-cc1" "-triple" "nvptx64-nvidia-cuda" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-emit-llvm-bc" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl{{[/\\]+}}stl_wrappers"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_50"{{.*}} "-target-feature" "+ptx42"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS: "-cc1" "-triple" "nvptx64-nvidia-cuda" "-{{.*}}"-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-emit-llvm-bc" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl{{[/\\]+}}stl_wrappers"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_75"{{.*}} "-target-feature" "+ptx63"{{.*}} "-std=c++11"{{.*}} // CHK-ACTIONS: sycl-post-link{{.*}} "-split=auto" // CHK-ACTIONS: file-table-tform" "-extract=Code" "-drop_titles" // CHK-ACTIONS: llvm-foreach" {{.*}} "--" "{{.*}}clang-{{[0-9]+}}" @@ -23,7 +23,7 @@ // CHK-ACTIONS-NOT: "-mllvm -sycl-opt" // CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=nvptx64" "-kind=sycl"{{.*}} -// CHK-ACTIONS-WIN: "-cc1" "-triple" "nvptx64-nvidia-cuda" "-aux-triple" "x86_64-pc-windows-msvc"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-emit-llvm-bc" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl{{[/\\]+}}stl_wrappers"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_50"{{.*}} "-target-feature" "+ptx42"{{.*}} +// CHK-ACTIONS-WIN: "-cc1" "-triple" "nvptx64-nvidia-cuda" "-{{.*}}"-aux-triple" "x86_64-pc-windows-msvc"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-emit-llvm-bc" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl{{[/\\]+}}stl_wrappers"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_75"{{.*}} "-target-feature" "+ptx63"{{.*}} // CHK-ACTIONS-WIN: sycl-post-link{{.*}} "-split=auto" // CHK-ACTIONS-WIN: file-table-tform" "-extract=Code" "-drop_titles" // CHK-ACTIONS-WIN: llvm-foreach" {{.*}} "--" "{{.*}}clang-{{[0-9]+}}" @@ -46,25 +46,25 @@ // // CHK-PHASES-NO-CC: 0: input, "[[INPUT:.+\.cpp]]", c++, (host-sycl) // CHK-PHASES-NO-CC: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) -// CHK-PHASES-NO-CC: 2: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 4: compiler, {3}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 5: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {4}, c++-cpp-output +// CHK-PHASES-NO-CC: 2: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 4: compiler, {3}, ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 5: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {4}, c++-cpp-output // CHK-PHASES-NO-CC: 6: compiler, {5}, ir, (host-sycl) // CHK-PHASES-NO-CC: 7: backend, {6}, assembler, (host-sycl) // CHK-PHASES-NO-CC: 8: assembler, {7}, object, (host-sycl) -// CHK-PHASES-NO-CC: 9: linker, {4}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 10: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 11: linker, {9, 10}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 16: linker, {14, 15}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 17: foreach, {13, 16}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 18: file-table-tform, {12, 17}, tempfiletable, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 19: clang-offload-wrapper, {18}, object, (device-sycl, sm_50) -// CHK-PHASES-NO-CC: 20: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {19}, object +// CHK-PHASES-NO-CC: 9: linker, {4}, ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 10: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 11: linker, {9, 10}, ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 16: linker, {14, 15}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 17: foreach, {13, 16}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 18: file-table-tform, {12, 17}, tempfiletable, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 19: clang-offload-wrapper, {18}, object, (device-sycl, sm_75) +// CHK-PHASES-NO-CC: 20: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {19}, object // CHK-PHASES-NO-CC: 21: linker, {8, 20}, image, (host-sycl) // /// Check phases specifying a compute capability. @@ -139,5 +139,5 @@ // RUN: -fsycl-targets=nvptx64-nvidia-cuda --cuda-path=%S/Inputs/CUDA/usr/local/cuda \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS-CUDA-COMPAT %s -// CHK-ACTIONS-CUDA-COMPAT: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-cuda-compatibility"{{.*}} "-fdeclspec"{{.*}} "-fcuda-allow-variadic-functions"{{.*}} "-target-sdk-version=7.0"{{.*}} "-include" "__clang_cuda_runtime_wrapper.h" -// CHK-ACTIONS-CUDA-COMPAT: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-host"{{.*}} "-fsycl-cuda-compatibility"{{.*}} "-fdeclspec"{{.*}} "-fcuda-allow-variadic-functions"{{.*}} "-aux-triple" "nvptx64-nvidia-cuda"{{.*}} "-target-sdk-version=7.0"{{.*}} "-include" "__clang_cuda_runtime_wrapper.h" +// CHK-ACTIONS-CUDA-COMPAT: "-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-cuda-compatibility"{{.*}} "-fdeclspec"{{.*}} "-fcuda-allow-variadic-functions"{{.*}} "-target-sdk-version=10.0"{{.*}} "-include" "__clang_cuda_runtime_wrapper.h" +// CHK-ACTIONS-CUDA-COMPAT: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-host"{{.*}} "-fsycl-cuda-compatibility"{{.*}} "-fdeclspec"{{.*}} "-fcuda-allow-variadic-functions"{{.*}} "-aux-triple" "nvptx64-nvidia-cuda"{{.*}} "-target-sdk-version=10.0"{{.*}} "-include" "__clang_cuda_runtime_wrapper.h" diff --git a/clang/test/Driver/sycl-offload-old-model.c b/clang/test/Driver/sycl-offload-old-model.c index 8048d7438766c..122c0d2a12d58 100644 --- a/clang/test/Driver/sycl-offload-old-model.c +++ b/clang/test/Driver/sycl-offload-old-model.c @@ -616,21 +616,21 @@ // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 6: compiler, {5}, ir, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 7: backend, {6}, assembler, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 8: assembler, {7}, object, (host-sycl) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 9: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 11: compiler, {10}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 12: linker, {11}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 13: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 14: linker, {12, 13}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 15: sycl-post-link, {14}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 16: file-table-tform, {15}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 17: backend, {16}, assembler, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 18: assembler, {17}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 19: linker, {17, 18}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 20: foreach, {16, 19}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 21: file-table-tform, {15, 20}, tempfiletable, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 22: clang-offload-wrapper, {21}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 23: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {22}, object +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 9: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 11: compiler, {10}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 12: linker, {11}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 13: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 14: linker, {12, 13}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 15: sycl-post-link, {14}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 16: file-table-tform, {15}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 17: backend, {16}, assembler, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 18: assembler, {17}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 19: linker, {17, 18}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 20: foreach, {16, 19}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 21: file-table-tform, {15, 20}, tempfiletable, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 22: clang-offload-wrapper, {21}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 23: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {22}, object // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 24: linker, {4}, ir, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 25: sycl-post-link, {24}, tempfiletable, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 26: file-table-tform, {25}, tempfilelist, (device-sycl) @@ -657,21 +657,21 @@ // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 6: compiler, {5}, ir, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 7: backend, {6}, assembler, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 8: assembler, {7}, object, (host-sycl) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 9: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 11: compiler, {10}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 12: linker, {11}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 13: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 14: linker, {12, 13}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 15: sycl-post-link, {14}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 16: file-table-tform, {15}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 17: backend, {16}, assembler, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 18: assembler, {17}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 19: linker, {17, 18}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 20: foreach, {16, 19}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 21: file-table-tform, {15, 20}, tempfiletable, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 22: clang-offload-wrapper, {21}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 23: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {22}, object +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 9: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 11: compiler, {10}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 12: linker, {11}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 13: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 14: linker, {12, 13}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 15: sycl-post-link, {14}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 16: file-table-tform, {15}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 17: backend, {16}, assembler, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 18: assembler, {17}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 19: linker, {17, 18}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 20: foreach, {16, 19}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 21: file-table-tform, {15, 20}, tempfiletable, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 22: clang-offload-wrapper, {21}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 23: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {22}, object // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 24: linker, {4}, ir, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 25: sycl-post-link, {24}, tempfiletable, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 26: file-table-tform, {25}, tempfilelist, (device-sycl) @@ -691,10 +691,10 @@ // RUN: | FileCheck -check-prefix=CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED %s // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 0: input, "[[INPUT:.+\.c]]", c++, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 2: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 3: preprocessor, {2}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 4: compiler, {3}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 5: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {4}, c++-cpp-output +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 2: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 3: preprocessor, {2}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 4: compiler, {3}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 5: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {4}, c++-cpp-output // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 6: compiler, {5}, ir, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 7: backend, {6}, assembler, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 8: assembler, {7}, object, (host-sycl) @@ -708,18 +708,18 @@ // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 16: file-table-tform, {13, 15}, tempfiletable, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 17: clang-offload-wrapper, {16}, object, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 18: offload, "device-sycl (spir64-unknown-unknown)" {17}, object -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 19: linker, {4}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 20: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 21: linker, {19, 20}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 22: sycl-post-link, {21}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 23: file-table-tform, {22}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 24: backend, {23}, assembler, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 25: assembler, {24}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 26: linker, {24, 25}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 27: foreach, {23, 26}, cuda-fatbin, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 28: file-table-tform, {22, 27}, tempfiletable, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 29: clang-offload-wrapper, {28}, object, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 30: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {29}, object +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 19: linker, {4}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 20: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 21: linker, {19, 20}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 22: sycl-post-link, {21}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 23: file-table-tform, {22}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 24: backend, {23}, assembler, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 25: assembler, {24}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 26: linker, {24, 25}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 27: foreach, {23, 26}, cuda-fatbin, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 28: file-table-tform, {22, 27}, tempfiletable, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 29: clang-offload-wrapper, {28}, object, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 30: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {29}, object // CHK-PHASE-MULTI-TARG-BOUND-ARCH-FLIPPED: 31: linker, {8, 18, 30}, image, (host-sycl) /// ########################################################################### diff --git a/clang/test/Driver/sycl-offload-static-lib-2-old-model.cpp b/clang/test/Driver/sycl-offload-static-lib-2-old-model.cpp index 69c8ffd1d0abc..04ed13cb81570 100644 --- a/clang/test/Driver/sycl-offload-static-lib-2-old-model.cpp +++ b/clang/test/Driver/sycl-offload-static-lib-2-old-model.cpp @@ -21,9 +21,9 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -L/dummy/dir %t_lib.lo -### %t_obj.o 2>&1 \ // RUN: | FileCheck %s -check-prefixes=STATIC_LIB,STATIC_LIB_DEF -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -L/dummy/dir %t_lib.a -### %t_obj.o 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -L/dummy/dir %t_lib.lo -### %t_obj.o 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_LIB: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" "-input=[[INPUTO:.+\.o]]" "-output=[[HOSTOBJ:.+\.o]]" "-output={{.+\.o}}" // STATIC_LIB: clang-offload-deps{{.*}} "-targets=[[BUNDLE_TRIPLE]]" // STATIC_LIB_DEF: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" "-input={{.*}}" "-output=[[OUTFILE:.+\.txt]]" @@ -43,7 +43,7 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -Xlinker -Bstatic -L%t_dir -L%S/Inputs/SYCL -llin64 -### %t_obj.o 2>&1 \ // RUN: | FileCheck %s -check-prefixes=STATIC_L_LIB,STATIC_L_LIB_DEF -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -L%S/Inputs/SYCL -llin64 -### %t_obj.o 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=STATIC_L_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=STATIC_L_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_L_LIB: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" "-input=[[INPUTO:.+\.o]]" "-output=[[HOSTOBJ:.+\.o]]" "-output={{.+\.o}}" // STATIC_L_LIB: clang-offload-deps{{.*}} "-targets=[[BUNDLE_TRIPLE]]" // STATIC_L_LIB_DEF: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" "-input={{.*}}liblin64.a" "-output=[[OUTFILE:.+\.txt]]" @@ -71,7 +71,7 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver %t_lib.a -### %t-1.o %t-2.o %t-3.o 2>&1 \ // RUN: | FileCheck %s -check-prefixes=STATIC_LIB_MULTI_O,STATIC_LIB_MULTI_O_DEF -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda %t_lib.a -### %t-1.o %t-2.o %t-3.o 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_MULTI_O,STATIC_LIB_MULTI_O_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=STATIC_LIB_MULTI_O,STATIC_LIB_MULTI_O_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_LIB_MULTI_O: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" "-input={{.+}}-1.o" // STATIC_LIB_MULTI_O: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" "-input={{.+}}-2.o" // STATIC_LIB_MULTI_O: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" "-input={{.+}}-3.o" @@ -123,10 +123,10 @@ // STATIC_LIB_SRC-CUDA: 0: input, "[[INPUTA:.+\.a]]", object, (host-sycl) // STATIC_LIB_SRC-CUDA: 1: input, "[[INPUTC:.+\.cpp]]", c++, (host-sycl) // STATIC_LIB_SRC-CUDA: 2: preprocessor, {1}, c++-cpp-output, (host-sycl) -// STATIC_LIB_SRC-CUDA: 3: input, "[[INPUTC]]", c++, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 5: compiler, {4}, ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {5}, c++-cpp-output +// STATIC_LIB_SRC-CUDA: 3: input, "[[INPUTC]]", c++, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 5: compiler, {4}, ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {5}, c++-cpp-output // STATIC_LIB_SRC-CUDA: 7: compiler, {6}, ir, (host-sycl) // STATIC_LIB_SRC-CUDA: 8: backend, {7}, assembler, (host-sycl) // STATIC_LIB_SRC-CUDA: 9: assembler, {8}, object, (host-sycl) @@ -134,18 +134,18 @@ // STATIC_LIB_SRC-CUDA: 11: clang-offload-deps, {10}, ir, (host-sycl) // STATIC_LIB_SRC-CUDA: 12: input, "[[INPUTA]]", archive // STATIC_LIB_SRC-CUDA: 13: clang-offload-unbundler, {12}, archive -// STATIC_LIB_SRC-CUDA: 14: linker, {5, 11, 13}, ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 15: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 16: linker, {14, 15}, ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 17: sycl-post-link, {16}, ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 18: file-table-tform, {17}, ir, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 19: backend, {18}, assembler, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 20: assembler, {19}, object, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 21: linker, {19, 20}, cuda-fatbin, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 22: foreach, {18, 21}, cuda-fatbin, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 23: file-table-tform, {17, 22}, tempfiletable, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 24: clang-offload-wrapper, {23}, object, (device-sycl, sm_50) -// STATIC_LIB_SRC-CUDA: 25: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {24}, object +// STATIC_LIB_SRC-CUDA: 14: linker, {5, 11, 13}, ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 15: input, "{{.*}}libspirv-nvptx64{{.*}}", ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 16: linker, {14, 15}, ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 17: sycl-post-link, {16}, ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 18: file-table-tform, {17}, ir, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 19: backend, {18}, assembler, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 20: assembler, {19}, object, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 21: linker, {19, 20}, cuda-fatbin, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 22: foreach, {18, 21}, cuda-fatbin, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 23: file-table-tform, {17, 22}, tempfiletable, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 24: clang-offload-wrapper, {23}, object, (device-sycl, sm_75) +// STATIC_LIB_SRC-CUDA: 25: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {24}, object // STATIC_LIB_SRC-CUDA: 26: linker, {0, 9, 25}, image, (host-sycl) /// ########################################################################### @@ -154,7 +154,7 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver %t_lib.a -o output_name -lOpenCL -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC2 -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown -DDEPS_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda %t_lib.a -o output_name -lOpenCL -### %s 2>&1 \ -// RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC2 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 -DDEPS_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC2 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 -DDEPS_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_LIB_SRC2: clang{{.*}} "-emit-obj" {{.*}} "-o" "[[HOSTOBJ:.+\.o]]" // STATIC_LIB_SRC2: ld{{(.exe)?}}" {{.*}} "-o" "[[HOSTEXE:.+\.out]]" {{.*}}"--unresolved-symbols=ignore-all" // STATIC_LIB_SRC2: clang-offload-deps{{.*}} "-targets=[[DEPS_TRIPLE]]" "-outputs=[[OUTDEPS:.+\.bc]]" "[[HOSTEXE]]" @@ -172,7 +172,7 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver %t_lib.a -o output_name -lstdc++ -z relro -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC3 -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda %t_lib.a -o output_name -lstdc++ -z relro -### %s 2>&1 \ -// RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC3 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC3 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_LIB_SRC3: clang-offload-bundler{{.*}} "-type=a{{(oo)*}}" "-targets=[[BUNDLE_TRIPLE]]" // STATIC_LIB_SRC3: llvm-link{{.*}} "{{.*}}" // STATIC_LIB_SRC3: ld{{(.exe)?}}" {{.*}} "-o" "output_name" {{.*}} "-lstdc++" "-z" "relro" @@ -180,8 +180,8 @@ /// Test device linking behaviors with spir64 and nvptx targets // RUN: touch %t_lib.a // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda,spir64 %t_lib.a -### %s 2>&1 \ -// RUN: | FileCheck %s -check-prefix=STATIC_LIB_MIX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 -// STATIC_LIB_MIX: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-nvptx64-nvidia-cuda-sm_50,sycl-spir64-unknown-unknown" {{.*}} "-output=[[NVPTXLIST:.+\.txt]]" "-output=[[SYCLLIST:.+\.txt]]" +// RUN: | FileCheck %s -check-prefix=STATIC_LIB_MIX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 +// STATIC_LIB_MIX: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-nvptx64-nvidia-cuda-sm_75,sycl-spir64-unknown-unknown" {{.*}} "-output=[[NVPTXLIST:.+\.txt]]" "-output=[[SYCLLIST:.+\.txt]]" // STATIC_LIB_MIX: llvm-link{{.*}} "@[[NVPTXLIST]]" // STATIC_LIB_MIX: spirv-to-ir-wrapper{{.*}} "[[SYCLLIST]]" "-o" "[[SYCLLINKLIST:.+\.txt]]" // STATIC_LIB_MIX: llvm-link{{.*}} "@[[SYCLLINKLIST]]" @@ -199,9 +199,9 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -L/dummy/dir %t_obj.o -Wl,@%/t_arg.arg -### 2>&1 \ // RUN: | FileCheck %s -check-prefixes=WHOLE_STATIC_LIB,WHOLE_STATIC_LIB_DEF -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -L/dummy/dir %t_obj.o -Wl,--whole-archive %t_lib.a %t_lib_2.a -Wl,--no-whole-archive -### 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=WHOLE_STATIC_LIB,WHOLE_STATIC_LIB_1,WHOLE_STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=WHOLE_STATIC_LIB,WHOLE_STATIC_LIB_1,WHOLE_STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -L/dummy/dir %t_obj.o -Wl,@%/t_arg.arg -### 2>&1 \ -// RUN: | FileCheck %s -check-prefixes=WHOLE_STATIC_LIB,WHOLE_STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefixes=WHOLE_STATIC_LIB,WHOLE_STATIC_LIB_NVPTX -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // WHOLE_STATIC_LIB: clang-offload-bundler{{.*}} "-type=o" "-targets={{.*}},[[BUNDLE_TRIPLE]]" // WHOLE_STATIC_LIB_DEF: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" "-input=[[INPUTA:.+\.a]]" "-output=[[OUTPUTA:.+\.txt]]" // WHOLE_STATIC_LIB_DEF: llvm-foreach{{.*}} "--out-ext=txt" "--in-file-list=[[OUTPUTA]]" "--in-replace=[[OUTPUTA]]" "--out-file-list=[[OUTLISTA:.+\.txt]]" "--out-replace=[[OUTLISTA]]" "--" {{.*}}spirv-to-ir-wrapper{{.*}} "[[OUTPUTA]]" "-o" "[[OUTLISTA]]" @@ -231,9 +231,9 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-instrument-device-code --no-offloadlib -L/dummy/dir %t_lib.lo -### 2>&1 \ // RUN: | FileCheck %s -check-prefix=STATIC_LIB_NOSRC -check-prefix=STATIC_LIB_NOSRC-SPIR -DTARGET=spir64 -DBUNDLE_TRIPLE=sycl-spir64-unknown-unknown // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code --no-offloadlib -L/dummy/dir %t_lib.a -### 2>&1 \ -// RUN: | FileCheck %s -check-prefix=STATIC_LIB_NOSRC -check-prefix=STATIC_LIB_NOSRC-CUDA -DTARGET=nvptx64 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefix=STATIC_LIB_NOSRC -check-prefix=STATIC_LIB_NOSRC-CUDA -DTARGET=nvptx64 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl --no-offload-new-driver -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-instrument-device-code --no-offloadlib -L/dummy/dir %t_lib.lo -### 2>&1 \ -// RUN: | FileCheck %s -check-prefix=STATIC_LIB_NOSRC -check-prefix=STATIC_LIB_NOSRC-CUDA -DTARGET=nvptx64 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 +// RUN: | FileCheck %s -check-prefix=STATIC_LIB_NOSRC -check-prefix=STATIC_LIB_NOSRC-CUDA -DTARGET=nvptx64 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_75 // STATIC_LIB_NOSRC-SPIR: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" "-input={{.*}}_lib.{{(a|lo)}}" "-output=[[DEVICELIB:.+\.txt]]" "-unbundle" // STATIC_LIB_NOSRC-SPIR: llvm-foreach{{.*}}spirv-to-ir-wrapper{{.*}} "[[DEVICELIB]]" "-o" "[[DEVICELIST:.+\.txt]]" // STATIC_LIB_NOSRC-SPIR: llvm-link{{.*}} "@[[DEVICELIST]]" "-o" "[[BCFILE:.+\.bc]]" diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index ce045accaced1..e6ce6c06d9779 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -400,11 +400,11 @@ // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 0: input, "[[INPUT:.+\.c]]", c++, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 2: compiler, {1}, ir, (host-sycl) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 3: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 5: compiler, {4}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 6: backend, {5}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 7: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {6}, ir +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 3: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 5: compiler, {4}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 6: backend, {5}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH: 7: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {6}, ir // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 8: input, "[[INPUT]]", c++, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 9: preprocessor, {8}, c++-cpp-output, (device-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH: 10: compiler, {9}, ir, (device-sycl) @@ -425,11 +425,11 @@ // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 0: input, "[[INPUT:.+\.c]]", c++, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 2: compiler, {1}, ir, (host-sycl) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 3: input, "[[INPUT]]", c++, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 5: compiler, {4}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 6: backend, {5}, ir, (device-sycl, sm_50) -// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 7: offload, "device-sycl (nvptx64-nvidia-cuda:sm_50)" {6}, ir +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 3: input, "[[INPUT]]", c++, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 5: compiler, {4}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 6: backend, {5}, ir, (device-sycl, sm_75) +// CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 7: offload, "device-sycl (nvptx64-nvidia-cuda:sm_75)" {6}, ir // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 8: input, "[[INPUT]]", c++, (device-sycl, skl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 9: preprocessor, {8}, c++-cpp-output, (device-sycl, skl) // CHK-PHASE-MULTI-TARG-BOUND-ARCH2: 10: compiler, {9}, ir, (device-sycl, skl) diff --git a/clang/test/Driver/sycl-target-mismatch-nvptx.cpp b/clang/test/Driver/sycl-target-mismatch-nvptx.cpp index b871bcbfec5b2..30b6915f3aa3e 100644 --- a/clang/test/Driver/sycl-target-mismatch-nvptx.cpp +++ b/clang/test/Driver/sycl-target-mismatch-nvptx.cpp @@ -21,7 +21,7 @@ // RUN: %S/Inputs/SYCL/objnvptx64-sm_50.o -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=NVPTX64_MATCH_DIAG // RUN: %clangxx -fsycl -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda \ -// RUN: %S/Inputs/SYCL/objnvptx64-sm_50.o -### %s 2>&1 \ +// RUN: %S/Inputs/SYCL/objnvptx64-sm_75.o -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=NVPTX64_MATCH_DIAG // RUN: %clangxx -fsycl -fno-sycl-libspirv -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_60 \ // RUN: -Wno-sycl-target %S/Inputs/SYCL/objnvptx64-sm_50.o -### %s 2>&1 \ From 836e86cce2f6069a9164f23d311f7b0c5533e405 Mon Sep 17 00:00:00 2001 From: Rafal Rudnicki Date: Thu, 4 Dec 2025 09:59:21 +0100 Subject: [PATCH 4/5] add support for 3-operand atomic intrinsic --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 94 +++++++++++++++++++++--- 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 40ab9fb562a86..2a9daf537e09e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2335,7 +2335,82 @@ multiclass ATOM2S_impl; + t, !listconcat(Preds, [hasAtomScope, hasMemoryOrdering])>; + } + } + } +} + +// Helper for 3-operand atomic intrinsic patterns (like CAS with semantics) +multiclass ATOM3_INTRINSIC_PATTERN Preds> { + defvar intrinsic = !cast( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(SemStr), "", "_" # SemStr) + # !if(!eq(ScopeStr, "gpu"), "", "_" # ScopeStr)); + + defvar ordering = !cond( + !eq(SemStr, "acquire"): Ordering_acquire, + !eq(SemStr, "release"): Ordering_release, + !eq(SemStr, "acq_rel"): Ordering_acquire_release, + true: Ordering_not_atomic); + + defvar scope_pat = !cond( + !eq(ScopeStr, "gpu"): Scope_device, + !eq(ScopeStr, "cta"): Scope_cta, + !eq(ScopeStr, "sys"): Scope_sys, + true: Scope_device); + + defvar space_pat = !cond( + !eq(SpaceStr, "gen"): AddrSpace_gen, + !eq(SpaceStr, "global"): AddrSpace_global, + !eq(SpaceStr, "shared"): AddrSpace_shared, + true: AddrSpace_gen); + + let Predicates = Preds in { + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), + (!cast(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, ordering, scope_pat, space_pat)>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)), + (!cast(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, ordering, scope_pat, space_pat)>; + + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), ordering, scope_pat, space_pat)>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))), + (!cast(InstructionName # "_ii") ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), ordering, scope_pat, space_pat)>; + } +} + +multiclass ATOM3S_impl Preds> { + // Similar to ATOM2S_impl but for 3-operand atomics like CAS + foreach scope = ["cta", "sys"] in { + foreach space = ["gen", "global", "shared"] in { + defm _#scope#space : ATOM3_INTRINSIC_PATTERN; + } + } + + foreach scope = ["gpu"] in { + foreach space = ["gen", "global", "shared"] in { + defm _#scope#space : ATOM3_INTRINSIC_PATTERN; + } + } + + // Intrinsics with semantics + foreach sem = ["acquire", "release", "acq_rel"] in { + foreach scope = ["gpu", "cta", "sys"] in { + foreach space = ["gen", "global", "shared"] in { + defm _#sem#scope#space : + ATOM3_INTRINSIC_PATTERN; } } } @@ -2401,12 +2476,9 @@ multiclass ATOM2_incdec_impl { // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN; - defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN; - defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN; -// TODO: rewrite these two too: - // defm _f32 : ATOM3S_impl; - // defm _f64 : ATOM3S_impl; + defm _b16 : ATOM3S_impl; + defm _b32 : ATOM3S_impl; + defm _b64 : ATOM3S_impl; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; @@ -2472,9 +2544,9 @@ multiclass ATOM_LdA_impl Preds> { defm _relaxed_ : ATOM_LdN_spaces_impl; + regT, regclass, !listconcat(Preds,[hasMemoryOrdering])>; defm _acquire_ : ATOM_LdN_spaces_impl; + regT, regclass, !listconcat(Preds,[hasMemoryOrdering])>; } // Constructs variants for different scopes of atomic op. @@ -2557,9 +2629,9 @@ multiclass ATOM_StA_impl Preds> { defm _relaxed_ : ATOM_StN_spaces_impl; + regT, regclass, ImmType, Imm, ImmTy, !listconcat(Preds,[hasMemoryOrdering])>; defm _release_ : ATOM_StN_spaces_impl; + regT, regclass, ImmType, Imm, ImmTy, !listconcat(Preds,[hasMemoryOrdering])>; } multiclass ATOM_StS_impl Date: Thu, 4 Dec 2025 10:10:04 +0100 Subject: [PATCH 5/5] add support for tanh.approx.f16/f16x2 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 28 ++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b0bedf3a140e7..097f1a46f7ae9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -121,6 +121,7 @@ class callSubtarget : Predicate<"Subtarget->" # Subtarge def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; def hasAtomSemantics : Predicate<"Subtarget->hasAtomSemantics()">; +def hasMemoryOrdering : Predicate<"Subtarget->hasMemoryOrdering()">; def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; def hasAtomSwap128 : Predicate<"Subtarget->hasAtomSwap128()">; @@ -1226,11 +1227,32 @@ def COS_APPROX_f32 : BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), "cos.approx$ftz.f32", [(set f32:$dst, (UnaryOpAllowsApproxFn f32:$src))]>; -def TANH_APPROX_f32 : - BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f32", - [(set f32:$dst, (UnaryOpAllowsApproxFn f32:$src))]>, + +// NOTE: tanh.approx doesn't support the FTZ flag for f16/f16x2 +def TANH_APPROX_f16 : + BasicNVPTXInst<(outs B16:$dst), (ins B16:$src), "tanh.approx.f16", + [(set f16:$dst, (UnaryOpAllowsApproxFn f16:$src))]>, + Requires<[hasPTX<70>, hasSM<75>]>; + +def TANH_APPROX_f16x2 : + BasicNVPTXInst<(outs B32:$dst), (ins B32:$src), "tanh.approx.f16x2", + [(set v2f16:$dst, (UnaryOpAllowsApproxFn v2f16:$src))]>, Requires<[hasPTX<70>, hasSM<75>]>; +def TANH_APPROX_f32 : + BasicFlagsNVPTXInst<(outs B32:$dst), (ins B32:$src), (ins FTZFlag:$ftz), + "tanh.approx$ftz.f32", + [(set f32:$dst, (UnaryOpAllowsApproxFn f32:$src))]>, + Requires<[hasPTX<70>, hasSM<75>]>; + +// Patterns for NVVM tanh intrinsics +def : Pat<(f16 (int_nvvm_tanh_approx_f16 f16:$a)), + (TANH_APPROX_f16 f16:$a)>; +def : Pat<(v2f16 (int_nvvm_tanh_approx_f16x2 v2f16:$a)), + (TANH_APPROX_f16x2 v2f16:$a)>; +def : Pat<(f32 (int_nvvm_tanh_approx_f f32:$a)), + (TANH_APPROX_f32 f32:$a, 0)>; + //----------------------------------- // Bitwise operations //-----------------------------------