Skip to content

Commit 8802b43

Browse files
author
shijiashuai
committed
build: pin CUDA toolchain to local 12.8 install
Pin the project to /usr/local/cuda/bin/nvcc and require CUDA 12.8 for configure, Python packaging, and preset-driven builds. Remove the older CUDA 10.x compatibility path and update the README, install guide, and troubleshooting docs to match the local toolchain assumptions.
1 parent 4992e1b commit 8802b43

File tree

10 files changed

+88
-130
lines changed

10 files changed

+88
-130
lines changed

CMakeLists.txt

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ option(TC_ENABLE_CUDA "Enable CUDA support" ON)
3434
set(TC_CUDA_ENABLED OFF)
3535

3636
if(TC_ENABLE_CUDA)
37+
if(NOT DEFINED CMAKE_CUDA_COMPILER)
38+
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc" CACHE FILEPATH "Path to nvcc")
39+
endif()
40+
3741
include(CheckLanguage)
3842
check_language(CUDA)
3943
if(CMAKE_CUDA_COMPILER)
@@ -42,29 +46,19 @@ if(TC_ENABLE_CUDA)
4246
find_package(CUDAToolkit REQUIRED)
4347

4448
# Check CUDA version
45-
if(CUDAToolkit_VERSION VERSION_LESS "10.1")
46-
message(FATAL_ERROR "CUDA 10.1 or higher is required. Found: ${CUDAToolkit_VERSION}")
49+
if(CUDAToolkit_VERSION VERSION_LESS "12.8")
50+
message(FATAL_ERROR "CUDA 12.8 or higher is required. Found: ${CUDAToolkit_VERSION}")
4751
endif()
4852

53+
message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
4954
message(STATUS "CUDA version: ${CUDAToolkit_VERSION}")
5055

51-
# Set a CUDA dialect compatible with the detected toolkit.
52-
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
53-
set(CMAKE_CUDA_STANDARD 17)
54-
else()
55-
set(CMAKE_CUDA_STANDARD 14)
56-
endif()
56+
set(CMAKE_CUDA_STANDARD 17)
5757
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
5858

59-
# CUDA architectures - keep defaults compatible with the detected toolkit.
59+
# CUDA architectures
6060
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
61-
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
62-
set(TC_DEFAULT_CUDA_ARCHITECTURES "75;80;86;89;90")
63-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
64-
set(TC_DEFAULT_CUDA_ARCHITECTURES "70;75;80;86")
65-
else()
66-
set(TC_DEFAULT_CUDA_ARCHITECTURES "70;75")
67-
endif()
61+
set(TC_DEFAULT_CUDA_ARCHITECTURES "75;80;86;89;90")
6862
set(CMAKE_CUDA_ARCHITECTURES "${TC_DEFAULT_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures" FORCE)
6963
endif()
7064

@@ -73,17 +67,10 @@ if(TC_ENABLE_CUDA)
7367
add_compile_definitions(TC_CUDA_13=1 TC_CUDA_12=1 TC_CUDA_11=1 TC_CUDA_10=1)
7468
add_compile_definitions(TC_HAS_TMA=1 TC_HAS_WGMMA=1 TC_HAS_FP8=1 TC_HAS_BF16=1 TC_HAS_WMMA=1)
7569
message(STATUS "CUDA 13.x features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
76-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
70+
else()
7771
add_compile_definitions(TC_CUDA_12=1 TC_CUDA_11=1 TC_CUDA_10=1)
7872
add_compile_definitions(TC_HAS_TMA=1 TC_HAS_WGMMA=1 TC_HAS_FP8=1 TC_HAS_BF16=1 TC_HAS_WMMA=1)
79-
message(STATUS "CUDA 12.x features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
80-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
81-
add_compile_definitions(TC_CUDA_11=1 TC_CUDA_10=1)
82-
add_compile_definitions(TC_HAS_BF16=1 TC_HAS_WMMA=1)
83-
message(STATUS "CUDA 11.x features enabled (BF16, WMMA)")
84-
else()
85-
add_compile_definitions(TC_CUDA_10=1)
86-
message(STATUS "CUDA 10.x compatibility mode enabled")
73+
message(STATUS "CUDA 12.8+ features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
8774
endif()
8875
else()
8976
message(WARNING "Failed to find nvcc (CUDA compiler). Configure will continue without CUDA. To enable CUDA, set CUDAToolkit_ROOT or ensure nvcc is on PATH.")

CMakePresets.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
"generator": "Ninja",
1313
"binaryDir": "${sourceDir}/build/${presetName}",
1414
"cacheVariables": {
15-
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON"
15+
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
16+
"CMAKE_CUDA_COMPILER": "/usr/local/cuda/bin/nvcc",
17+
"CUDAToolkit_ROOT": "/usr/local/cuda"
1618
}
1719
},
1820
{

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ English | [简体中文](README.zh-CN.md) | [Docs](https://lessup.github.io/mode
55
[![CI](https://github.com/LessUp/modern-ai-kernels/actions/workflows/ci.yml/badge.svg)](https://github.com/LessUp/modern-ai-kernels/actions/workflows/ci.yml)
66
[![Docs](https://img.shields.io/badge/Docs-GitHub%20Pages-blue?logo=github)](https://lessup.github.io/modern-ai-kernels/)
77
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
8-
![CUDA](https://img.shields.io/badge/CUDA-10.1%2B-76B900?logo=nvidia&logoColor=white)
8+
![CUDA](https://img.shields.io/badge/CUDA-12.8%2B-76B900?logo=nvidia&logoColor=white)
99
![C++](https://img.shields.io/badge/C%2B%2B-17-00599C?logo=c%2B%2B&logoColor=white)
1010
![CMake](https://img.shields.io/badge/CMake-3.20+-064F8C?logo=cmake&logoColor=white)
1111
![Python](https://img.shields.io/badge/Python-3.8+-3776AB?logo=python&logoColor=white)
@@ -41,8 +41,8 @@ python -c "import tensorcraft_ops as tc; print(tc.__version__)"
4141

4242
## Build Notes
4343

44-
- Minimum supported CUDA toolkit is `10.1`
45-
- CUDA `11.x`/`12.x` unlock more optimized feature paths than CUDA `10.x`
44+
- This repository targets the local CUDA `12.8` toolkit at `/usr/local/cuda/bin/nvcc`
45+
- CMake presets and Python builds pin `CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc`
4646
- If CUDA is unavailable, CMake disables tests, benchmarks, and Python bindings automatically
4747
- If build pressure is high, prefer `dev`/`python-dev`, keep `--parallel` low, and set a single `CMAKE_CUDA_ARCHITECTURES` value for your GPU
4848

docs/INSTALL.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ This guide documents the current recommended build paths for TensorCraft-HPC.
66

77
### Required
88

9-
- **CUDA Toolkit**: `10.1` or later
9+
- **CUDA Toolkit**: `12.8` or later
10+
- **nvcc path**: `/usr/local/cuda/bin/nvcc`
1011
- **CMake**: `3.20` or later
1112
- **C++ Compiler**: C++17-capable host compiler
1213
- **NVIDIA GPU**: recommended for tests and Python bindings
@@ -101,9 +102,9 @@ Adjust `CMAKE_CUDA_ARCHITECTURES` to match your GPU.
101102

102103
## Compatibility Notes
103104

104-
- TensorCraft-HPC now supports CUDA `10.1+`
105-
- On CUDA `10.x`, device compilation uses a CUDA-compatible dialect while host code remains on C++17
106-
- CUDA `11.x` and `12.x` enable newer feature paths beyond the CUDA `10.x` compatibility path
105+
- TensorCraft-HPC now targets CUDA `12.8`
106+
- The repository defaults to `/usr/local/cuda/bin/nvcc`
107+
- The codebase assumes full C++17 support in both host and CUDA compilation paths
107108

108109
## Verification
109110

docs/TROUBLESHOOTING.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ cmake --preset cpu-smoke
3030
If you do need CUDA, point CMake to the toolkit explicitly:
3131

3232
```bash
33-
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
33+
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCUDAToolkit_ROOT=/usr/local/cuda
3434
```
3535

3636
## Unsupported GPU architecture
@@ -126,14 +126,14 @@ TensorCraft can still build successfully, but you may want to clean that environ
126126

127127
## CUDA version compatibility
128128

129-
| Capability | Minimum CUDA |
130-
|------------|--------------|
131-
| Basic kernels and core build | 10.1 |
132-
| BF16-related paths | 11.x |
133-
| FP8-related paths | 12.x |
134-
| Hopper-specific features | 12.x |
129+
| Capability | Required CUDA |
130+
|------------|---------------|
131+
| Basic kernels and core build | 12.8 |
132+
| BF16-related paths | 12.8 |
133+
| FP8-related paths | 12.8 |
134+
| Hopper-specific features | 12.8 |
135135

136-
If you are on CUDA `10.x`, prefer the compatibility path and avoid assuming newer CUDA features are available.
136+
This repository now assumes the local CUDA `12.8` toolchain and no longer carries a CUDA 10.x compatibility path.
137137

138138
## GPU runtime path vs CI
139139

include/tensorcraft/core/features.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,8 @@
2323
#define TC_CPP17 1
2424
#elif __cplusplus >= 201703L
2525
#define TC_CPP17 1
26-
#elif defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ < 11) && (__cplusplus >= 201402L)
27-
#define TC_CUDA_LEGACY_CXX 1
2826
#else
29-
#error "TensorCraft requires C++17 or later (CUDA 10.x device code may use C++14)"
27+
#error "TensorCraft requires C++17 or later"
3028
#endif
3129

3230
// ============================================================================
@@ -49,10 +47,8 @@
4947
#elif __CUDACC_VER_MAJOR__ >= 11
5048
#define TC_CUDA_11 1
5149
#define TC_CUDA_10 1
52-
#elif __CUDACC_VER_MAJOR__ >= 10
53-
#define TC_CUDA_10 1
5450
#else
55-
#error "TensorCraft requires CUDA 10.1 or later"
51+
#error "TensorCraft requires CUDA 12.8 or later"
5652
#endif
5753

5854
// Feature availability based on CUDA version

include/tensorcraft/core/type_traits.hpp

Lines changed: 40 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ constexpr bool is_fp8_v = false;
8585
* @brief Type trait to check if T is any floating point type (including half)
8686
*/
8787
template<typename T>
88-
struct is_floating : std::integral_constant<bool,
89-
std::is_floating_point<T>::value || is_half_v<T> || is_fp8_v<T>
88+
struct is_floating : std::bool_constant<
89+
std::is_floating_point_v<T> || is_half_v<T> || is_fp8_v<T>
9090
> {};
9191

9292
template<typename T>
@@ -209,40 +209,36 @@ constexpr size_t type_bits_v = type_bits<T>::value;
209209
*/
210210
template<typename T>
211211
TC_HOST_DEVICE_INLINE float to_float(T val) {
212-
return static_cast<float>(val);
213-
}
214-
215-
template<>
216-
TC_HOST_DEVICE_INLINE float to_float<__half>(__half val) {
217-
return __half2float(val);
218-
}
219-
212+
if constexpr (std::is_same_v<T, __half>) {
213+
return __half2float(val);
214+
}
220215
#if defined(TC_HAS_BF16)
221-
template<>
222-
TC_HOST_DEVICE_INLINE float to_float<__nv_bfloat16>(__nv_bfloat16 val) {
223-
return __bfloat162float(val);
224-
}
216+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
217+
return __bfloat162float(val);
218+
}
225219
#endif
220+
else {
221+
return static_cast<float>(val);
222+
}
223+
}
226224

227225
/**
228226
* @brief Convert float to target type
229227
*/
230228
template<typename T>
231229
TC_HOST_DEVICE_INLINE T from_float(float val) {
232-
return static_cast<T>(val);
233-
}
234-
235-
template<>
236-
TC_HOST_DEVICE_INLINE __half from_float<__half>(float val) {
237-
return __float2half(val);
238-
}
239-
230+
if constexpr (std::is_same_v<T, __half>) {
231+
return __float2half(val);
232+
}
240233
#if defined(TC_HAS_BF16)
241-
template<>
242-
TC_HOST_DEVICE_INLINE __nv_bfloat16 from_float<__nv_bfloat16>(float val) {
243-
return __float2bfloat16(val);
244-
}
234+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
235+
return __float2bfloat16(val);
236+
}
245237
#endif
238+
else {
239+
return static_cast<T>(val);
240+
}
241+
}
246242

247243
// ============================================================================
248244
// Data Type Enumeration
@@ -266,45 +262,26 @@ enum class DataType {
266262
* @brief Get DataType enum from C++ type
267263
*/
268264
template<typename T>
269-
struct dtype_of {
270-
static constexpr DataType value = DataType::FP32;
271-
};
272-
273-
template<>
274-
struct dtype_of<float> {
275-
static constexpr DataType value = DataType::FP32;
276-
};
277-
278-
template<>
279-
struct dtype_of<__half> {
280-
static constexpr DataType value = DataType::FP16;
281-
};
282-
265+
constexpr DataType get_dtype() {
266+
if constexpr (std::is_same_v<T, float>) {
267+
return DataType::FP32;
268+
} else if constexpr (std::is_same_v<T, __half>) {
269+
return DataType::FP16;
270+
}
283271
#if defined(TC_HAS_BF16)
284-
template<>
285-
struct dtype_of<__nv_bfloat16> {
286-
static constexpr DataType value = DataType::BF16;
287-
};
272+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
273+
return DataType::BF16;
274+
}
288275
#endif
289-
290-
template<>
291-
struct dtype_of<int8_t> {
292-
static constexpr DataType value = DataType::INT8;
293-
};
294-
295-
template<>
296-
struct dtype_of<int32_t> {
297-
static constexpr DataType value = DataType::INT32;
298-
};
299-
300-
template<>
301-
struct dtype_of<int64_t> {
302-
static constexpr DataType value = DataType::INT64;
303-
};
304-
305-
template<typename T>
306-
constexpr DataType get_dtype() {
307-
return dtype_of<T>::value;
276+
else if constexpr (std::is_same_v<T, int8_t>) {
277+
return DataType::INT8;
278+
} else if constexpr (std::is_same_v<T, int32_t>) {
279+
return DataType::INT32;
280+
} else if constexpr (std::is_same_v<T, int64_t>) {
281+
return DataType::INT64;
282+
} else {
283+
return DataType::FP32;
284+
}
308285
}
309286

310287
/**

include/tensorcraft/memory/aligned_vector.hpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,15 +137,12 @@ TC_HOST_DEVICE_INLINE bool is_aligned(const T* ptr) {
137137
* Returns the largest vector size that fits in 16 bytes (LDS.128)
138138
*/
139139
template<typename T>
140-
struct optimal_vec_size_value : std::integral_constant<int,
141-
(sizeof(T) == 1 ? 8 :
142-
sizeof(T) == 2 ? 8 :
143-
sizeof(T) == 4 ? 4 :
144-
sizeof(T) == 8 ? 2 : 1)> {};
145-
146-
template<typename T>
147140
constexpr int optimal_vec_size() {
148-
return optimal_vec_size_value<T>::value;
141+
if constexpr (sizeof(T) == 1) return 8;
142+
else if constexpr (sizeof(T) == 2) return 8;
143+
else if constexpr (sizeof(T) == 4) return 4;
144+
else if constexpr (sizeof(T) == 8) return 2;
145+
else return 1;
149146
}
150147

151148
// ============================================================================

include/tensorcraft/memory/tensor.hpp

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,14 @@ class Tensor {
235235
*/
236236
void fill(T value) {
237237
if (size_ == 0 || !data_) return;
238-
fill_impl(value, std::integral_constant<bool, sizeof(T) == 1>{});
238+
if constexpr (sizeof(T) == 1) {
239+
TC_CUDA_CHECK(cudaMemset(data_, static_cast<int>(value), bytes()));
240+
} else {
241+
constexpr int block = 256;
242+
int grid = static_cast<int>((size_ + block - 1) / block);
243+
detail::fill_kernel<<<grid, block>>>(data_, value, size_);
244+
TC_CUDA_CHECK(cudaGetLastError());
245+
}
239246
}
240247

241248
/**
@@ -308,17 +315,6 @@ class Tensor {
308315
}
309316

310317
private:
311-
void fill_impl(T value, std::true_type) {
312-
TC_CUDA_CHECK(cudaMemset(data_, static_cast<int>(value), bytes()));
313-
}
314-
315-
void fill_impl(T value, std::false_type) {
316-
constexpr int block = 256;
317-
int grid = static_cast<int>((size_ + block - 1) / block);
318-
detail::fill_kernel<<<grid, block>>>(data_, value, size_);
319-
TC_CUDA_CHECK(cudaGetLastError());
320-
}
321-
322318
static size_type compute_size(const shape_type& shape) {
323319
if (shape.empty()) return 0;
324320
return std::accumulate(shape.begin(), shape.end(),

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ TC_ENABLE_CUDA = "ON"
3131
TC_BUILD_PYTHON = "ON"
3232
TC_BUILD_TESTS = "OFF"
3333
TC_BUILD_BENCHMARKS = "OFF"
34+
CMAKE_CUDA_COMPILER = "/usr/local/cuda/bin/nvcc"
35+
CUDAToolkit_ROOT = "/usr/local/cuda"
3436
CMAKE_CUDA_ARCHITECTURES = "75"
3537
CMAKE_CUDA_FLAGS = "-lineinfo -Xcompiler -O2"
3638
CMAKE_BUILD_TYPE = "RelWithDebInfo"

0 commit comments

Comments
 (0)