Skip to content

Commit 8802b43

Browse files
author
shijiashuai
committed
build: pin CUDA toolchain to local 12.8 install
Pin the project to /usr/local/cuda/bin/nvcc and require CUDA 12.8 for configure, Python packaging, and preset-driven builds. Remove the older CUDA 10.x compatibility path and update the README, install guide, and troubleshooting docs to match the local toolchain assumptions.
1 parent 4992e1b commit 8802b43

File tree

10 files changed

+88
-130
lines changed

10 files changed

+88
-130
lines changed

CMakeLists.txt

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ option(TC_ENABLE_CUDA "Enable CUDA support" ON)
3434
set(TC_CUDA_ENABLED OFF)
3535

3636
if(TC_ENABLE_CUDA)
37+
if(NOT DEFINED CMAKE_CUDA_COMPILER)
38+
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc" CACHE FILEPATH "Path to nvcc")
39+
endif()
40+
3741
include(CheckLanguage)
3842
check_language(CUDA)
3943
if(CMAKE_CUDA_COMPILER)
@@ -42,29 +46,19 @@ if(TC_ENABLE_CUDA)
4246
find_package(CUDAToolkit REQUIRED)
4347

4448
# Check CUDA version
45-
if(CUDAToolkit_VERSION VERSION_LESS "10.1")
46-
message(FATAL_ERROR "CUDA 10.1 or higher is required. Found: ${CUDAToolkit_VERSION}")
49+
if(CUDAToolkit_VERSION VERSION_LESS "12.8")
50+
message(FATAL_ERROR "CUDA 12.8 or higher is required. Found: ${CUDAToolkit_VERSION}")
4751
endif()
4852

53+
message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")
4954
message(STATUS "CUDA version: ${CUDAToolkit_VERSION}")
5055

51-
# Set a CUDA dialect compatible with the detected toolkit.
52-
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
53-
set(CMAKE_CUDA_STANDARD 17)
54-
else()
55-
set(CMAKE_CUDA_STANDARD 14)
56-
endif()
56+
set(CMAKE_CUDA_STANDARD 17)
5757
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
5858

59-
# CUDA architectures - keep defaults compatible with the detected toolkit.
59+
# CUDA architectures
6060
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
61-
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
62-
set(TC_DEFAULT_CUDA_ARCHITECTURES "75;80;86;89;90")
63-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
64-
set(TC_DEFAULT_CUDA_ARCHITECTURES "70;75;80;86")
65-
else()
66-
set(TC_DEFAULT_CUDA_ARCHITECTURES "70;75")
67-
endif()
61+
set(TC_DEFAULT_CUDA_ARCHITECTURES "75;80;86;89;90")
6862
set(CMAKE_CUDA_ARCHITECTURES "${TC_DEFAULT_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures" FORCE)
6963
endif()
7064

@@ -73,17 +67,10 @@ if(TC_ENABLE_CUDA)
7367
add_compile_definitions(TC_CUDA_13=1 TC_CUDA_12=1 TC_CUDA_11=1 TC_CUDA_10=1)
7468
add_compile_definitions(TC_HAS_TMA=1 TC_HAS_WGMMA=1 TC_HAS_FP8=1 TC_HAS_BF16=1 TC_HAS_WMMA=1)
7569
message(STATUS "CUDA 13.x features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
76-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.0")
70+
else()
7771
add_compile_definitions(TC_CUDA_12=1 TC_CUDA_11=1 TC_CUDA_10=1)
7872
add_compile_definitions(TC_HAS_TMA=1 TC_HAS_WGMMA=1 TC_HAS_FP8=1 TC_HAS_BF16=1 TC_HAS_WMMA=1)
79-
message(STATUS "CUDA 12.x features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
80-
elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.0")
81-
add_compile_definitions(TC_CUDA_11=1 TC_CUDA_10=1)
82-
add_compile_definitions(TC_HAS_BF16=1 TC_HAS_WMMA=1)
83-
message(STATUS "CUDA 11.x features enabled (BF16, WMMA)")
84-
else()
85-
add_compile_definitions(TC_CUDA_10=1)
86-
message(STATUS "CUDA 10.x compatibility mode enabled")
73+
message(STATUS "CUDA 12.8+ features enabled (TMA, WGMMA, FP8, BF16, WMMA)")
8774
endif()
8875
else()
8976
message(WARNING "Failed to find nvcc (CUDA compiler). Configure will continue without CUDA. To enable CUDA, set CUDAToolkit_ROOT or ensure nvcc is on PATH.")

CMakePresets.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
"generator": "Ninja",
1313
"binaryDir": "${sourceDir}/build/${presetName}",
1414
"cacheVariables": {
15-
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON"
15+
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
16+
"CMAKE_CUDA_COMPILER": "/usr/local/cuda/bin/nvcc",
17+
"CUDAToolkit_ROOT": "/usr/local/cuda"
1618
}
1719
},
1820
{

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ English | [简体中文](README.zh-CN.md) | [Docs](https://lessup.github.io/mode
55
[![CI](https://github.com/LessUp/modern-ai-kernels/actions/workflows/ci.yml/badge.svg)](https://github.com/LessUp/modern-ai-kernels/actions/workflows/ci.yml)
66
[![Docs](https://img.shields.io/badge/Docs-GitHub%20Pages-blue?logo=github)](https://lessup.github.io/modern-ai-kernels/)
77
[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
8-
![CUDA](https://img.shields.io/badge/CUDA-10.1%2B-76B900?logo=nvidia&logoColor=white)
8+
![CUDA](https://img.shields.io/badge/CUDA-12.8%2B-76B900?logo=nvidia&logoColor=white)
99
![C++](https://img.shields.io/badge/C%2B%2B-17-00599C?logo=c%2B%2B&logoColor=white)
1010
![CMake](https://img.shields.io/badge/CMake-3.20+-064F8C?logo=cmake&logoColor=white)
1111
![Python](https://img.shields.io/badge/Python-3.8+-3776AB?logo=python&logoColor=white)
@@ -41,8 +41,8 @@ python -c "import tensorcraft_ops as tc; print(tc.__version__)"
4141

4242
## Build Notes
4343

44-
- Minimum supported CUDA toolkit is `10.1`
45-
- CUDA `11.x`/`12.x` unlock more optimized feature paths than CUDA `10.x`
44+
- This repository targets the local CUDA `12.8` toolkit at `/usr/local/cuda/bin/nvcc`
45+
- CMake presets and Python builds pin `CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc`
4646
- If CUDA is unavailable, CMake disables tests, benchmarks, and Python bindings automatically
4747
- If build pressure is high, prefer `dev`/`python-dev`, keep `--parallel` low, and set a single `CMAKE_CUDA_ARCHITECTURES` value for your GPU
4848

docs/INSTALL.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ This guide documents the current recommended build paths for TensorCraft-HPC.
66

77
### Required
88

9-
- **CUDA Toolkit**: `10.1` or later
9+
- **CUDA Toolkit**: `12.8` or later
10+
- **nvcc path**: `/usr/local/cuda/bin/nvcc`
1011
- **CMake**: `3.20` or later
1112
- **C++ Compiler**: C++17-capable host compiler
1213
- **NVIDIA GPU**: recommended for tests and Python bindings
@@ -101,9 +102,9 @@ Adjust `CMAKE_CUDA_ARCHITECTURES` to match your GPU.
101102

102103
## Compatibility Notes
103104

104-
- TensorCraft-HPC now supports CUDA `10.1+`
105-
- On CUDA `10.x`, device compilation uses a CUDA-compatible dialect while host code remains on C++17
106-
- CUDA `11.x` and `12.x` enable newer feature paths beyond the CUDA `10.x` compatibility path
105+
- TensorCraft-HPC now targets CUDA `12.8`
106+
- The repository defaults to `/usr/local/cuda/bin/nvcc`
107+
- The codebase assumes full C++17 support in both host and CUDA compilation paths
107108

108109
## Verification
109110

docs/TROUBLESHOOTING.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ cmake --preset cpu-smoke
3030
If you do need CUDA, point CMake to the toolkit explicitly:
3131

3232
```bash
33-
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
33+
cmake -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCUDAToolkit_ROOT=/usr/local/cuda
3434
```
3535

3636
## Unsupported GPU architecture
@@ -126,14 +126,14 @@ TensorCraft can still build successfully, but you may want to clean that environ
126126

127127
## CUDA version compatibility
128128

129-
| Capability | Minimum CUDA |
130-
|------------|--------------|
131-
| Basic kernels and core build | 10.1 |
132-
| BF16-related paths | 11.x |
133-
| FP8-related paths | 12.x |
134-
| Hopper-specific features | 12.x |
129+
| Capability | Required CUDA |
130+
|------------|---------------|
131+
| Basic kernels and core build | 12.8 |
132+
| BF16-related paths | 12.8 |
133+
| FP8-related paths | 12.8 |
134+
| Hopper-specific features | 12.8 |
135135

136-
If you are on CUDA `10.x`, prefer the compatibility path and avoid assuming newer CUDA features are available.
136+
This repository now assumes the local CUDA `12.8` toolchain and no longer carries a CUDA 10.x compatibility path.
137137

138138
## GPU runtime path vs CI
139139

include/tensorcraft/core/features.hpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,8 @@
2323
#define TC_CPP17 1
2424
#elif __cplusplus >= 201703L
2525
#define TC_CPP17 1
26-
#elif defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ < 11) && (__cplusplus >= 201402L)
27-
#define TC_CUDA_LEGACY_CXX 1
2826
#else
29-
#error "TensorCraft requires C++17 or later (CUDA 10.x device code may use C++14)"
27+
#error "TensorCraft requires C++17 or later"
3028
#endif
3129

3230
// ============================================================================
@@ -49,10 +47,8 @@
4947
#elif __CUDACC_VER_MAJOR__ >= 11
5048
#define TC_CUDA_11 1
5149
#define TC_CUDA_10 1
52-
#elif __CUDACC_VER_MAJOR__ >= 10
53-
#define TC_CUDA_10 1
5450
#else
55-
#error "TensorCraft requires CUDA 10.1 or later"
51+
#error "TensorCraft requires CUDA 12.8 or later"
5652
#endif
5753

5854
// Feature availability based on CUDA version

include/tensorcraft/core/type_traits.hpp

Lines changed: 40 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ constexpr bool is_fp8_v = false;
8585
* @brief Type trait to check if T is any floating point type (including half)
8686
*/
8787
template<typename T>
88-
struct is_floating : std::integral_constant<bool,
89-
std::is_floating_point<T>::value || is_half_v<T> || is_fp8_v<T>
88+
struct is_floating : std::bool_constant<
89+
std::is_floating_point_v<T> || is_half_v<T> || is_fp8_v<T>
9090
> {};
9191

9292
template<typename T>
@@ -209,40 +209,36 @@ constexpr size_t type_bits_v = type_bits<T>::value;
209209
*/
210210
template<typename T>
211211
TC_HOST_DEVICE_INLINE float to_float(T val) {
212-
return static_cast<float>(val);
213-
}
214-
215-
template<>
216-
TC_HOST_DEVICE_INLINE float to_float<__half>(__half val) {
217-
return __half2float(val);
218-
}
219-
212+
if constexpr (std::is_same_v<T, __half>) {
213+
return __half2float(val);
214+
}
220215
#if defined(TC_HAS_BF16)
221-
template<>
222-
TC_HOST_DEVICE_INLINE float to_float<__nv_bfloat16>(__nv_bfloat16 val) {
223-
return __bfloat162float(val);
224-
}
216+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
217+
return __bfloat162float(val);
218+
}
225219
#endif
220+
else {
221+
return static_cast<float>(val);
222+
}
223+
}
226224

227225
/**
228226
* @brief Convert float to target type
229227
*/
230228
template<typename T>
231229
TC_HOST_DEVICE_INLINE T from_float(float val) {
232-
return static_cast<T>(val);
233-
}
234-
235-
template<>
236-
TC_HOST_DEVICE_INLINE __half from_float<__half>(float val) {
237-
return __float2half(val);
238-
}
239-
230+
if constexpr (std::is_same_v<T, __half>) {
231+
return __float2half(val);
232+
}
240233
#if defined(TC_HAS_BF16)
241-
template<>
242-
TC_HOST_DEVICE_INLINE __nv_bfloat16 from_float<__nv_bfloat16>(float val) {
243-
return __float2bfloat16(val);
244-
}
234+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
235+
return __float2bfloat16(val);
236+
}
245237
#endif
238+
else {
239+
return static_cast<T>(val);
240+
}
241+
}
246242

247243
// ============================================================================
248244
// Data Type Enumeration
@@ -266,45 +262,26 @@ enum class DataType {
266262
* @brief Get DataType enum from C++ type
267263
*/
268264
template<typename T>
269-
struct dtype_of {
270-
static constexpr DataType value = DataType::FP32;
271-
};
272-
273-
template<>
274-
struct dtype_of<float> {
275-
static constexpr DataType value = DataType::FP32;
276-
};
277-
278-
template<>
279-
struct dtype_of<__half> {
280-
static constexpr DataType value = DataType::FP16;
281-
};
282-
265+
constexpr DataType get_dtype() {
266+
if constexpr (std::is_same_v<T, float>) {
267+
return DataType::FP32;
268+
} else if constexpr (std::is_same_v<T, __half>) {
269+
return DataType::FP16;
270+
}
283271
#if defined(TC_HAS_BF16)
284-
template<>
285-
struct dtype_of<__nv_bfloat16> {
286-
static constexpr DataType value = DataType::BF16;
287-
};
272+
else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
273+
return DataType::BF16;
274+
}
288275
#endif
289-
290-
template<>
291-
struct dtype_of<int8_t> {
292-
static constexpr DataType value = DataType::INT8;
293-
};
294-
295-
template<>
296-
struct dtype_of<int32_t> {
297-
static constexpr DataType value = DataType::INT32;
298-
};
299-
300-
template<>
301-
struct dtype_of<int64_t> {
302-
static constexpr DataType value = DataType::INT64;
303-
};
304-
305-
template<typename T>
306-
constexpr DataType get_dtype() {
307-
return dtype_of<T>::value;
276+
else if constexpr (std::is_same_v<T, int8_t>) {
277+
return DataType::INT8;
278+
} else if constexpr (std::is_same_v<T, int32_t>) {
279+
return DataType::INT32;
280+
} else if constexpr (std::is_same_v<T, int64_t>) {
281+
return DataType::INT64;
282+
} else {
283+
return DataType::FP32;
284+
}
308285
}
309286

310287
/**

include/tensorcraft/memory/aligned_vector.hpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,15 +137,12 @@ TC_HOST_DEVICE_INLINE bool is_aligned(const T* ptr) {
137137
* Returns the largest vector size that fits in 16 bytes (LDS.128)
138138
*/
139139
template<typename T>
140-
struct optimal_vec_size_value : std::integral_constant<int,
141-
(sizeof(T) == 1 ? 8 :
142-
sizeof(T) == 2 ? 8 :
143-
sizeof(T) == 4 ? 4 :
144-
sizeof(T) == 8 ? 2 : 1)> {};
145-
146-
template<typename T>
147140
constexpr int optimal_vec_size() {
148-
return optimal_vec_size_value<T>::value;
141+
if constexpr (sizeof(T) == 1) return 8;
142+
else if constexpr (sizeof(T) == 2) return 8;
143+
else if constexpr (sizeof(T) == 4) return 4;
144+
else if constexpr (sizeof(T) == 8) return 2;
145+
else return 1;
149146
}
150147

151148
// ============================================================================

include/tensorcraft/memory/tensor.hpp

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,14 @@ class Tensor {
235235
*/
236236
void fill(T value) {
237237
if (size_ == 0 || !data_) return;
238-
fill_impl(value, std::integral_constant<bool, sizeof(T) == 1>{});
238+
if constexpr (sizeof(T) == 1) {
239+
TC_CUDA_CHECK(cudaMemset(data_, static_cast<int>(value), bytes()));
240+
} else {
241+
constexpr int block = 256;
242+
int grid = static_cast<int>((size_ + block - 1) / block);
243+
detail::fill_kernel<<<grid, block>>>(data_, value, size_);
244+
TC_CUDA_CHECK(cudaGetLastError());
245+
}
239246
}
240247

241248
/**
@@ -308,17 +315,6 @@ class Tensor {
308315
}
309316

310317
private:
311-
void fill_impl(T value, std::true_type) {
312-
TC_CUDA_CHECK(cudaMemset(data_, static_cast<int>(value), bytes()));
313-
}
314-
315-
void fill_impl(T value, std::false_type) {
316-
constexpr int block = 256;
317-
int grid = static_cast<int>((size_ + block - 1) / block);
318-
detail::fill_kernel<<<grid, block>>>(data_, value, size_);
319-
TC_CUDA_CHECK(cudaGetLastError());
320-
}
321-
322318
static size_type compute_size(const shape_type& shape) {
323319
if (shape.empty()) return 0;
324320
return std::accumulate(shape.begin(), shape.end(),

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ TC_ENABLE_CUDA = "ON"
3131
TC_BUILD_PYTHON = "ON"
3232
TC_BUILD_TESTS = "OFF"
3333
TC_BUILD_BENCHMARKS = "OFF"
34+
CMAKE_CUDA_COMPILER = "/usr/local/cuda/bin/nvcc"
35+
CUDAToolkit_ROOT = "/usr/local/cuda"
3436
CMAKE_CUDA_ARCHITECTURES = "75"
3537
CMAKE_CUDA_FLAGS = "-lineinfo -Xcompiler -O2"
3638
CMAKE_BUILD_TYPE = "RelWithDebInfo"

0 commit comments

Comments
 (0)