// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0

#include "tensor_map_cccl.h"

#include <string.h>

#include <algorithm>
#include <exception>

#if defined(__has_include)
// Older CTK releases do not ship <cuda/tma>. When it is unavailable we keep
// the CCCL helper compiled out and fall back to the direct driver path.
#  if __has_include(<cuda/tma>)
#    include <cuda/tma>
#    define CUDA_CORE_HAS_CUDA_TMA 1
#  else
#    define CUDA_CORE_HAS_CUDA_TMA 0
#  endif
#  if __has_include("dlpack.h")
#    include "dlpack.h"
#    define CUDA_CORE_HAS_DLPACK_H 1
#  elif __has_include(<dlpack/dlpack.h>)
#    include <dlpack/dlpack.h>
#    define CUDA_CORE_HAS_DLPACK_H 1
#  else
#    define CUDA_CORE_HAS_DLPACK_H 0
#  endif
#else
#  define CUDA_CORE_HAS_CUDA_TMA 0
#  define CUDA_CORE_HAS_DLPACK_H 0
#endif

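// Copy a NUL-terminated message into the caller-provided error buffer,
// truncating to fit within cap bytes. A NULL message clears the buffer.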
static inline void cuda_core_write_err(char* err, size_t cap, const char* msg) noexcept
{
    if (!err || cap == 0)
        return;
    if (!msg)
    {
        err[0] = '\0';
        return;
    }
    size_t n = ::strlen(msg);
    if (n >= cap)
        n = cap - 1;
    ::memcpy(err, msg, n);
    err[n] = '\0';
}

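// Build a tiled CUtensorMap via CCCL's cuda::make_tma_descriptor and copy it
// into out_tensor_map. Returns 0 on success; otherwise returns 1 and writes a
// message into err (truncated to err_cap bytes). device_type/device_id and the
// dtype_* triple mirror the DLPack DLDevice and DLDataType fields; box_sizes
// and the optional elem_strides must each hold ndim entries; the remaining
// int arguments carry the numeric values of the matching cuda::tma_* enums.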
int cuda_core_cccl_make_tma_descriptor_tiled(
    void* out_tensor_map,
    void* data,
    int device_type,
    int device_id,
    int ndim,
    const int64_t* shape,
    const int64_t* strides,
    uint8_t dtype_code,
    uint8_t dtype_bits,
    uint16_t dtype_lanes,
    const int* box_sizes,
    const int* elem_strides,
    int interleave_layout,
    int swizzle,
    int l2_fetch_size,
    int oob_fill,
    char* err,
    size_t err_cap) noexcept
{
#if !(CUDA_CORE_HAS_CUDA_TMA && CUDA_CORE_HAS_DLPACK_H)
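    // Stub build: reference every parameter to silence unused-parameter warnings.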
    (void)out_tensor_map;
    (void)data;
    (void)device_type;
    (void)device_id;
    (void)ndim;
    (void)shape;
    (void)strides;
    (void)dtype_code;
    (void)dtype_bits;
    (void)dtype_lanes;
    (void)box_sizes;
    (void)elem_strides;
    (void)interleave_layout;
    (void)swizzle;
    (void)l2_fetch_size;
    (void)oob_fill;
    cuda_core_write_err(err, err_cap, "CCCL <cuda/tma> and/or <dlpack/dlpack.h> not available at build time");
    return 1;
#else
    try
    {
        if (!out_tensor_map)
        {
            cuda_core_write_err(err, err_cap, "out_tensor_map is NULL");
            return 1;
        }
        if (!data)
        {
            cuda_core_write_err(err, err_cap, "tensor data pointer is NULL");
            return 1;
        }
        if (!shape || !box_sizes || ndim <= 0)
        {
            cuda_core_write_err(err, err_cap, "invalid rank/shape/box_sizes");
            return 1;
        }

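        // Describe the tensor through a stack-allocated DLPack view; CCCL only
        // reads the metadata and takes no ownership. strides may be NULL,
        // which DLPack defines as compact row-major.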
        DLTensor t{};
        t.data = data;
        t.device = {static_cast<DLDeviceType>(device_type), device_id};
        t.ndim = ndim;
        t.dtype.code = dtype_code;
        t.dtype.bits = dtype_bits;
        t.dtype.lanes = dtype_lanes;
        // CCCL promises not to mutate the arrays, but DLPack uses non-const pointers.
        t.shape = const_cast<int64_t*>(shape);
        t.strides = const_cast<int64_t*>(strides);
        t.byte_offset = 0;

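        // The caller passes plain ints whose values match the cuda::tma_*
        // enumerators, so a static_cast recovers the enum types.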
        const auto layout = static_cast<cuda::tma_interleave_layout>(interleave_layout);
        const auto swz = static_cast<cuda::tma_swizzle>(swizzle);
        const auto l2 = static_cast<cuda::tma_l2_fetch_size>(l2_fetch_size);
        const auto oob = static_cast<cuda::tma_oob_fill>(oob_fill);

        auto box = cuda::std::span<const int>(box_sizes, static_cast<size_t>(ndim));

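        // cuda::make_tma_descriptor has overloads with and without explicit
        // per-dimension element strides; pick whichever matches the input.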
        CUtensorMap desc{};
        if (elem_strides)
        {
            auto es = cuda::std::span<const int>(elem_strides, static_cast<size_t>(ndim));
            desc = cuda::make_tma_descriptor(t, box, es, layout, swz, l2, oob);
        }
        else
        {
            desc = cuda::make_tma_descriptor(t, box, layout, swz, l2, oob);
        }

        ::memcpy(out_tensor_map, &desc, sizeof(CUtensorMap));
        cuda_core_write_err(err, err_cap, nullptr);
        return 0;
    }
    catch (const std::exception& e)
    {
        cuda_core_write_err(err, err_cap, e.what());
        return 1;
    }
    catch (...)
    {
        cuda_core_write_err(err, err_cap, "unknown error while building TMA descriptor");
        return 1;
    }
#endif
}
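
// Illustrative usage sketch (kept out of the build with #if 0): how a host
// caller might request a descriptor for a 2-D row-major float32 tensor copied
// in 64x64 boxes. The device/dtype codes are the standard DLPack values
// (kDLCUDA == 2, kDLFloat == 2); the zeros for layout/swizzle/L2/OOB are an
// assumption standing in for whatever defaults the real caller encodes.
#if 0
#include <cuda.h>

#include <stdio.h>

static int example(void* device_ptr)
{
    CUtensorMap tmap;
    char errbuf[256];

    const int64_t shape[2]   = {1024, 1024};  // elements per dimension
    const int64_t strides[2] = {1024, 1};     // element strides, row-major
    const int     box[2]     = {64, 64};      // tile fetched per TMA copy

    int rc = cuda_core_cccl_make_tma_descriptor_tiled(
        &tmap, device_ptr,
        /*device_type=*/2, /*device_id=*/0,  // kDLCUDA, GPU 0
        /*ndim=*/2, shape, strides,
        /*dtype_code=*/2, /*dtype_bits=*/32, /*dtype_lanes=*/1,  // kDLFloat
        box, /*elem_strides=*/nullptr,
        /*interleave_layout=*/0, /*swizzle=*/0,
        /*l2_fetch_size=*/0, /*oob_fill=*/0,
        errbuf, sizeof(errbuf));
    if (rc != 0)
        fprintf(stderr, "TMA descriptor creation failed: %s\n", errbuf);
    return rc;
}
#endif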