1 | 1 | #include "common.cuh" |
| 2 | +#include "ggml-cuda/vendors/cuda.h" |
| 3 | +#include <cublas_api.h> |
2 | 4 | #include "ggml.h" |
3 | 5 | #include "solve_tri.cuh" |
4 | | - |
5 | | -#define MAX_N_FAST 64 |
6 | | -#define MAX_K_FAST 32 |
7 | | - |
8 | | -// ====================== |
9 | | -// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction |
10 | | -// ====================== |
11 | | -// When ncols_template == 0 the bounds for the loops in this function are not |
12 | | -// known and can't be unrolled. As we want to keep pragma unroll for all other |
13 | | -// cases we supress the clang transformation warning here. |
14 | | -#ifdef __clang__ |
15 | | -# pragma clang diagnostic push |
16 | | -# pragma clang diagnostic ignored "-Wpass-failed" |
17 | | -#endif // __clang__ |
18 | | -template <int n_template, int k_template> |
19 | | -static __global__ void solve_tri_f32_fast(const float * __restrict__ A, |
20 | | - const float * __restrict__ B, |
21 | | - float * __restrict__ X, |
22 | | - const uint3 ne02, |
23 | | - const size_t nb02, |
24 | | - const size_t nb03, |
25 | | - const size_t nb12, |
26 | | - const size_t nb13, |
27 | | - const size_t nb2, |
28 | | - const size_t nb3, |
29 | | - const int n_arg, |
30 | | - const int k_arg) { |
31 | | - const int n = n_template == 0 ? n_arg : n_template; |
32 | | - const int k = k_template == 0 ? k_arg : k_template; |
33 | | - |
34 | | - const int batch_idx = blockIdx.x; |
35 | | - const int lane = threadIdx.x; |
36 | | - const int col_idx = threadIdx.y; |
37 | | - |
38 | | - if (col_idx >= k) { |
| 6 | +#include <cublas_v2.h> |
| 7 | +#include <cuda_runtime_api.h> |
| 8 | +#include <driver_types.h> |
| 9 | + |
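| | +// Fill one device pointer per (i2, i3) batch slice of A and X; cublasStrsmBatched |
| | +// takes arrays of per-batch pointers rather than fixed strides. |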
| 10 | +static __global__ void get_batch_pointers(const float * A, float * X, const float ** A_ptrs, float ** X_ptrs, |
| 11 | + int64_t ne02, int64_t total_batches, |
| 12 | + size_t s02, size_t s03, size_t s2, size_t s3) { |
| 13 | + const int idx = blockIdx.x * blockDim.x + threadIdx.x; |
| 14 | + if (idx >= total_batches) { |
39 | 15 | return; |
40 | 16 | } |
41 | 17 |
42 | | - const uint2 i02_i03 = fast_div_modulo(batch_idx, ne02); |
43 | | - const int64_t i02 = i02_i03.y; |
44 | | - const int64_t i03 = i02_i03.x; |
| 18 | + const int64_t i3 = idx / ne02; |
| 19 | + const int64_t i2 = idx % ne02; |
45 | 20 |
46 | | - const float * const A_batch = (const float *) (A + i02 * nb02 + i03 * nb03); |
47 | | - const float * const B_batch = (const float *) (B + i02 * nb12 + i03 * nb13); |
48 | | - float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3); |
49 | | - |
50 | | - __shared__ float sA[MAX_N_FAST * MAX_N_FAST]; |
51 | | - __shared__ float sXt[MAX_N_FAST * (MAX_K_FAST + 1)]; |
52 | | - |
53 | | - const int offset = threadIdx.x + threadIdx.y * blockDim.x; |
| 21 | + A_ptrs[idx] = A + i3 * s03 + i2 * s02; |
| 22 | + X_ptrs[idx] = X + i3 * s3 + i2 * s2; |
| 23 | +} |
54 | 24 |
55 | | -#pragma unroll |
56 | | - for (int i = 0; i < n * n; i += k * WARP_SIZE) { |
57 | | - int i0 = i + offset; |
58 | | - if (i0 < n * n) { |
59 | | - sA[i0] = A_batch[i0]; |
60 | | - } |
| 25 | +static void solve_tri_f32_cublas(ggml_backend_cuda_context &ctx, |
| 26 | + const float * A, |
| 27 | + const float * B, |
| 28 | + float * X, |
| 29 | + int n, |
| 30 | + int k, |
| 31 | + int64_t ne02, |
| 32 | + int64_t ne03, |
| 33 | + size_t s02, |
| 34 | + size_t s03, |
| 35 | + size_t s12, |
| 36 | + size_t s13, |
| 37 | + size_t s2, |
| 38 | + size_t s3, |
| 39 | + cudaStream_t stream) { |
| 40 | + const float alpha = 1.0f; |
| 41 | + const int64_t total_batches = ne02 * ne03; |
| 42 | + if (total_batches == 0) { |
| 43 | + return; |
61 | 44 | } |
62 | 45 |
63 | | - const int rows_per_warp = (n + WARP_SIZE - 1) / WARP_SIZE; |
64 | | - |
65 | | -#pragma unroll |
66 | | - for (int i = 0; i < rows_per_warp; i++) { |
67 | | - const int i0 = lane + i * WARP_SIZE; |
68 | | - if (i0 < n) { |
69 | | - sXt[col_idx * n + i0] = B_batch[i0 * k + col_idx]; |
70 | | - } |
| 46 | +    // cublasStrsmBatched solves in place, overwriting its B argument, so copy B into X first (tensors are contiguous) |
| 47 | + if (X != B) { |
| 48 | +        const int64_t total_elements_BX = (int64_t) n * k * total_batches; |
| 49 | + CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), |
| 50 | + cudaMemcpyDeviceToDevice, stream)); |
71 | 51 | } |
72 | 52 |
73 | | - __syncthreads(); |
74 | | - |
75 | | -#pragma unroll |
76 | | - for (int row = 0; row < n; ++row) { |
77 | | - float sum = 0.0f; |
| 53 | + int id = ggml_cuda_get_device(); |
78 | 54 |
79 | | - { |
80 | | - int j = lane; |
81 | | - if (j < row) { |
82 | | - sum += sA[row * n + j] * sXt[col_idx * n + j]; |
83 | | - } |
84 | | - } |
85 | | - if (row >= WARP_SIZE) { |
86 | | - int j = WARP_SIZE + lane; |
87 | | - if (j < row) { |
88 | | - sum += sA[row * n + j] * sXt[col_idx * n + j]; |
89 | | - } |
90 | | - } |
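| | +    // device-side pointer tables for the batched solve, allocated from the pool of the active device |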
| 55 | + ggml_cuda_pool_alloc<const float *> A_ptrs_alloc(ctx.pool(id), total_batches); |
| 56 | + ggml_cuda_pool_alloc<float *> X_ptrs_alloc(ctx.pool(id), total_batches); |
91 | 57 |
92 | | - sum = warp_reduce_sum(sum); |
| 58 | + const float ** A_ptrs_dev = A_ptrs_alloc.get(); |
| 59 | + float ** X_ptrs_dev = X_ptrs_alloc.get(); |
93 | 60 |
94 | | - if (lane == 0) { |
95 | | - const float b_val = sXt[col_idx * n + row]; |
96 | | - const float a_diag = sA[row * n + row]; |
97 | | - // no safeguards for division by zero because that indicates corrupt |
98 | | - // data anyway |
99 | | - sXt[col_idx * n + row] = (b_val - sum) / a_diag; |
100 | | - } |
101 | | - } |
| 61 | + get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>( |
| 62 | + A, X, A_ptrs_dev, X_ptrs_dev, ne02, total_batches, s02, s03, s2, s3); |
102 | 63 |
103 | | - __syncthreads(); |
| 64 | + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); |
104 | 65 |
105 | | -#pragma unroll |
106 | | - for (int i = 0; i < rows_per_warp; i++) { |
107 | | - const int i0 = lane + i * WARP_SIZE; |
108 | | - if (i0 < n) { |
109 | | - X_batch[i0 * k + col_idx] = sXt[col_idx * n + i0]; |
110 | | - } |
111 | | - } |
112 | | -} |
113 | | -#ifdef __clang__ |
114 | | -# pragma clang diagnostic pop |
115 | | -#endif // __clang__ |
| 66 | +    // Force full-precision FP32 math for the solve: with TF32 tensor-op math enabled the results drift enough to fail the RMSE checks |
| 67 | + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH)); |
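| | +    // ggml stores A (n×n, lower triangular) and X (n×k) row-major, which column-major cuBLAS |
| | +    // reads as A^T (upper triangular) and X^T. Solving A·X = B therefore becomes the |
| | +    // right-side solve X^T·A^T = B^T with m = k, n = n, lda = n, ldb = k. |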
| 68 | + CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), |
| 69 | + CUBLAS_SIDE_RIGHT, |
| 70 | + CUBLAS_FILL_MODE_UPPER, |
| 71 | + CUBLAS_OP_N, |
| 72 | + CUBLAS_DIAG_NON_UNIT, |
| 73 | + k, |
| 74 | + n, |
| 75 | + &alpha, |
| 76 | + A_ptrs_dev, n, |
| 77 | + X_ptrs_dev, k, |
| 78 | + total_batches)); |
116 | 79 |
117 | | -// ====================== |
118 | | -// General Kernel for larger matrices |
119 | | -// Uses a simpler approach with fixed tile size |
120 | | -// ====================== |
121 | | -#define GENERAL_TILE_SIZE 32 |
| 80 | +    // restore the TF32 tensor-op math mode that common.cuh configures as the default |
| 81 | + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH)); |
122 | 82 |
123 | | -template <int n_template, int k_template> |
124 | | -static __global__ void solve_tri_f32_general(const float * __restrict__ A, |
125 | | - const float * __restrict__ B, |
126 | | - float * __restrict__ X, |
127 | | - const uint3 ne02, |
128 | | - const size_t nb02, |
129 | | - const size_t nb03, |
130 | | - const size_t nb12, |
131 | | - const size_t nb13, |
132 | | - const size_t nb2, |
133 | | - const size_t nb3, |
134 | | - const int n_arg, |
135 | | - const int k_arg) { |
136 | | - const int n = n_template == 0 ? n_arg : n_template; |
137 | | - const int k = k_template == 0 ? k_arg : k_template; |
138 | | - |
139 | | - const int batch_idx = blockIdx.x; |
140 | | - const int col_idx = blockIdx.y; |
141 | | - const int tid = threadIdx.x; |
142 | | - |
143 | | - if (col_idx >= k) { |
144 | | - return; |
145 | | - } |
146 | | - const uint2 i02_i03 = fast_div_modulo(batch_idx, ne02); |
147 | | - const int64_t i02 = i02_i03.y; |
148 | | - const int64_t i03 = i02_i03.x; |
149 | | - |
150 | | - const float * const A_batch = (const float *) (A + i02 * nb02 + i03 * nb03); |
151 | | - const float * const B_batch = (const float *) (B + i02 * nb12 + i03 * nb13); |
152 | | - float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3); |
153 | | - |
154 | | - // Shared memory for current tile |
155 | | - __shared__ float sA[GENERAL_TILE_SIZE * GENERAL_TILE_SIZE]; |
156 | | - __shared__ float sB[GENERAL_TILE_SIZE]; |
157 | | - __shared__ float sX[GENERAL_TILE_SIZE]; |
158 | | - |
159 | | - // Process in tiles |
160 | | - for (int tile_start = 0; tile_start < n; tile_start += GENERAL_TILE_SIZE) { |
161 | | - int tile_end = min(tile_start + GENERAL_TILE_SIZE, n); |
162 | | - int tile_n = tile_end - tile_start; |
163 | | - // Load tile of A matrix |
164 | | - for (int i = tid; i < tile_n * tile_n; i += blockDim.x) { |
165 | | - int local_row = i / tile_n; |
166 | | - int local_col = i % tile_n; |
167 | | - int global_row = tile_start + local_row; |
168 | | - int global_col = tile_start + local_col; |
169 | | - if (global_col <= global_row) { |
170 | | - sA[local_row * GENERAL_TILE_SIZE + local_col] = A_batch[global_row * n + global_col]; |
171 | | - } else { |
172 | | - sA[local_row * GENERAL_TILE_SIZE + local_col] = 0.0f; |
173 | | - } |
174 | | - } |
175 | | - __syncthreads(); |
176 | | - // Load corresponding part of B and initialize X |
177 | | - if (tid < tile_n) { |
178 | | - sB[tid] = B_batch[(tile_start + tid) * k + col_idx]; |
179 | | - sX[tid] = sB[tid]; |
180 | | - } |
181 | | - __syncthreads(); |
182 | | - // Forward substitution for this tile |
183 | | - for (int row = 0; row < tile_n; ++row) { |
184 | | - if (tid == row) { |
185 | | - float sum = 0.0f; |
186 | | - // Sum contributions from previous rows in this tile |
187 | | - for (int j = 0; j < row; ++j) { |
188 | | - sum += sA[row * GENERAL_TILE_SIZE + j] * sX[j]; |
189 | | - } |
190 | | - // Sum contributions from previous tiles |
191 | | - if (tile_start > 0) { |
192 | | - int global_row = tile_start + row; |
193 | | - for (int j = 0; j < tile_start; ++j) { |
194 | | - sum += A_batch[global_row * n + j] * X_batch[j * k + col_idx]; |
195 | | - } |
196 | | - } |
197 | | - const float a_diag = sA[row * GENERAL_TILE_SIZE + row]; |
198 | | - sX[row] = (sB[row] - sum) / a_diag; |
199 | | - } |
200 | | - __syncthreads(); |
201 | | - } |
202 | | - // Store results back to global memory |
203 | | - if (tid < tile_n) { |
204 | | - int global_row = tile_start + tid; |
205 | | - X_batch[global_row * k + col_idx] = sX[tid]; |
206 | | - } |
207 | | - __syncthreads(); |
208 | | - } |
209 | | -} |
210 | | -static void solve_tri_f32_cuda(const float * A, |
211 | | - const float * B, |
212 | | - float * X, |
213 | | - int n, |
214 | | - int k, |
215 | | - int64_t ne02, |
216 | | - int64_t ne03, |
217 | | - size_t nb02, |
218 | | - size_t nb03, |
219 | | - size_t nb12, |
220 | | - size_t nb13, |
221 | | - size_t nb2, |
222 | | - size_t nb3, |
223 | | - cudaStream_t stream) { |
224 | | - const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02); |
225 | | - // Choose kernel based on matrix size |
226 | | - if (n <= MAX_N_FAST && k <= MAX_K_FAST) { |
227 | | - // Use fast kernel for small matrices |
228 | | - dim3 threads(WARP_SIZE, k); |
229 | | - dim3 grid(ne02 * ne03); |
230 | | - if (n == 64) { |
231 | | - switch (k) { |
232 | | - case 32: |
233 | | - solve_tri_f32_fast<64, 32> |
234 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
235 | | - break; |
236 | | - case 16: |
237 | | - solve_tri_f32_fast<64, 16> |
238 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
239 | | - break; |
240 | | - case 14: |
241 | | - solve_tri_f32_fast<64, 14> |
242 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
243 | | - break; |
244 | | - case 12: |
245 | | - solve_tri_f32_fast<64, 12> |
246 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
247 | | - break; |
248 | | - case 10: |
249 | | - solve_tri_f32_fast<64, 10> |
250 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
251 | | - break; |
252 | | - case 8: |
253 | | - solve_tri_f32_fast<64, 8> |
254 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
255 | | - break; |
256 | | - case 6: |
257 | | - solve_tri_f32_fast<64, 6> |
258 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
259 | | - break; |
260 | | - case 4: |
261 | | - solve_tri_f32_fast<64, 4> |
262 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
263 | | - break; |
264 | | - case 2: |
265 | | - solve_tri_f32_fast<64, 2> |
266 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
267 | | - break; |
268 | | - case 1: |
269 | | - solve_tri_f32_fast<64, 1> |
270 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0); |
271 | | - break; |
272 | | - default: |
273 | | - solve_tri_f32_fast<0, 0> |
274 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, n, k); |
275 | | - } |
276 | | - } else { // run general case |
277 | | - solve_tri_f32_fast<0, 0> |
278 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, n, k); |
279 | | - } |
280 | | - } else { |
281 | | - // Use general kernel for larger matrices |
282 | | - dim3 threads(256, 1); // 256 threads per block |
283 | | - dim3 grid(ne02 * ne03, k); // One block per column |
284 | | - solve_tri_f32_general<0, 0> |
285 | | - <<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, n, k); |
286 | | - } |
| 83 | +    GGML_UNUSED_VARS(s12, s13);  // B is copied wholesale above, so its per-batch strides are unused |
287 | 84 | } |
288 | 85 |
| 86 | + |
| 87 | +// ---------------------------------------------------------------------------- |
| 88 | +// Public entry point |
| 89 | +// ---------------------------------------------------------------------------- |
289 | 90 | void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { |
290 | | - const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix) |
291 | | - const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns) |
| 91 | + const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular) |
| 92 | + const ggml_tensor * src1 = dst->src[1]; // B (n×k) |
292 | 93 |
293 | 94 | ggml_is_contiguous(src0); |
294 | 95 | ggml_is_contiguous(src1); |
295 | 96 |
296 | 97 | const int64_t n = src0->ne[0]; |
297 | 98 | const int64_t k = src1->ne[0]; |
298 | | - |
299 | | - solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2], |
300 | | - src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), |
301 | | - src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), |
302 | | - dst->nb[3] / sizeof(float), ctx.stream()); |
| 99 | + const int64_t ne02 = src0->ne[2]; |
| 100 | + const int64_t ne03 = src0->ne[3]; |
| 101 | + |
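| | +    // ggml nb[] strides are in bytes; convert them to float-element strides for the helper |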
| 102 | + solve_tri_f32_cublas(ctx, |
| 103 | + (const float *) src0->data, |
| 104 | + (const float *) src1->data, |
| 105 | + (float *) dst->data, |
| 106 | + n, k, |
| 107 | + ne02, ne03, |
| 108 | + src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), |
| 109 | + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), |
| 110 | + dst->nb[2] / sizeof(float), dst->nb[3] / sizeof(float), |
| 111 | + ctx.stream()); |
303 | 112 | } |