@@ -52,11 +52,36 @@ NDArray_FMatmul(NDArray *a, NDArray *b) {
5252 if (NDArray_DEVICE (a ) == NDARRAY_DEVICE_GPU ) {
5353 // Perform GPU matrix multiplication
5454#ifdef HAVE_CUBLAS
55- NDArray * result_gpu = NDArray_ToGPU (result );
56- NDArray_FREE (result );
57- cuda_matmul_float (NDArray_NUMELEMENTS (a ), NDArray_FDATA (a ), NDArray_FDATA (b ), NDArray_FDATA (result_gpu ),
58- NDArray_SHAPE (a )[1 ], NDArray_SHAPE (a )[0 ], NDArray_SHAPE (b )[1 ]);
59- return result_gpu ;
55+ cublasHandle_t handle ;
56+ cublasCreate (& handle );
57+
58+ float * d_A ;
59+ float * d_B ;
60+ float * d_C ;
61+ size_t size_A = NDArray_NUMELEMENTS (a ) * sizeof (float );
62+ size_t size_B = NDArray_NUMELEMENTS (b ) * sizeof (float );
63+ size_t size_C = NDArray_NUMELEMENTS (result ) * sizeof (float );
64+
65+ cudaMalloc ((void * * )& d_A , size_A );
66+ cudaMalloc ((void * * )& d_B , size_B );
67+ cudaMalloc ((void * * )& d_C , size_C );
68+
69+ cudaMemcpy (d_A , NDArray_FDATA (a ), size_A , cudaMemcpyHostToDevice );
70+ cudaMemcpy (d_B , NDArray_FDATA (b ), size_B , cudaMemcpyHostToDevice );
71+
72+ int m = NDArray_SHAPE (a )[0 ];
73+ int n = NDArray_SHAPE (b )[1 ];
74+ int k = NDArray_SHAPE (a )[1 ];
75+ float alpha = 1.0f ;
76+ float beta = 0.0f ;
77+
78+ cublasSgemm (handle , CUBLAS_OP_N , CUBLAS_OP_N , n , m , k , & alpha , d_B , n , d_A , k , & beta , d_C , n );
79+ cudaMemcpy (NDArray_FDATA (result ), d_C , size_C , cudaMemcpyDeviceToHost );
80+
81+ cudaFree (d_A );
82+ cudaFree (d_B );
83+ cudaFree (d_C );
84+ cublasDestroy (handle );
6085#endif
6186 } else {
6287 // Perform CPU matrix multiplication
@@ -222,6 +247,7 @@ NDArray_Matmul(NDArray *a, NDArray *b) {
222247
223248 if (NDArray_SHAPE (a )[NDArray_NDIM (a ) - 1 ] != NDArray_SHAPE (b )[NDArray_NDIM (b ) - 2 ]) {
224249 zend_throw_error (NULL , "Shape mismatch for matmul. cols(a) != rows(b)" );
250+ return NULL ;
225251 }
226252
227253 if (NDArray_NDIM (a ) > 2 && NDArray_NDIM (b ) > 2 ) {