Skip to content

Commit 9806ea1

Browse files
Merge pull request #52 from NumPower/fix/cublas_matmul
fix: Uses cuBLAS instead of the CUDA kernel to perform matmul
2 parents 913b059 + fc97f9f commit 9806ea1

3 files changed

Lines changed: 37 additions & 35 deletions

File tree

numpower.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ static void ndarray_destructor(zend_object* object) {
237237
NDArrayObject* my_object = (NDArrayObject*)object;
238238
if (GC_REFCOUNT(object) <= 1) {
239239
zval *obj_uuid = OBJ_PROP_NUM(object, 0);
240-
buffer_ndarray_free(Z_LVAL_P(obj_uuid));
240+
buffer_ndarray_free((int)Z_LVAL_P(obj_uuid));
241241
zend_object_std_dtor(object);
242242
}
243243
}
@@ -754,8 +754,8 @@ PHP_METHOD(NDArray, greater_equal) {
754754
NDArray *nda, *ndb, *rtn = NULL;
755755
zval *a, *b;
756756
ZEND_PARSE_PARAMETERS_START(2, 2)
757-
Z_PARAM_ZVAL(a)
758-
Z_PARAM_ZVAL(b)
757+
Z_PARAM_ZVAL(a)
758+
Z_PARAM_ZVAL(b)
759759
ZEND_PARSE_PARAMETERS_END();
760760
nda = ZVAL_TO_NDARRAY(a);
761761
ndb = ZVAL_TO_NDARRAY(b);
@@ -3528,7 +3528,9 @@ PHP_METHOD(NDArray, matmul) {
35283528
return;
35293529
}
35303530
rtn = NDArray_Matmul(nda, ndb);
3531-
3531+
if (rtn == NULL) {
3532+
return;
3533+
}
35323534
CHECK_INPUT_AND_FREE(a, nda);
35333535
CHECK_INPUT_AND_FREE(b, ndb);
35343536
RETURN_NDARRAY(rtn, return_value);

src/ndmath/cuda/cuda_math.cu

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -807,23 +807,6 @@ void array_sum_float(float *a, float *result, int n) {
807807
if (tid == 0) atomicAdd(result, sdata[0]);
808808
}
809809

810-
811-
812-
// CUDA Kernel for Matrix Multiplication for non-square matrices
813-
__global__ void
814-
matmul_float_kernel(float* A, float* B, float* C, int widthA, int heightA, int widthB) {
815-
int row = blockIdx.y * blockDim.y + threadIdx.y;
816-
int col = blockIdx.x * blockDim.x + threadIdx.x;
817-
818-
if (row < heightA && col < widthB) {
819-
float sum = 0;
820-
for(int i = 0; i < widthA; ++i) {
821-
sum += A[row * widthA + i] * B[i * widthB + col];
822-
}
823-
C[row * widthB + col] = sum;
824-
}
825-
}
826-
827810
__global__
828811
void fill_float_kernel(float* array, int n, float value) {
829812
int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -916,15 +899,6 @@ extern "C" {
916899
cudaDeviceSynchronize();
917900
}
918901

919-
void
920-
cuda_matmul_float(int nblocks, float *a, float *b, float *rtn, int widthA, int heightA, int widthB) {
921-
dim3 blockSize(32, 32);
922-
dim3 gridSize((widthB + blockSize.x - 1) / blockSize.x, (heightA + blockSize.y - 1) / blockSize.y);
923-
924-
matmul_float_kernel<<<gridSize, blockSize>>>(a, b, rtn, widthA, heightA, widthB);
925-
cudaDeviceSynchronize();
926-
}
927-
928902
void
929903
cuda_sum_float(int nblocks, float *a, float *rtn, int nelements) {
930904
float *d_sum;

src/ndmath/linalg.c

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,36 @@ NDArray_FMatmul(NDArray *a, NDArray *b) {
5252
if (NDArray_DEVICE(a) == NDARRAY_DEVICE_GPU) {
5353
// Perform GPU matrix multiplication
5454
#ifdef HAVE_CUBLAS
55-
NDArray* result_gpu = NDArray_ToGPU(result);
56-
NDArray_FREE(result);
57-
cuda_matmul_float(NDArray_NUMELEMENTS(a), NDArray_FDATA(a), NDArray_FDATA(b), NDArray_FDATA(result_gpu),
58-
NDArray_SHAPE(a)[1], NDArray_SHAPE(a)[0], NDArray_SHAPE(b)[1]);
59-
return result_gpu;
55+
cublasHandle_t handle;
56+
cublasCreate(&handle);
57+
58+
float* d_A;
59+
float* d_B;
60+
float* d_C;
61+
size_t size_A = NDArray_NUMELEMENTS(a) * sizeof(float);
62+
size_t size_B = NDArray_NUMELEMENTS(b) * sizeof(float);
63+
size_t size_C = NDArray_NUMELEMENTS(result) * sizeof(float);
64+
65+
cudaMalloc((void**)&d_A, size_A);
66+
cudaMalloc((void**)&d_B, size_B);
67+
cudaMalloc((void**)&d_C, size_C);
68+
69+
cudaMemcpy(d_A, NDArray_FDATA(a), size_A, cudaMemcpyHostToDevice);
70+
cudaMemcpy(d_B, NDArray_FDATA(b), size_B, cudaMemcpyHostToDevice);
71+
72+
int m = NDArray_SHAPE(a)[0];
73+
int n = NDArray_SHAPE(b)[1];
74+
int k = NDArray_SHAPE(a)[1];
75+
float alpha = 1.0f;
76+
float beta = 0.0f;
77+
78+
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B, n, d_A, k, &beta, d_C, n);
79+
cudaMemcpy(NDArray_FDATA(result), d_C, size_C, cudaMemcpyDeviceToHost);
80+
81+
cudaFree(d_A);
82+
cudaFree(d_B);
83+
cudaFree(d_C);
84+
cublasDestroy(handle);
6085
#endif
6186
} else {
6287
// Perform CPU matrix multiplication
@@ -222,6 +247,7 @@ NDArray_Matmul(NDArray *a, NDArray *b) {
222247

223248
if (NDArray_SHAPE(a)[NDArray_NDIM(a) - 1] != NDArray_SHAPE(b)[NDArray_NDIM(b) - 2]) {
224249
zend_throw_error(NULL, "Shape mismatch for matmul. cols(a) != rows(b)");
250+
return NULL;
225251
}
226252

227253
if (NDArray_NDIM(a) > 2 && NDArray_NDIM(b) > 2) {

0 commit comments

Comments (0)