From 96547c9f4c286fb84f45929e2dc207ef598dc752 Mon Sep 17 00:00:00 2001 From: shudson Date: Tue, 7 Jan 2025 13:23:26 -0600 Subject: [PATCH 1/3] Fix order of args in trmm cuda --- polybench-cuda/trmm/trmm.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polybench-cuda/trmm/trmm.cu b/polybench-cuda/trmm/trmm.cu index df4a7f72..a478fd52 100644 --- a/polybench-cuda/trmm/trmm.cu +++ b/polybench-cuda/trmm/trmm.cu @@ -116,7 +116,7 @@ int main(int argc, char** argv) cudaMemcpy(dev_A, A, n*m*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_B, B, m*n*sizeof(double), cudaMemcpyHostToDevice); /* Run kernel. */ - kernel(n,m, alpha, dev_A, dev_B); + kernel(n,m, alpha, dev_B, dev_A); cudaMemcpy(B, dev_B, m*n*sizeof(double), cudaMemcpyDeviceToHost); /* Prevent dead-code elimination. All live-out data must be printed From 5bca35ae2cdf1c8e6c0fa07e9f57c1a0ad56c2a1 Mon Sep 17 00:00:00 2001 From: shudson Date: Tue, 7 Jan 2025 13:23:46 -0600 Subject: [PATCH 2/3] Fix B initialization in trmm cuda --- polybench-cuda/trmm/trmm.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/polybench-cuda/trmm/trmm.cu b/polybench-cuda/trmm/trmm.cu index a478fd52..84becf8b 100644 --- a/polybench-cuda/trmm/trmm.cu +++ b/polybench-cuda/trmm/trmm.cu @@ -22,6 +22,14 @@ void init_array(int n,int m, int i, j; *alpha = 32412; + + // Initialize B to zero as only diagonals are set below + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + B[i*n + j] = 0.0; + } + } + for (i = 0; i < n; i++) for (j = 0; j < m; j++) { A[i*m+j] = ((double) i*j) / m; From 6d33d1f0faada197be279fbc0976da714980c2ba Mon Sep 17 00:00:00 2001 From: shudson Date: Tue, 7 Jan 2025 13:25:40 -0600 Subject: [PATCH 3/3] Update generated trmm_cpu.c file --- polybench-cuda/trmm/trmm_cpu.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/polybench-cuda/trmm/trmm_cpu.c b/polybench-cuda/trmm/trmm_cpu.c index b94dea1e..ac76f3f6 100644 --- a/polybench-cuda/trmm/trmm_cpu.c +++ b/polybench-cuda/trmm/trmm_cpu.c @@ -153,7 +153,7 @@ int main(int argc, char ** argv) { B = malloc(m * n * 8); _ZL10init_arrayiiPdS_S_(n, m, (&alpha), ((double*)A), ((double*)B)); ; - _ZL6kerneliidPdS_(n, m, alpha, ((double*)A), ((double*)B)); + _ZL6kerneliidPdS_(n, m, alpha, ((double*)B), ((double*)A)); ; if (dump_code == 1) { _ZL11print_arrayiiPd(m, n, ((double*)B)); @@ -167,10 +167,17 @@ free(((uint8_t*)((double*)B))); void _ZL10init_arrayiiPdS_S_(uint32_t n, uint32_t m, double* alpha, double* A, double* B) { int64_t i; int64_t j; - uint32_t j_2e_0; + uint32_t j_2e_1; *alpha = 32412; +for(int64_t i = 0; i < m; i = i + 1){ + +for(int64_t j = 0; j < n; j = j + 1){ + B[(i * n + j)] = 0; +} +} + for(int64_t i = 0; i < n; i = i + 1){ for(int64_t j = 0; j < m; j = j + 1){