diff --git a/polybench-cuda/trmm/trmm.cu b/polybench-cuda/trmm/trmm.cu index df4a7f72..84becf8b 100644 --- a/polybench-cuda/trmm/trmm.cu +++ b/polybench-cuda/trmm/trmm.cu @@ -22,6 +22,14 @@ void init_array(int n,int m, int i, j; *alpha = 32412; + + // Initialize B to zero as only diagonals are set below + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + B[i*n + j] = 0.0; + } + } + for (i = 0; i < n; i++) for (j = 0; j < m; j++) { A[i*m+j] = ((double) i*j) / m; @@ -116,7 +124,7 @@ int main(int argc, char** argv) cudaMemcpy(dev_A, A, n*m*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(dev_B, B, m*n*sizeof(double), cudaMemcpyHostToDevice); /* Run kernel. */ - kernel(n,m, alpha, dev_A, dev_B); + kernel(n,m, alpha, dev_B, dev_A); cudaMemcpy(B, dev_B, m*n*sizeof(double), cudaMemcpyDeviceToHost); /* Prevent dead-code elimination. All live-out data must be printed diff --git a/polybench-cuda/trmm/trmm_cpu.c b/polybench-cuda/trmm/trmm_cpu.c index b94dea1e..ac76f3f6 100644 --- a/polybench-cuda/trmm/trmm_cpu.c +++ b/polybench-cuda/trmm/trmm_cpu.c @@ -153,7 +153,7 @@ int main(int argc, char ** argv) { B = malloc(m * n * 8); _ZL10init_arrayiiPdS_S_(n, m, (&alpha), ((double*)A), ((double*)B)); ; - _ZL6kerneliidPdS_(n, m, alpha, ((double*)A), ((double*)B)); + _ZL6kerneliidPdS_(n, m, alpha, ((double*)B), ((double*)A)); ; if (dump_code == 1) { _ZL11print_arrayiiPd(m, n, ((double*)B)); @@ -167,10 +167,17 @@ free(((uint8_t*)((double*)B))); void _ZL10init_arrayiiPdS_S_(uint32_t n, uint32_t m, double* alpha, double* A, double* B) { int64_t i; int64_t j; - uint32_t j_2e_0; + uint32_t j_2e_1; *alpha = 32412; +for(int64_t i = 0; i < m; i = i + 1){ + +for(int64_t j = 0; j < n; j = j + 1){ + B[(i * n + j)] = 0; +} +} + for(int64_t i = 0; i < n; i = i + 1){ for(int64_t j = 0; j < m; j = j + 1){