// Step2Part2.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h> // For rand() and srand()
#include <time.h> // For time()
#define NUM_SIZES 5
#define WIDTH {100, 250, 500, 1000, 1500} // Matrix sizes to experiment with
#define REPETITIONS 10 // Number of repetitions for timing measurements
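// A minimal error-checking helper -- a sketch, not part of the original
// experiment. CUDA_CHECK is an illustrative name; it wraps any call that
// returns cudaError_t and aborts with a readable message on failure.
// Usage (illustrative): CUDA_CHECK(cudaMalloc((void**)&d_M, size));
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)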
__global__ void MatrixMulKernel(float* M, float* N, float* P, int Width)
{
    // Calculate the row index of the P element (and of M)
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    // Calculate the column index of the P element (and of N)
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if (Row < Width && Col < Width)
    {
        float Pvalue = 0;
        // Each thread computes one element of P
        for (int k = 0; k < Width; ++k)
            Pvalue += M[Row * Width + k] * N[k * Width + Col];
        P[Row * Width + Col] = Pvalue;
    }
}
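// For comparison only -- a sketch of a shared-memory tiled variant, not part
// of the timed experiment below. TILE_WIDTH and MatrixMulTiledKernel are
// illustrative names; the kernel assumes it is launched with
// TILE_WIDTH x TILE_WIDTH thread blocks. Each block stages tiles of M and N
// into shared memory so that every loaded element is reused TILE_WIDTH times.
#define TILE_WIDTH 16
__global__ void MatrixMulTiledKernel(float* M, float* N, float* P, int Width)
{
    __shared__ float Ms[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Ns[TILE_WIDTH][TILE_WIDTH];
    int Row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int Col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    float Pvalue = 0;
    // Walk over all tiles of M and N needed for this P element
    for (int t = 0; t < (Width + TILE_WIDTH - 1) / TILE_WIDTH; ++t)
    {
        // Stage one tile of M and one tile of N into shared memory,
        // guarding against out-of-range reads at the matrix edges
        int mCol = t * TILE_WIDTH + threadIdx.x;
        int nRow = t * TILE_WIDTH + threadIdx.y;
        Ms[threadIdx.y][threadIdx.x] =
            (Row < Width && mCol < Width) ? M[Row * Width + mCol] : 0.0f;
        Ns[threadIdx.y][threadIdx.x] =
            (nRow < Width && Col < Width) ? N[nRow * Width + Col] : 0.0f;
        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += Ms[threadIdx.y][k] * Ns[k][threadIdx.x];
        __syncthreads();
    }
    if (Row < Width && Col < Width)
        P[Row * Width + Col] = Pvalue;
}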
// CPU matrix multiply implementation
void MatrixMulCPU(float* M, float* N, float* P, int Width)
{
    for (int i = 0; i < Width; ++i)
    {
        for (int j = 0; j < Width; ++j)
        {
            float sum = 0;
            for (int k = 0; k < Width; ++k)
            {
                sum += M[i * Width + k] * N[k * Width + j];
            }
            P[i * Width + j] = sum;
        }
    }
}
// Helper function to calculate average execution time
float calculateAverageExecutionTime(float* execution_times, int num_repetitions) {
    float sum = 0;
    for (int i = 0; i < num_repetitions; ++i) {
        sum += execution_times[i];
    }
    return sum / num_repetitions;
}
int main()
{
    // Matrix sizes to experiment with (the WIDTH list defined above)
    int sizes[NUM_SIZES] = WIDTH;
    int num_sizes = NUM_SIZES;
    // Arrays to store execution times
    float execution_times_cpu[NUM_SIZES][REPETITIONS];
    float execution_times_gpu[NUM_SIZES][REPETITIONS];
    float transfer_times_gpu[NUM_SIZES][REPETITIONS];
    // Seed the random number generator once, before the experiments
    srand((unsigned)time(NULL));
    // Repeat for each matrix size
    for (int i = 0; i < num_sizes; ++i)
    {
        int width = sizes[i];
        size_t size = (size_t)width * width * sizeof(float);
        // Allocate host memory for matrices M, N, P
        float* h_M = (float*)malloc(size);
        float* h_N = (float*)malloc(size);
        float* h_P = (float*)malloc(size);
        // Initialize matrices M and N with random values in [0, 1]
        for (int j = 0; j < width * width; ++j) {
            h_M[j] = static_cast<float>(rand()) / RAND_MAX;
            h_N[j] = static_cast<float>(rand()) / RAND_MAX;
        }
        // Allocate device memory for matrices M, N, P
        float* d_M, * d_N, * d_P;
        cudaMalloc((void**)&d_M, size);
        cudaMalloc((void**)&d_N, size);
        cudaMalloc((void**)&d_P, size);
        // Create events for timing
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        // Repeat for multiple measurements
        for (int k = 0; k < REPETITIONS; ++k) {
            // Transfer matrices M, N from host to device.
            // cudaMemcpy is synchronous, so wall-clock timing is valid here.
            clock_t transfer_start = clock();
            cudaMemcpy(d_M, h_M, size, cudaMemcpyHostToDevice);
            cudaMemcpy(d_N, h_N, size, cudaMemcpyHostToDevice);
            clock_t transfer_end = clock();
            transfer_times_gpu[i][k] = (float)(transfer_end - transfer_start) / CLOCKS_PER_SEC * 1000; // Convert to milliseconds
            // Measure CPU matrix multiplication time
            clock_t cpu_start = clock();
            MatrixMulCPU(h_M, h_N, h_P, width);
            clock_t cpu_end = clock();
            execution_times_cpu[i][k] = (float)(cpu_end - cpu_start) / CLOCKS_PER_SEC * 1000; // Convert to milliseconds
            // Measure GPU matrix multiplication time.
            // Launch a 2D grid of 16x16 blocks: a single block cannot hold
            // width x width threads (CUDA caps a block at 1024 threads).
            dim3 block(16, 16);
            dim3 grid((width + block.x - 1) / block.x, (width + block.y - 1) / block.y);
            cudaEventRecord(start, 0);
            MatrixMulKernel<<<grid, block>>>(d_M, d_N, d_P, width);
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            float gpu_execution_time;
            cudaEventElapsedTime(&gpu_execution_time, start, stop);
            execution_times_gpu[i][k] = gpu_execution_time;
        }
        // Destroy timing events and free device memory
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(d_M);
        cudaFree(d_N);
        cudaFree(d_P);
        // Free host memory
        free(h_M);
        free(h_N);
        free(h_P);
    }
    FILE* fp = fopen("execution_times.csv", "w");
    if (fp == NULL) {
        printf("Error opening file.\n");
        return 1;
    }
    // Write header
    fprintf(fp, "Matrix Size,CPU execution time(ms),GPU execution time(ms),GPU data transfer time(ms)\n");
    // Write data
    for (int i = 0; i < num_sizes; ++i) {
        for (int j = 0; j < REPETITIONS; ++j) {
            fprintf(fp, "%d,%.2f,%.2f,%.2f\n", sizes[i], execution_times_cpu[i][j], execution_times_gpu[i][j], transfer_times_gpu[i][j]);
        }
    }
    fclose(fp);
    printf("Execution times saved to execution_times.csv\n");
    // Calculate average execution times
    float average_times_cpu[NUM_SIZES];
    float average_times_gpu[NUM_SIZES];
    float average_transfer_times_gpu[NUM_SIZES];
    for (int i = 0; i < num_sizes; ++i) {
        average_times_cpu[i] = calculateAverageExecutionTime(execution_times_cpu[i], REPETITIONS);
        average_times_gpu[i] = calculateAverageExecutionTime(execution_times_gpu[i], REPETITIONS);
        average_transfer_times_gpu[i] = calculateAverageExecutionTime(transfer_times_gpu[i], REPETITIONS);
    }
    // Print average execution times
    printf("Matrix Size\tCPU Time (ms)\tGPU Time (ms)\tGPU Transfer Time (ms)\n");
    for (int i = 0; i < num_sizes; ++i)
    {
        printf("%d\t\t%.2f\t\t%.2f\t\t%.2f\n", sizes[i], average_times_cpu[i], average_times_gpu[i], average_transfer_times_gpu[i]);
    }
    return 0;
}
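// Build and run (illustrative; exact flags depend on your toolchain):
//   nvcc -O2 -o step2part2 Step2Part2.cu
//   ./step2part2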