-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathStep2Part3.cu
More file actions
162 lines (132 loc) · 5.48 KB
/
Step2Part3.cu
File metadata and controls
162 lines (132 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h> // For rand() and srand()
#include <time.h> // For time()
#include <algorithm> // For std::sort
#include <fstream>
#define NUM_SIZES 5
#define WIDTH {100, 250, 500, 1000, 1500} // Matrix sizes to experiment with
#define NUM_BLOCK_WIDTHS 5
#define BLOCK_WIDTHS {2, 5, 10, 25, 32} // Different block widths to experiment with
#define REPETITIONS 1 // Number of repetitions for timing measurements
cudaError_t multWithCuda(float* P, float* M, float* N, int Width, dim3 threadsPerBlock);
__global__ void MatrixMulKernel(float* M, float* N, float* P, int Width)
{
// Calculate the row index of the P element and M
int Row = blockIdx.y * blockDim.y + threadIdx.y;
// Calculate the column index of P element and N
int Col = blockIdx.x * blockDim.x + threadIdx.x;
if (Row < Width && Col < Width)
{
float Pvalue = 0;
// each thread computes one element of the block sub-matrix
for (int k = 0; k < Width; ++k)
Pvalue += M[Row * Width + k] * N[k * Width + Col];
P[Row * Width + Col] = Pvalue;
}
}
// Helper function to calculate average execution time and remove outliers
float calculateAverageExecutionTime(float* execution_times, int num_repetitions) {
std::sort(execution_times, execution_times + num_repetitions);
float sum = 0;
for (int i = 10; i < num_repetitions - 10; ++i) { // Exclude first 10 and last 10 measurements to remove outliers
sum += execution_times[i];
}
return sum / (num_repetitions - 20);
}
int main()
{
// Define matrix sizes and block widths to experiment with
int sizes[NUM_SIZES] = WIDTH;
int block_widths[NUM_BLOCK_WIDTHS] = BLOCK_WIDTHS;
int num_sizes = NUM_SIZES;
int num_block_widths = NUM_BLOCK_WIDTHS;
// Open CSV file for writing results
std::ofstream csv_file("matrix_multiplication_results.csv");
if (!csv_file.is_open()) {
printf("Error opening file.\n");
return 1;
}
// Write CSV header
csv_file << "Matrix Size,Block Width,GPU Execution Time (ms),NumBlocks/BlockWidth\n";
// Repeat for each block width
for (int b = 0; b < num_block_widths; ++b) {
int block_width = block_widths[b];
dim3 threadsPerBlock(block_width, block_width);
// Repeat for each matrix size
for (int i = 0; i < num_sizes; ++i)
{
int width = sizes[i];
int size = width * width * sizeof(float);
// Allocate memory for matrices M, N, P
float* h_M, * h_N, * h_P;
h_M = (float*)malloc(size);
h_N = (float*)malloc(size);
h_P = (float*)malloc(size);
// Initialize matrices M, N with random values
srand(time(NULL));
for (int j = 0; j < width * width; ++j) {
h_M[j] = static_cast<float>(rand()) / RAND_MAX;
h_N[j] = static_cast<float>(rand()) / RAND_MAX;
}
// Allocate memory on the device
float* d_M, * d_N, * d_P;
cudaMalloc((void**)&d_M, size);
cudaMalloc((void**)&d_N, size);
cudaMalloc((void**)&d_P, size);
// Create events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record start event
cudaEventRecord(start, 0);
// Invoke kernel
multWithCuda(d_P, d_M, d_N, width, threadsPerBlock);
// Record stop event
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
// Calculate elapsed time
float elapsed_gpu;
cudaEventElapsedTime(&elapsed_gpu, start, stop);
// Calculate number of blocks per block width
int num_blocks_per_block_width = (width + block_width - 1) / block_width;
// Print results to terminal
printf("Block Width: %d,\t\t Matrix Size: %d,\t Kernel Execution Time: %.2f ms,\t NumBlocks/BlockWidth: %.2f\n",
block_width, width, elapsed_gpu, static_cast<float>(num_blocks_per_block_width));
// Write results to CSV file
csv_file << sizes[i] << "," << block_width << "," << elapsed_gpu << "," << num_blocks_per_block_width << "\n";
// Free device memory
cudaFree(d_M);
cudaFree(d_N);
cudaFree(d_P);
// Free host memory
free(h_M);
free(h_N);
free(h_P);
}
}
// Close CSV file
csv_file.close();
printf("Matrix multiplication results saved to matrix_multiplication_results.csv\n");
return 0;
}
// Helper function for using CUDA to perform matrix multiplication in parallel.
cudaError_t multWithCuda(float* P, float* M, float* N, int Width, dim3 threadsPerBlock)
{
// Invoke kernel
dim3 numBlocks(Width / threadsPerBlock.x, Width / threadsPerBlock.y);
MatrixMulKernel << <numBlocks, threadsPerBlock >> > (M, N, P, Width);
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Kernel execution error: %s\n", cudaGetErrorString(cudaStatus));
return cudaStatus;
}
// Wait for GPU to finish
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
return cudaStatus;
}
return cudaStatus;
}