-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic_mm1.cpp
More file actions
69 lines (56 loc) · 1.71 KB
/
basic_mm1.cpp
File metadata and controls
69 lines (56 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <omp.h>
// Matrix dimensions used throughout this file: A is M x K, B is K x N and
// C is M x N (see the allocations in main and the indexing in multiplyMatrices).
#define M 2048
#define N 2048
#define K 2048
// Tile edge length: the step of the three outer blocking loops in multiplyMatrices.
#define BLOCK_SIZE 64
// Minimum of two values. This is a function-like macro, so the selected
// argument is evaluated twice; do not pass expressions with side effects.
#define fast_min(a, b) ((a) < (b) ? (a) : (b))
/**
 * Fills the first `size` elements of `mat`.
 *
 * @param mat  Destination buffer; must hold at least `size` floats.
 * @param size Number of elements to write. Nothing is written when size <= 0.
 * @param zero When true, every element is set to 0.0f. Otherwise each element
 *             receives a pseudo-random whole number in [0, 9] drawn from
 *             std::rand(), so the result depends on the current std::srand seed.
 */
void initializeMatrix(float* mat, int size, bool zero = false) {
    if (zero) {
        // Zero-filling touches no shared state, so the iterations can be
        // distributed across threads safely.
#pragma omp parallel for
        for (int i = 0; i < size; ++i)
            mat[i] = 0.0f;
        return;
    }

    // std::rand() updates hidden global state and is not guaranteed to be
    // thread-safe. Calling it from a parallel loop is a data race and makes
    // the sequence irreproducible for a given seed, so this fill stays serial.
    for (int i = 0; i < size; ++i)
        mat[i] = static_cast<float>(std::rand() % 10);
}
// Tiled matrix product: accumulates A * B into C.
//
// A is indexed as a row-major M x K matrix, B as K x N and C as M x N.
// Results are added to the existing contents of C (+=), so the caller must
// initialise C beforehand.
//
// The (row tile, column tile) pairs are distributed across OpenMP threads via
// collapse(2); each such pair writes only its own BLOCK_SIZE x BLOCK_SIZE tile
// of C. The innermost dot product over the depth tile is a SIMD reduction.
void multiplyMatrices(const float* A, const float* B, float* C) {
#pragma omp parallel for collapse(2)
    for (int rowTile = 0; rowTile < M; rowTile += BLOCK_SIZE) {
        for (int colTile = 0; colTile < N; colTile += BLOCK_SIZE) {
            // Clamp the tile to the matrix edge (matters only when a
            // dimension is not a multiple of BLOCK_SIZE).
            const int rowEnd = fast_min(rowTile + BLOCK_SIZE, M);
            const int colEnd = fast_min(colTile + BLOCK_SIZE, N);

            for (int depthTile = 0; depthTile < K; depthTile += BLOCK_SIZE) {
                const int depthEnd = fast_min(depthTile + BLOCK_SIZE, K);

                for (int row = rowTile; row < rowEnd; ++row) {
                    const float* aRow = A + row * K;
                    float* cRow = C + row * N;

                    for (int col = colTile; col < colEnd; ++col) {
                        float partial = 0.0f;
#pragma omp simd reduction(+:partial)
                        for (int depth = depthTile; depth < depthEnd; ++depth) {
                            partial += aRow[depth] * B[depth * N + col];
                        }
                        // Add this depth tile's contribution to the running total.
                        cRow[col] += partial;
                    }
                }
            }
        }
    }
}
// Releases the three matrix buffers with std::free, in the order A, B, C.
// Each pointer is handed to std::free as-is.
void cleanup(float* A, float* B, float* C) {
    float* const buffers[] = {A, B, C};
    const int bufferCount = static_cast<int>(sizeof(buffers) / sizeof(buffers[0]));
    for (int idx = 0; idx < bufferCount; ++idx) {
        std::free(buffers[idx]);
    }
}
// Program entry point: allocates the M x K, K x N and M x N buffers, fills A
// and B with random values and C with zeros, multiplies them, and releases
// the buffers.
//
// Returns 0 on success and -1 if any allocation fails.
int main() {
    // Seed std::rand() from the wall clock so each run gets different inputs.
    std::srand(static_cast<unsigned>(std::time(0)));

    float* A = static_cast<float*>(std::malloc(sizeof(float) * M * K));
    float* B = static_cast<float*>(std::malloc(sizeof(float) * K * N));
    float* C = static_cast<float*>(std::malloc(sizeof(float) * M * N));

    if (!A || !B || !C) {
        std::cerr << "Memory allocation failed!" << std::endl;
        // Some of the allocations may have succeeded; release them rather than
        // leaking. std::free accepts null pointers, so no per-pointer check is needed.
        cleanup(A, B, C);
        return -1;
    }

    initializeMatrix(A, M * K);
    initializeMatrix(B, K * N);
    // multiplyMatrices accumulates into C, so it has to start at zero.
    initializeMatrix(C, M * N, true);

    multiplyMatrices(A, B, C);
    std::cout << "Matrix multiplication completed.\n";

    cleanup(A, B, C);
    return 0;
}