// Libraries
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <mpi.h>
#include <omp.h>
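// Overview: hybrid MPI + OpenMP (target offload) Jacobi iteration on a (dimension + 2) x (dimension + 2) grid.
// The rows are block-distributed across MPI ranks; each rank keeps two ghost rows that are refreshed every
// iteration through MPI RMA (MPI_Win_create / MPI_Get) on device buffers, while the stencil update itself runs
// on the GPU through "omp target teams distribute parallel for". The final grid is written collectively to
// solution.bin and the timings are appended to times.csv.
//
// A possible build and run line (an assumption, not part of the source: any MPI wrapper around an
// OpenMP-offload-capable compiler should work, and the offload flags vary by toolchain):
//   mpicc -O3 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda jacobi.c -o jacobi
//   mpirun -np 4 ./jacobi 1200 1000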
int main(int argc, char* argv[]){
// Initialize MPI
int mpi_provided_thread_level;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &mpi_provided_thread_level);
if (mpi_provided_thread_level < MPI_THREAD_FUNNELED)
{
printf("A problem arised when asking for MPI_THREAD_FUNNELED level\n");
MPI_Finalize();
exit(1);
}
// Get the rank of the process and the number of processes
int rank, size;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
// Get the number of OpenMP threads
#pragma omp parallel
{
int num_threads = omp_get_num_threads();
#pragma omp master
{
printf("Number of OpenMP threads: %d\n", num_threads);
}
}
// Get the number of devices and bind this rank to one of them (round-robin over the available GPUs)
const int ngpus = omp_get_num_devices();
printf("Number of devices: %d\n", ngpus);
if (ngpus > 0) {
const int device_id = rank % ngpus;
omp_set_default_device(device_id);
}
// Time measurement variables
double init_time = 0.0;
double compute_time = 0.0;
double communication_time = 0.0;
double copy_time = 0.0;
// Dimensions and iterations parameters
size_t dimension = 0;
size_t iterations = 0;
// Check on input parameters
if(argc != 3) {
if (rank == 0) {
fprintf(stderr, "\nWrong number of arguments. Usage: %s dimension iterations\n", argv[0]);
}
MPI_Finalize();
return 1;
}
// Read in the parameters (parsed as signed values so negative input is rejected instead of wrapping)
const long dim_arg = atol(argv[1]);
const long it_arg = atol(argv[2]);
// Check if the parameters are valid
if (dim_arg <= 0 || it_arg <= 0) {
if (rank == 0) {
fprintf(stderr, "\nInvalid parameters. Parameters must be positive integers.\n");
}
MPI_Finalize();
return 1;
}
dimension = (size_t)dim_arg;
iterations = (size_t)it_arg;
// Print the parameters
if (rank == 0) {
printf("Matrix size = %zu\n", dimension);
printf("Number of iterations = %zu\n", iterations);
}
// Get the local number of rows to be computed by each process
// (Each process will compute a submatrix of the original matrix; the rows are distributed evenly among the processes:
// the first processes will get one more row than the others if the number of rows is not divisible by the number of processes)
const size_t rows_per_process = dimension / size;
const size_t extra_rows = dimension % size;
const size_t local_rows = (rank < (int)extra_rows) ? rows_per_process + 1 : rows_per_process;
// Compute the offset (needed to access the correct rows in the global matrix)
// The offset is the number of extra rows (if any) that the previous processes have taken
const int offset = extra_rows * (rank >= (int)extra_rows);
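// Worked example of the distribution: dimension = 10 and size = 4 give rows_per_process = 2 and extra_rows = 2,
// so ranks 0 and 1 own 3 rows while ranks 2 and 3 own 2 rows; the resulting global starting rows
// (local_rows * rank + offset, computed below as start_row) are 0, 3, 6 and 8.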
// Total number of rows for each process (including the ghost rows)
const size_t total_rows = local_rows + 2;
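// Local layout: total_rows x (dimension + 2) doubles stored row-major. Row 0 and row local_rows + 1 are the
// ghost rows (or the physical top/bottom boundary on the first/last rank), and columns 0 and dimension + 1
// hold the left/right boundary values.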
// Allocate memory for the matrices
const MPI_Aint byte_dimension = sizeof(double) * (total_rows) * (dimension + 2);
double *matrix;
double *matrix_new;
MPI_Alloc_mem(byte_dimension, MPI_INFO_NULL, &matrix);
MPI_Alloc_mem(byte_dimension, MPI_INFO_NULL, &matrix_new);
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Start the timer for initialization
init_time = MPI_Wtime();
// Initialize the matrices
memset(matrix, 0, byte_dimension);
memset(matrix_new, 0, byte_dimension);
// Fill the interior of the matrix with the initial guess 0.5
// (The initialization is parallelized with OpenMP to speed it up)
#pragma omp parallel for collapse(2)
for (size_t i = 1; i <= local_rows; ++i)
{
for (size_t j = 1; j <= dimension; ++j)
{
matrix[(i * (dimension + 2)) + j] = 0.5;
}
}
// Set up the boundary conditions
// (Boundary conditions are set for the first column and the last row of the global matrix)
// (Initialization is done in parallel to speed up the process using OpenMP)
const double increment = 100.0 / ( dimension + 1 );
// Global row index of this process's top ghost row (local row i corresponds to global row start_row + i)
const size_t start_row = local_rows * rank + offset;
// Last process will have to initialize the last row of the global matrix
// All processes contribute to the initialization of the first column of the global matrix
if (rank == size - 1) // Last process
{
// Vertical border
#pragma omp parallel for
for (size_t i = 1; i <= local_rows + 1; ++i)
{
matrix[i * (dimension + 2)] = (start_row + i) * increment;
matrix_new[i * (dimension + 2)] = (start_row + i) * increment;
}
// Horizontal border
#pragma omp parallel for
for (size_t j = 1; j <= dimension + 1; ++j)
{
matrix[((local_rows + 1) * (dimension + 2)) + (dimension + 1 - j)] = j * increment;
matrix_new[((local_rows + 1) * (dimension + 2)) + (dimension + 1 - j)] = j * increment;
}
}
else // All other processes
{
// Vertical border
#pragma omp parallel for
for (size_t i = 1; i <= local_rows; ++i)
{
matrix[i * (dimension + 2)] = (start_row + i) * increment;
matrix_new[i * (dimension + 2)] = (start_row + i) * increment;
}
}
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Stop the timer for initialization
init_time = MPI_Wtime() - init_time;
// Print the initialization time
if (rank == 0) {
printf("Initialization time: %f seconds\n", init_time);
}
// Define the "neighbors" for each process:
// each process will need to "communicate" with its neighbors (the previous and next processes) to exchange the rows
// needed for the computation of the new values. Remote memory access (RMA) will be used for this purpose.
// The first and last processes will have MPI_PROC_NULL as their neighbor above and below respectively
int proc_above = (rank == 0) ? MPI_PROC_NULL : rank - 1;
int proc_below = (rank == size - 1) ? MPI_PROC_NULL : rank + 1;
// Create the two memory windows (remote memory access):
// one for the first row and one for the last row of the local matrix
// (Processes will use these windows to access the rows of their neighbors)
MPI_Win win_top;
MPI_Win win_bottom;
const size_t total_size = total_rows * (dimension + 2); // Total number of elements mapped to the device in the OpenMP target region
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
copy_time = MPI_Wtime(); // Start the timer for the copy time
#pragma omp target data map(tofrom: matrix[0:total_size]) map(to: matrix_new[0:total_size])
{
// Synchronize the OpenMP threads
#pragma omp barrier
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Stop the timer for the copy time
copy_time = MPI_Wtime() - copy_time;
if (rank == 0) {
printf("Copy time: %f seconds\n", copy_time);
}
// Use device pointer for MPI_Win_create
#pragma omp target data use_device_ptr(matrix)
{
// Create the memory windows for the first and last rows of the local matrix
MPI_Win_create(&matrix[dimension + 2], (dimension + 2) * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win_top);
MPI_Win_create(&matrix[local_rows * (dimension + 2)], (dimension + 2) * sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win_bottom);
}
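// win_top exposes the first interior row (row 1) and win_bottom the last interior row (row local_rows),
// so each neighbor can copy them directly into its own ghost rows with MPI_Get.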
// Start the algorithm
for (size_t it = 0; it < iterations; ++it)
{
// Synchronize the OpenMP threads
#pragma omp barrier
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Start the timer for the communication time
double t0 = MPI_Wtime();
// Use the windows to access the rows of the neighbors
#pragma omp target data use_device_ptr(matrix)
{
MPI_Win_fence(0, win_top); // Synchronize the top window
MPI_Win_fence(0, win_bottom); // Synchronize the bottom window
MPI_Get(&matrix[(local_rows + 1) * (dimension + 2)], (MPI_Aint) (dimension + 2), MPI_DOUBLE, proc_below,
0, (MPI_Aint) (dimension + 2), MPI_DOUBLE, win_top); // Get the first row of the next process
MPI_Get(matrix, (MPI_Aint) (dimension + 2), MPI_DOUBLE, proc_above,
0, (MPI_Aint) (dimension + 2), MPI_DOUBLE, win_bottom); // Get the last row of the previous process
MPI_Win_fence(0, win_top); // Synchronize the top window
MPI_Win_fence(0, win_bottom); // Synchronize the bottom window
}
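// After the closing fences, row 0 holds the last interior row of proc_above and row local_rows + 1 holds
// the first interior row of proc_below (left untouched when the neighbor is MPI_PROC_NULL).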
// Synchronize the OpenMP threads
#pragma omp barrier
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Stop the timer for the communication time
communication_time += MPI_Wtime() - t0;
// Start the timer for the computation time
double t1 = MPI_Wtime();
// Compute the new values of the matrix
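// Jacobi update: matrix_new[i][j] = 0.25 * (matrix[i-1][j] + matrix[i+1][j] + matrix[i][j-1] + matrix[i][j+1])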
#pragma omp target teams distribute parallel for simd collapse(2) num_teams(108) // Offload the Jacobi update to the device (hard-coded team count)
for (size_t i = 1; i <= local_rows; ++i)
{
for (size_t j = 1; j <= dimension; ++j)
{
matrix_new[(i * (dimension + 2)) + j] = 0.25 * (matrix[((i - 1) * (dimension + 2)) + j] +
matrix[(i * (dimension + 2)) + (j + 1)] +
matrix[((i + 1) * (dimension + 2)) + j] +
matrix[(i * (dimension + 2)) + (j - 1)]);
}
}
// Copy the updated values from matrix_new back into matrix for the next iteration
#pragma omp target teams distribute parallel for simd collapse(2) num_teams(108) // Offload the copy to the device (hard-coded team count)
for (size_t i = 1; i <= local_rows; ++i)
{
for (size_t j = 1; j <= dimension; ++j)
{
matrix[(i * (dimension + 2)) + j] = matrix_new[(i * (dimension + 2)) + j];
}
}
// Synchronize the OpenMP threads
#pragma omp barrier
// Synchronize the processes
MPI_Barrier(MPI_COMM_WORLD);
// Stop the timer for the computation time
compute_time += MPI_Wtime() - t1;
} // End of iterations
}
// Free the memory windows
MPI_Win_free(&win_top);
MPI_Win_free(&win_bottom);
// Print the communication, computation and total times
if (rank == 0) {
printf("Communication time: %f seconds\n", communication_time);
printf("Computation time: %f seconds\n", compute_time);
double total_time = init_time + compute_time + communication_time + copy_time;
printf("Total time: %f seconds\n", total_time);
}
// Save the results to a file
MPI_File fh;
MPI_File_open(MPI_COMM_WORLD, "solution.bin", MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
// Handle the distribution of the data across the processes
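// File layout: rank 0 also writes its top boundary row, the last rank also writes its bottom boundary row,
// and every other rank writes only its local_rows interior rows, so the file holds the complete
// (dimension + 2) x (dimension + 2) grid in row order.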
if (rank == 0) { // First process
MPI_Offset fh_offset = 0;
MPI_File_write_at_all(fh, fh_offset, matrix, (local_rows + 1) * (dimension + 2), MPI_DOUBLE, MPI_STATUS_IGNORE);
}
else if (rank == size - 1) { // Last process
MPI_Offset fh_offset = (rank * local_rows * (dimension + 2)) + (dimension + 2);
// Account for the extra rows held by the first extra_rows processes (offset is in elements, i.e. whole rows)
if (rank >= (int)extra_rows) {
fh_offset += extra_rows * (dimension + 2);
}
fh_offset *= sizeof(double);
MPI_File_write_at_all(fh, fh_offset, &matrix[dimension + 2], (local_rows + 1) * (dimension + 2), MPI_DOUBLE, MPI_STATUS_IGNORE);
}
else { // All other processes
MPI_Offset fh_offset = (rank * local_rows * (dimension + 2)) + (dimension + 2);
// Account for the extra rows held by the first extra_rows processes (offset is in elements, i.e. whole rows)
if (rank >= (int)extra_rows) {
fh_offset += extra_rows * (dimension + 2);
}
fh_offset *= sizeof(double);
MPI_File_write_at_all(fh, fh_offset, &matrix[dimension + 2], local_rows * (dimension + 2), MPI_DOUBLE, MPI_STATUS_IGNORE);
}
MPI_File_close(&fh);
// Free the allocated memory
MPI_Free_mem(matrix);
MPI_Free_mem(matrix_new);
// Save the times to a CSV file
if (rank == 0) {
FILE *fp = fopen("times.csv", "a+");
if (fp == NULL) {
fprintf(stderr, "Error opening file\n");
MPI_Finalize();
return 1;
}
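// CSV columns: dimension, iterations, init_time, compute_time, communication_time, copy_time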
fprintf(fp, "%zu,%zu,%f,%f,%f,%f\n", dimension, iterations, init_time, compute_time, communication_time, copy_time);
fclose(fp);
}
// Finalize MPI
MPI_Finalize();
return 0;
}