diff --git a/include/GMGPolar/gmgpolar.h b/include/GMGPolar/gmgpolar.h index 1f46bfe3..c1d5478d 100644 --- a/include/GMGPolar/gmgpolar.h +++ b/include/GMGPolar/gmgpolar.h @@ -128,6 +128,7 @@ class GMGPolar : public IGMGPolar double mean_residual_reduction_factor_; bool converged(double current_residual_norm, double first_residual_norm); +public: // Public due to cuda restrictions /* ---------------------------------------------------- */ /* Compute exact error if an exact solution is provided */ // The results are stored as a pair: (weighted L2 error, infinity error). @@ -138,15 +139,19 @@ class GMGPolar : public IGMGPolar /* --------------- */ /* Setup Functions */ - int chooseNumberOfLevels(const PolarGrid& finest_grid); - // Public due to cuda restrictions -public: template void build_rhs_f(const Level& level, HostVector rhs_f, const BoundaryConditions& boundary_conditions, const SourceTerm& source_term); void discretize_rhs_f(const Level& level, HostVector rhs_f); + /* --------------- */ + /* Solve Functions */ + void applyExtrapolation(int current_level, HostVector fine_values, HostConstVector coarse_values); + private: + /* --------------- */ + /* Setup Functions */ + int chooseNumberOfLevels(const PolarGrid& finest_grid); bool checkUniformRefinement(const PolarGrid& grid, double tolerance) const; /* --------------- */ @@ -168,15 +173,13 @@ class GMGPolar : public IGMGPolar int iterations); void applyExtrapolatedMultigridIterations(Level& level, MultigridCycleType cycle, int iterations); - // Compute the extrapolated values: u_ex = 4/3 u_fine - 1/3 u_coarse - void applyExtrapolation(int current_level, HostVector fine_values, HostConstVector coarse_values); /* ----------------- */ /* Print information */ void printSettings(const PolarGrid& finest_grid, const PolarGrid& coarsest_grid) const; - void printIterationHeader(const ExactSolution* exact_solution); + void printIterationHeader(const bool is_exact_solution_provided); void printIterationInfo(int iteration, double current_residual_norm, double current_relative_residual_norm, - const ExactSolution* exact_solution); + const bool is_exact_solution_provided); /* ------------------- */ /* Multigrid Functions */ @@ -219,4 +222,5 @@ class GMGPolar : public IGMGPolar #include "MultigridMethods/extrapolated_multigrid_V_Cycle.h" #include "MultigridMethods/multigrid_F_Cycle.h" #include "MultigridMethods/multigrid_W_Cycle.h" + } // namespace gmgpolar diff --git a/include/GMGPolar/solver.h b/include/GMGPolar/solver.h index 6a38ee74..35c35628 100644 --- a/include/GMGPolar/solver.h +++ b/include/GMGPolar/solver.h @@ -96,8 +96,9 @@ void GMGPolar::solve(const BoundaryC LIKWID_STOP("Solver"); auto start_check_exact_error = std::chrono::high_resolution_clock::now(); - if (exact_solution_ != nullptr) + if (exact_solution_) { evaluateExactError(level, *exact_solution_); + } auto end_check_exact_error = std::chrono::high_resolution_clock::now(); t_check_exact_error_ += std::chrono::duration(end_check_exact_error - start_check_exact_error).count(); @@ -169,7 +170,7 @@ void GMGPolar::solve(const BoundaryC if (paraview_) { writeToVTK("output_solution", level, level.solution()); - if (exact_solution_ != nullptr) { + if (exact_solution_) { computeExactError(level, level.solution(), level.residual(), *exact_solution_); writeToVTK("output_error", level, level.residual()); } @@ -244,8 +245,9 @@ void GMGPolar::solveMultigrid(double LIKWID_STOP("Solver"); auto start_check_exact_error = std::chrono::high_resolution_clock::now(); - if (exact_solution_ != nullptr) + if (exact_solution_) { evaluateExactError(level, *exact_solution_); + } auto end_check_exact_error = std::chrono::high_resolution_clock::now(); t_check_exact_error_ += std::chrono::duration(end_check_exact_error - start_check_exact_error).count(); @@ -511,46 +513,48 @@ void GMGPolar::applyExtrapolation(in assert(std::ssize(fine_values) == fineGrid.numberOfNodes()); assert(std::ssize(coarse_values) == coarseGrid.numberOfNodes()); -#pragma omp parallel - { -/* Circluar Indexing Section */ -/* For loop matches circular access pattern */ -#pragma omp for nowait - for (int i_r = 0; i_r < fineGrid.numberSmootherCircles(); i_r++) { - int i_r_coarse = i_r / 2; - for (int i_theta = 0; i_theta < fineGrid.ntheta(); i_theta++) { - int i_theta_coarse = i_theta / 2; - - if (i_r & 1 || i_theta & 1) { - fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; - } - else { - int fine_idx = fineGrid.index(i_r, i_theta); - int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); - fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; - } + /* We split the loops into two regions to better respect the */ + /* access patterns of the smoother and improve cache locality. */ + + /* Circular Indexing Section */ + /* For loop matches circular access pattern */ + Kokkos::parallel_for( + "Extrapolation: Apply Extrapolation (Circular)", + Kokkos::MDRangePolicy>( + {0, 0}, {fineGrid.numberSmootherCircles(), fineGrid.ntheta()}), + KOKKOS_LAMBDA(const int i_r, const int i_theta) { + const int i_r_coarse = i_r / 2; + const int i_theta_coarse = i_theta / 2; + if (i_r & 1 || i_theta & 1) { + fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; } - } - -/* Radial Indexing Section */ -/* For loop matches radial access pattern */ -#pragma omp for nowait - for (int i_theta = 0; i_theta < fineGrid.ntheta(); i_theta++) { - int i_theta_coarse = i_theta / 2; - for (int i_r = fineGrid.numberSmootherCircles(); i_r < fineGrid.nr(); i_r++) { - int i_r_coarse = i_r / 2; - - if (i_r & 1 || i_theta & 1) { - fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; - } - else { - int fine_idx = fineGrid.index(i_r, i_theta); - int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); - fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; - } + else { + const int fine_idx = fineGrid.index(i_r, i_theta); + const int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); + fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; } - } - } + }); + + /* Radial Indexing Section */ + /* For loop matches radial access pattern */ + Kokkos::parallel_for( + "Extrapolation: Apply Extrapolation (Radial)", + Kokkos::MDRangePolicy>({0, fineGrid.numberSmootherCircles()}, + {fineGrid.ntheta(), fineGrid.nr()}), + KOKKOS_LAMBDA(const int i_theta, const int i_r) { + const int i_r_coarse = i_r / 2; + const int i_theta_coarse = i_theta / 2; + if (i_r & 1 || i_theta & 1) { + fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; + } + else { + const int fine_idx = fineGrid.index(i_r, i_theta); + const int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); + fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; + } + }); + + Kokkos::fence(); } // ============================================================================= @@ -591,31 +595,46 @@ std::pair GMGPolar:: assert(solution.size() == error.size()); assert(std::ssize(solution) == grid.numberOfNodes()); -#pragma omp parallel - { -#pragma omp for nowait - for (int i_r = 0; i_r < grid.numberSmootherCircles(); i_r++) { - double r = grid.radius(i_r); - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - double theta = grid.theta(i_theta); - error[grid.index(i_r, i_theta)] = - exact_solution.exact_solution(r, theta) - solution[grid.index(i_r, i_theta)]; - } - } -#pragma omp for nowait - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - double theta = grid.theta(i_theta); - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - double r = grid.radius(i_r); - error[grid.index(i_r, i_theta)] = - exact_solution.exact_solution(r, theta) - solution[grid.index(i_r, i_theta)]; - } - } - } - HostConstVector c_error = error; - double weighted_euclidean_error = l2_norm(c_error) / std::sqrt(grid.numberOfNodes()); - double infinity_error = infinity_norm(c_error); - + /* Fill exact solution on the host. */ + // Note: We must compute the exact solution values on the host because the + // ExactSolution object is not designed to be used on the device. + // As it is planned that the solution and error vector will use device memory during GPU porting, + // we need to transfer the exact solution values from host to device using the Kokkos::deep_copy operation. + HostVector exact_solution_values_host("Exact Solution Values Host", grid.numberOfNodes()); + + Kokkos::parallel_for( + "Exact Error: Compute Exact Solution Values (Circular)", + Kokkos::MDRangePolicy>( // Host parallel loop + {0, 0}, // Starting index + {grid.numberSmootherCircles(), grid.ntheta()}), // Ending index + [&](const int i_r, const int i_theta) { + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + const int index = grid.index(i_r, i_theta); + exact_solution_values_host[index] = exact_solution.exact_solution(radius, theta); + }); + Kokkos::parallel_for( + "Exact Error: Compute Exact Solution Values (Radial)", + Kokkos::MDRangePolicy>( // Host parallel loop + {0, grid.numberSmootherCircles()}, // Starting index + {grid.ntheta(), grid.nr()}), // Ending index + [&](const int i_theta, const int i_r) { + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + const int index = grid.index(i_r, i_theta); + exact_solution_values_host[index] = exact_solution.exact_solution(radius, theta); + }); + Kokkos::fence(); + + // Transfer the exact solution values from host to device memory for error computation. + Kokkos::deep_copy(error, exact_solution_values_host); + + // Compute the error as the difference between the exact and numerical solutions. + subtract(error, HostConstVector(solution)); + + // Compute the weighted L2 norm and infinity norm of the error. + const double weighted_euclidean_error = l2_norm(HostConstVector(error)) / std::sqrt(grid.numberOfNodes()); + const double infinity_error = infinity_norm(HostConstVector(error)); return std::make_pair(weighted_euclidean_error, infinity_error); } @@ -645,7 +664,7 @@ bool GMGPolar::converged(double resi // ============================================================================= template -void GMGPolar::printIterationHeader(const ExactSolution* exact_solution) +void GMGPolar::printIterationHeader(const bool is_exact_solution_provided) { if (verbose_ <= 0) return; @@ -663,7 +682,7 @@ void GMGPolar::printIterationHeader( else std::cout << std::setw(15 + table_spacing) << "||r_k||/||rhs||"; } - if (exact_solution != nullptr) { + if (is_exact_solution_provided) { std::cout << std::setw(12 + table_spacing) << "||u-u_k||_l2"; std::cout << std::setw(13 + table_spacing) << "||u-u_k||_inf"; } @@ -675,7 +694,7 @@ template ::printIterationInfo(int iteration, double current_residual_norm, double current_relative_residual_norm, - const ExactSolution* exact_solution) + const bool is_exact_solution_provided) { if (verbose_ <= 0) return; @@ -687,7 +706,7 @@ void GMGPolar::printIterationInfo(in std::cout << std::setw(9 + table_spacing + 2) << current_residual_norm; std::cout << std::setw(15 + table_spacing) << current_relative_residual_norm; } - if (exact_solution != nullptr) { + if (is_exact_solution_provided) { std::cout << std::setw(12 + table_spacing) << exact_errors_.back().first; std::cout << std::setw(13 + table_spacing) << exact_errors_.back().second; }