diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index d4db3d790bc..980c9156802 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -282,6 +282,10 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6; + + // Compute optimal frequency for subspace diagonalization based on problem size + this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline); + do { ++ntry; @@ -314,7 +318,7 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, // orthogonal psi by cholesky method this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); - if (current_scf_iter == 1 && ntry % this->nline == 0) { + if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) { this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); } } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h index 27f528024ba..e89a59a5b10 100644 --- a/source/source_hsolver/diago_bpcg.h +++ b/source/source_hsolver/diago_bpcg.h @@ -84,6 +84,8 @@ class DiagoBPCG int n_dim = 0; /// max iter steps for all-band cg loop int nline = 4; + /// optimal frequency for subspace diagonalization (computed dynamically) + int optimal_freq = 4; /// parallel matrix multiplication ModuleBase::PGemmCN pmmcn; PLinearTransform plintrans; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 88f94e288c6..ce9f1d62962 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -18,29 +18,75 @@ struct line_minimize_with_block_op const int& n_basis_max, const int& n_band) { + // OpenMP parallelization over bands: each band accesses a disjoint memory region + // [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur. + // MPI collective calls (reduce_pool) use per-band scalar reductions executed serially + // outside parallel regions to avoid thread-safety issues with MPI. + // Static scheduling is used since each band has equal workload (n_basis). + std::vector norms(n_band); + std::vector epsilo_0_arr(n_band); + std::vector epsilo_1_arr(n_band); + std::vector epsilo_2_arr(n_band); + + // Phase 1: parallel computation of per-band norms via BLAS dot +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (int band_idx = 0; band_idx < n_band; band_idx++) { - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); + norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + } + + // Phase 2: MPI reduction of norms across pool, then invert + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Parallel_Reduce::reduce_pool(norms[band_idx]); + norms[band_idx] = 1.0 / sqrt(norms[band_idx]); + } + + // Phase 3: parallel normalization of grad/hgrad and accumulation of epsilons +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real eps_0 = 0.0, eps_1 = 0.0, eps_2 = 0.0; + Real norm = norms[band_idx]; for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; grad_out[item] *= norm; hgrad_out[item] *= norm; - epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item])); - epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); - epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); + eps_0 += std::real(hpsi_out[item] * std::conj(psi_out[item])); + eps_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); + eps_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } - Parallel_Reduce::reduce_pool(epsilo_0); - Parallel_Reduce::reduce_pool(epsilo_1); - Parallel_Reduce::reduce_pool(epsilo_2); - theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); - cos_theta = std::cos(theta); - sin_theta = std::sin(theta); + epsilo_0_arr[band_idx] = eps_0; + epsilo_1_arr[band_idx] = eps_1; + epsilo_2_arr[band_idx] = eps_2; + } + + // Phase 4: MPI reduction of epsilons across pool + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Parallel_Reduce::reduce_pool(epsilo_0_arr[band_idx]); + Parallel_Reduce::reduce_pool(epsilo_1_arr[band_idx]); + Parallel_Reduce::reduce_pool(epsilo_2_arr[band_idx]); + } + + // Phase 5: parallel application of rotation to psi and hpsi +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo_0 = epsilo_0_arr[band_idx]; + Real epsilo_1 = epsilo_1_arr[band_idx]; + Real epsilo_2 = epsilo_2_arr[band_idx]; + Real theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); + Real cos_theta = std::cos(theta); + Real sin_theta = std::sin(theta); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -66,43 +112,101 @@ struct calc_grad_with_block_op const int& n_basis_max, const int& n_band) { + // OpenMP parallelization over bands: each band accesses a disjoint memory region + // [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur. + // MPI collective calls (reduce_pool) use per-band scalar reductions executed serially + // outside parallel regions to avoid thread-safety issues with MPI. + // Static scheduling is used since each band has equal workload (n_basis). + std::vector norms(n_band); + std::vector epsilo_arr(n_band); + std::vector err_arr(n_band); + std::vector beta_arr(n_band); + + // Phase 1: parallel computation of per-band norms via BLAS dot +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (int band_idx = 0; band_idx < n_band; band_idx++) { - Real err = 0.0; - Real beta = 0.0; - Real epsilo = 0.0; - Real grad_2 = {0.0}; - T grad_1 = {0.0, 0.0}; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); + norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + } + + // Phase 2: MPI reduction of norms across pool, then invert + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Parallel_Reduce::reduce_pool(norms[band_idx]); + norms[band_idx] = 1.0 / sqrt(norms[band_idx]); + } + + // Phase 3: parallel normalization of psi/hpsi and accumulation of epsilo +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real eps = 0.0; + Real norm = norms[band_idx]; for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; psi_out[item] *= norm; hpsi_out[item] *= norm; - epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); + eps += std::real(hpsi_out[item] * std::conj(psi_out[item])); } - Parallel_Reduce::reduce_pool(epsilo); + epsilo_arr[band_idx] = eps; + } + + // Phase 4: MPI reduction of epsilons across pool + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Parallel_Reduce::reduce_pool(epsilo_arr[band_idx]); + } + + // Phase 5: parallel computation of per-band err and beta +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real err = 0.0; + Real beta = 0.0; + Real epsilo = epsilo_arr[band_idx]; for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_2 = std::norm(grad_1); + T grad_1 = hpsi_out[item] - epsilo * psi_out[item]; + Real grad_2 = std::norm(grad_1); err += grad_2; - beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? + beta += grad_2 / prec_in[basis_idx]; } - Parallel_Reduce::reduce_pool(err); - Parallel_Reduce::reduce_pool(beta); + err_arr[band_idx] = err; + beta_arr[band_idx] = beta; + } + + // Phase 6: MPI reduction of err and beta across pool + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Parallel_Reduce::reduce_pool(err_arr[band_idx]); + Parallel_Reduce::reduce_pool(beta_arr[band_idx]); + } + + // Phase 7: parallel update of gradient and write output arrays +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo = epsilo_arr[band_idx]; + Real beta = beta_arr[band_idx]; for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; + T grad_1 = hpsi_out[item] - epsilo * psi_out[item]; grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; } beta_out[band_idx] = beta; - err_out[band_idx] = sqrt(err); + err_out[band_idx] = sqrt(err_arr[band_idx]); } } }; @@ -207,6 +311,50 @@ struct refresh_hcc_scc_vcc_op } }; + +/** + * @brief Compute the optimal frequency for subspace diagonalization calls + * based on problem size. Uses a tiered approach: + * - Small problems: less frequent calls reduce overhead + * - Large problems: more frequent calls maintain orthogonality + * + * The problem size is computed as n_band * n_basis using size_t to avoid + * integer overflow. The frequency is clamped to [1, nline] range and + * should not exceed the base iteration count. + * + * Tier thresholds (problem_size = n_band * n_basis): + * size < 10,000 -> freq = nline (small, minimal overhead needed) + * size < 100,000 -> freq = min(6, nline) + * size < 500,000 -> freq = min(5, nline) + * size < 2,000,000 -> freq = min(4, nline) (baseline) + * size < 10,000,000 -> freq = min(3, nline) + * size >= 10,000,000 -> freq = min(2, nline) (large, frequent calls) + * + * @param n_band Number of bands (eigenvectors). + * @param n_basis Number of basis functions. + * @param nline Base iteration count for SCF > 1. + * @return Optimal frequency (number of CG steps between subspace diag calls). + */ +int compute_optimal_freq(const int n_band, const int n_basis, const int nline) +{ + const size_t problem_size = static_cast(n_band) * static_cast(n_basis); + int freq = nline; + if (problem_size >= 10000000) { + freq = 2; + } else if (problem_size >= 2000000) { + freq = 3; + } else if (problem_size >= 500000) { + freq = 4; + } else if (problem_size >= 100000) { + freq = 5; + } else if (problem_size >= 10000) { + freq = 6; + } + if (freq < 1) freq = 1; + if (freq > nline) freq = nline; + return freq; +} + template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h index 9ac7c5e2cee..a29819b7877 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.h +++ b/source/source_hsolver/kernels/bpcg_kernel_op.h @@ -3,7 +3,21 @@ #include "source_base/macros.h" #include "source_base/module_device/types.h" namespace hsolver -{ +{/** + * @brief Compute the optimal frequency for subspace diagonalization calls + * based on problem size (n_band * n_basis). + * + * Large problems benefit from more frequent subspace diagonalization to + * maintain orthogonality, while small problems can reduce overhead by + * calling it less frequently. + * + * @param n_band Number of bands (eigenvectors). + * @param n_basis Number of basis functions. + * @param nline Base iteration count for SCF > 1. + * @return Optimal frequency (number of CG steps between calc_hsub_with_block calls). + */ +int compute_optimal_freq(const int n_band, const int n_basis, const int nline); + template struct line_minimize_with_block_op