Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion source/source_hsolver/diago_bpcg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
int max_iter = current_scf_iter > 1 ?
this->nline :
this->nline * 6;

// Compute optimal frequency for subspace diagonalization based on problem size
this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);

do
{
++ntry;
Expand Down Expand Up @@ -314,7 +318,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
// orthogonal psi by cholesky method
this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);

if (current_scf_iter == 1 && ntry % this->nline == 0) {
if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) {
this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
}
} while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
Expand Down
2 changes: 2 additions & 0 deletions source/source_hsolver/diago_bpcg.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class DiagoBPCG
int n_dim = 0;
/// max iter steps for all-band cg loop
int nline = 4;
/// optimal frequency for subspace diagonalization (computed dynamically)
int optimal_freq = 4;
/// parallel matrix multiplication
ModuleBase::PGemmCN<T, Device> pmmcn;
PLinearTransform<T, Device> plintrans;
Expand Down
210 changes: 179 additions & 31 deletions source/source_hsolver/kernels/bpcg_kernel_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,75 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
const int& n_basis_max,
const int& n_band)
{
// OpenMP parallelization over bands: each band accesses a disjoint memory region
// [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur.
// MPI collective calls (reduce_pool) use per-band scalar reductions executed serially
// outside parallel regions to avoid thread-safety issues with MPI.
// Static scheduling is used since each band has equal workload (n_basis).
std::vector<Real> norms(n_band);
std::vector<Real> epsilo_0_arr(n_band);
std::vector<Real> epsilo_1_arr(n_band);
std::vector<Real> epsilo_2_arr(n_band);

// Phase 1: parallel computation of per-band norms via BLAS dot
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
}

// Phase 2: MPI reduction of norms across pool, then invert
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Parallel_Reduce::reduce_pool(norms[band_idx]);
norms[band_idx] = 1.0 / sqrt(norms[band_idx]);
}

// Phase 3: parallel normalization of grad/hgrad and accumulation of epsilons
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real eps_0 = 0.0, eps_1 = 0.0, eps_2 = 0.0;
Real norm = norms[band_idx];
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
grad_out[item] *= norm;
hgrad_out[item] *= norm;
epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
eps_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
eps_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
eps_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
}
Parallel_Reduce::reduce_pool(epsilo_0);
Parallel_Reduce::reduce_pool(epsilo_1);
Parallel_Reduce::reduce_pool(epsilo_2);
theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
cos_theta = std::cos(theta);
sin_theta = std::sin(theta);
epsilo_0_arr[band_idx] = eps_0;
epsilo_1_arr[band_idx] = eps_1;
epsilo_2_arr[band_idx] = eps_2;
}

// Phase 4: MPI reduction of epsilons across pool
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Parallel_Reduce::reduce_pool(epsilo_0_arr[band_idx]);
Parallel_Reduce::reduce_pool(epsilo_1_arr[band_idx]);
Parallel_Reduce::reduce_pool(epsilo_2_arr[band_idx]);
}

// Phase 5: parallel application of rotation to psi and hpsi
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real epsilo_0 = epsilo_0_arr[band_idx];
Real epsilo_1 = epsilo_1_arr[band_idx];
Real epsilo_2 = epsilo_2_arr[band_idx];
Real theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
Real cos_theta = std::cos(theta);
Real sin_theta = std::sin(theta);
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
Expand All @@ -66,43 +112,101 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
const int& n_basis_max,
const int& n_band)
{
// OpenMP parallelization over bands: each band accesses a disjoint memory region
// [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur.
// MPI collective calls (reduce_pool) use per-band scalar reductions executed serially
// outside parallel regions to avoid thread-safety issues with MPI.
// Static scheduling is used since each band has equal workload (n_basis).
std::vector<Real> norms(n_band);
std::vector<Real> epsilo_arr(n_band);
std::vector<Real> err_arr(n_band);
std::vector<Real> beta_arr(n_band);

// Phase 1: parallel computation of per-band norms via BLAS dot
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real err = 0.0;
Real beta = 0.0;
Real epsilo = 0.0;
Real grad_2 = {0.0};
T grad_1 = {0.0, 0.0};
auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
}

// Phase 2: MPI reduction of norms across pool, then invert
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Parallel_Reduce::reduce_pool(norms[band_idx]);
norms[band_idx] = 1.0 / sqrt(norms[band_idx]);
}

// Phase 3: parallel normalization of psi/hpsi and accumulation of epsilo
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real eps = 0.0;
Real norm = norms[band_idx];
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
psi_out[item] *= norm;
hpsi_out[item] *= norm;
epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
eps += std::real(hpsi_out[item] * std::conj(psi_out[item]));
}
Parallel_Reduce::reduce_pool(epsilo);
epsilo_arr[band_idx] = eps;
}

// Phase 4: MPI reduction of epsilons across pool
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Parallel_Reduce::reduce_pool(epsilo_arr[band_idx]);
}

// Phase 5: parallel computation of per-band err and beta
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real err = 0.0;
Real beta = 0.0;
Real epsilo = epsilo_arr[band_idx];
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
grad_1 = hpsi_out[item] - epsilo * psi_out[item];
grad_2 = std::norm(grad_1);
T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
Real grad_2 = std::norm(grad_1);
err += grad_2;
beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
beta += grad_2 / prec_in[basis_idx];
}
Parallel_Reduce::reduce_pool(err);
Parallel_Reduce::reduce_pool(beta);
err_arr[band_idx] = err;
beta_arr[band_idx] = beta;
}

// Phase 6: MPI reduction of err and beta across pool
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Parallel_Reduce::reduce_pool(err_arr[band_idx]);
Parallel_Reduce::reduce_pool(beta_arr[band_idx]);
}

// Phase 7: parallel update of gradient and write output arrays
#ifdef _OPENMP
#pragma omp parallel for schedule(static)
#endif
for (int band_idx = 0; band_idx < n_band; band_idx++)
{
Real epsilo = epsilo_arr[band_idx];
Real beta = beta_arr[band_idx];
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
grad_1 = hpsi_out[item] - epsilo * psi_out[item];
T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
}
beta_out[band_idx] = beta;
err_out[band_idx] = sqrt(err);
err_out[band_idx] = sqrt(err_arr[band_idx]);
}
}
};
Expand Down Expand Up @@ -207,6 +311,50 @@ struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
}
};


/**
* @brief Compute the optimal frequency for subspace diagonalization calls
* based on problem size. Uses a tiered approach:
* - Small problems: less frequent calls reduce overhead
* - Large problems: more frequent calls maintain orthogonality
*
* The problem size is computed as n_band * n_basis using size_t to avoid
* integer overflow. The frequency is clamped to [1, nline] range and
* should not exceed the base iteration count.
*
* Tier thresholds (problem_size = n_band * n_basis):
* size < 10,000 -> freq = nline (small, minimal overhead needed)
* size < 100,000 -> freq = min(6, nline)
* size < 500,000 -> freq = min(5, nline)
* size < 2,000,000 -> freq = min(4, nline) (baseline)
* size < 10,000,000 -> freq = min(3, nline)
* size >= 10,000,000 -> freq = min(2, nline) (large, frequent calls)
*
* @param n_band Number of bands (eigenvectors).
* @param n_basis Number of basis functions.
* @param nline Base iteration count for SCF > 1.
* @return Optimal frequency (number of CG steps between subspace diag calls).
*/
int compute_optimal_freq(const int n_band, const int n_basis, const int nline)
{
const size_t problem_size = static_cast<size_t>(n_band) * static_cast<size_t>(n_basis);
int freq = nline;
if (problem_size >= 10000000) {
freq = 2;
} else if (problem_size >= 2000000) {
freq = 3;
} else if (problem_size >= 500000) {
freq = 4;
} else if (problem_size >= 100000) {
freq = 5;
} else if (problem_size >= 10000) {
freq = 6;
}
if (freq < 1) freq = 1;
if (freq > nline) freq = nline;
return freq;
}

template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
Expand Down
16 changes: 15 additions & 1 deletion source/source_hsolver/kernels/bpcg_kernel_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,21 @@
#include "source_base/macros.h"
#include "source_base/module_device/types.h"
namespace hsolver
{
{/**
* @brief Compute the optimal frequency for subspace diagonalization calls
* based on problem size (n_band * n_basis).
*
* Large problems benefit from more frequent subspace diagonalization to
* maintain orthogonality, while small problems can reduce overhead by
* calling it less frequently.
*
* @param n_band Number of bands (eigenvectors).
* @param n_basis Number of basis functions.
* @param nline Base iteration count for SCF > 1.
* @return Optimal frequency (number of CG steps between calc_hsub_with_block calls).
*/
int compute_optimal_freq(const int n_band, const int n_basis, const int nline);


template <typename T, typename Device>
struct line_minimize_with_block_op
Expand Down
Loading