Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion source/source_hsolver/diago_bpcg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
int max_iter = current_scf_iter > 1 ?
this->nline :
this->nline * 6;

// Compute optimal frequency for subspace diagonalization based on problem size
this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);

do
{
++ntry;
Expand Down Expand Up @@ -314,7 +318,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
// orthogonal psi by cholesky method
this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);

if (current_scf_iter == 1 && ntry % this->nline == 0) {
if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) {
this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
}
} while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
Expand Down
2 changes: 2 additions & 0 deletions source/source_hsolver/diago_bpcg.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class DiagoBPCG
int n_dim = 0;
/// max iter steps for all-band cg loop
int nline = 4;
/// optimal frequency for subspace diagonalization (computed dynamically)
int optimal_freq = 4;
/// parallel matrix multiplication
ModuleBase::PGemmCN<T, Device> pmmcn;
PLinearTransform<T, Device> plintrans;
Expand Down
44 changes: 44 additions & 0 deletions source/source_hsolver/kernels/bpcg_kernel_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,50 @@ struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
}
};


/**
 * @brief Compute how many CG steps to run between subspace diagonalization
 *        calls, based on the problem size.
 *
 * The problem size is n_band * n_basis, evaluated in size_t so the product
 * does not overflow int. Larger problems get a smaller step count (more
 * frequent subspace diagonalization); smaller problems get a larger one.
 *
 * Tier thresholds (problem_size = n_band * n_basis), before clamping:
 *   size <         10,000 -> freq = nline
 *   size <        100,000 -> freq = 6
 *   size <        500,000 -> freq = 5
 *   size <      2,000,000 -> freq = 4
 *   size <     10,000,000 -> freq = 3
 *   size >=    10,000,000 -> freq = 2
 *
 * The tier value is then limited to at most nline and, last of all, raised
 * to at least 1. The result is therefore always >= 1, even when nline itself
 * is zero or negative, so it is always safe to use as a modulus divisor.
 *
 * A non-positive n_band or n_basis is treated as an empty problem (size 0)
 * rather than being converted to a huge unsigned value.
 *
 * @param n_band  Number of bands (eigenvectors).
 * @param n_basis Number of basis functions.
 * @param nline   Base iteration count; upper bound for the returned value.
 * @return Number of CG steps between subspace diagonalization calls, >= 1.
 */
int compute_optimal_freq(const int n_band, const int n_basis, const int nline)
{
    // Guard the int -> size_t conversion: a negative value would wrap around
    // to a huge size and wrongly select the "large problem" tier.
    const bool has_work = (n_band > 0) && (n_basis > 0);
    const size_t problem_size
        = has_work ? static_cast<size_t>(n_band) * static_cast<size_t>(n_basis) : 0;

    int freq = nline;
    if (problem_size >= 10000000) {
        freq = 2;
    } else if (problem_size >= 2000000) {
        freq = 3;
    } else if (problem_size >= 500000) {
        freq = 4;
    } else if (problem_size >= 100000) {
        freq = 5;
    } else if (problem_size >= 10000) {
        freq = 6;
    }

    // Apply the upper bound first and the lower bound last, so that a
    // non-positive nline can never produce a result below 1.
    if (freq > nline) {
        freq = nline;
    }
    if (freq < 1) {
        freq = 1;
    }
    return freq;
}

template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
Expand Down
16 changes: 15 additions & 1 deletion source/source_hsolver/kernels/bpcg_kernel_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,21 @@
#include "source_base/macros.h"
#include "source_base/module_device/types.h"
namespace hsolver
{
{/**
* @brief Compute the optimal frequency for subspace diagonalization calls
* based on problem size (n_band * n_basis).
*
* Large problems benefit from more frequent subspace diagonalization to
* maintain orthogonality, while small problems can reduce overhead by
* calling it less frequently.
*
* @param n_band Number of bands (eigenvectors).
* @param n_basis Number of basis functions.
* @param nline Base iteration count for SCF > 1.
* @return Optimal frequency (number of CG steps between calc_hsub_with_block calls).
*/
int compute_optimal_freq(const int n_band, const int n_basis, const int nline);


template <typename T, typename Device>
struct line_minimize_with_block_op
Expand Down
Loading