From a7c9671a093c8df5ef361d8964fc82c0462cc2c6 Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Fri, 5 Jun 2026 20:52:25 +0800 Subject: [PATCH 1/2] refactor: omtimize subspace diag freq in bpcg --- source/source_hsolver/diago_bpcg.cpp | 6 ++- source/source_hsolver/diago_bpcg.h | 2 + .../source_hsolver/kernels/bpcg_kernel_op.cpp | 44 +++++++++++++++++++ .../source_hsolver/kernels/bpcg_kernel_op.h | 16 ++++++- 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index d4db3d790bc..980c9156802 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -282,6 +282,10 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6; + + // Compute optimal frequency for subspace diagonalization based on problem size + this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline); + do { ++ntry; @@ -314,7 +318,7 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, // orthogonal psi by cholesky method this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); - if (current_scf_iter == 1 && ntry % this->nline == 0) { + if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) { this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); } } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h index 27f528024ba..e89a59a5b10 100644 --- a/source/source_hsolver/diago_bpcg.h +++ b/source/source_hsolver/diago_bpcg.h @@ -84,6 +84,8 @@ class DiagoBPCG int n_dim = 0; /// max iter steps for all-band cg loop int nline = 4; + /// optimal frequency for subspace diagonalization (computed dynamically) + int optimal_freq = 4; /// parallel matrix multiplication ModuleBase::PGemmCN pmmcn; PLinearTransform plintrans; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 88f94e288c6..e3af30c98af 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -207,6 +207,50 @@ struct refresh_hcc_scc_vcc_op } }; + +/** + * @brief Compute the optimal frequency for subspace diagonalization calls + * based on problem size. Uses a tiered approach: + * - Small problems: less frequent calls reduce overhead + * - Large problems: more frequent calls maintain orthogonality + * + * The problem size is computed as n_band * n_basis using size_t to avoid + * integer overflow. The frequency is clamped to [1, nline] range and + * should not exceed the base iteration count. + * + * Tier thresholds (problem_size = n_band * n_basis): + * size < 10,000 -> freq = nline (small, minimal overhead needed) + * size < 100,000 -> freq = min(6, nline) + * size < 500,000 -> freq = min(5, nline) + * size < 2,000,000 -> freq = min(4, nline) (baseline) + * size < 10,000,000 -> freq = min(3, nline) + * size >= 10,000,000 -> freq = min(2, nline) (large, frequent calls) + * + * @param n_band Number of bands (eigenvectors). + * @param n_basis Number of basis functions. + * @param nline Base iteration count for SCF > 1. + * @return Optimal frequency (number of CG steps between subspace diag calls). + */ +int compute_optimal_freq(const int n_band, const int n_basis, const int nline) +{ + const size_t problem_size = static_cast(n_band) * static_cast(n_basis); + int freq = nline; + if (problem_size >= 10000000) { + freq = 2; + } else if (problem_size >= 2000000) { + freq = 3; + } else if (problem_size >= 500000) { + freq = 4; + } else if (problem_size >= 100000) { + freq = 5; + } else if (problem_size >= 10000) { + freq = 6; + } + if (freq < 1) freq = 1; + if (freq > nline) freq = nline; + return freq; +} + template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; template struct line_minimize_with_block_op, base_device::DEVICE_CPU>; template struct calc_grad_with_block_op, base_device::DEVICE_CPU>; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h index 9ac7c5e2cee..a29819b7877 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.h +++ b/source/source_hsolver/kernels/bpcg_kernel_op.h @@ -3,7 +3,21 @@ #include "source_base/macros.h" #include "source_base/module_device/types.h" namespace hsolver -{ +{/** + * @brief Compute the optimal frequency for subspace diagonalization calls + * based on problem size (n_band * n_basis). + * + * Large problems benefit from more frequent subspace diagonalization to + * maintain orthogonality, while small problems can reduce overhead by + * calling it less frequently. + * + * @param n_band Number of bands (eigenvectors). + * @param n_basis Number of basis functions. + * @param nline Base iteration count for SCF > 1. + * @return Optimal frequency (number of CG steps between calc_hsub_with_block calls). + */ +int compute_optimal_freq(const int n_band, const int n_basis, const int nline); + template struct line_minimize_with_block_op From 4b5fc5009b94605895ab56cb63e8a66e6897bc4f Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Sat, 6 Jun 2026 14:09:58 +0800 Subject: [PATCH 2/2] refactor:add adaptive frequency in bpcg --- source/source_hsolver/diago_bpcg.cpp | 737 +++++++++--------- source/source_hsolver/diago_bpcg.h | 19 + .../source_hsolver/kernels/bpcg_kernel_op.cpp | 34 + .../source_hsolver/kernels/bpcg_kernel_op.h | 28 + 4 files changed, 471 insertions(+), 347 deletions(-) diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index 980c9156802..ed557b8c9e9 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -1,347 +1,390 @@ -#include "source_hsolver/diago_bpcg.h" - -#include "diago_iter_assist.h" -#include "source_base/global_function.h" -#include "source_base/kernels/math_kernel_op.h" -#include "source_base/parallel_comm.h" // different MPI worlds -#include "source_hsolver/kernels/bpcg_kernel_op.h" -#include "para_linear_transform.h" - -#include -#include -#include -#include - -namespace hsolver { - -template -DiagoBPCG::DiagoBPCG(const Real* precondition_in) -{ - this->r_type = ct::DataTypeToEnum::value; - this->t_type = ct::DataTypeToEnum::value; - this->device_type = ct::DeviceTypeToEnum::value; - - this->h_prec = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis})); - - this->one = &one_; - this->zero = &zero_; - this->neg_one = &neg_one_; -} - -template -DiagoBPCG::~DiagoBPCG() { - // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data -} - -template -void DiagoBPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) { - // Specify the problem size n_basis, n_band, while lda is n_basis - this->n_band = nband; - this->n_band_l = nband_l; - this->n_basis = nbasis; - this->n_dim = ndim; - - // All column major tensors - - this->beta = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); - this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); - this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); - - this->hsub = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band})); - - this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->hgrad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - this->grad_old = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); - - this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); - - this->grad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); -#ifdef __MPI - this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); - this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false); -#else - this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); - this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false); -#endif -} - -template -bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vector& ethr_band) -{ - Real* _err_st = err_in.data(); - bool not_conv = false; - std::vector tmp_cpu; - if (err_in.device_type() == ct::DeviceType::GpuDevice) { - // ct::Tensor h_err_in = err_in.to_device(); - // _err_st = h_err_in.data(); - // qianrui change it, because it can not pass the valgrind test - tmp_cpu.resize(this->n_band_l); - _err_st = tmp_cpu.data(); - syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); - } - for (int ii = 0; ii < this->n_band_l; ii++) { - if (_err_st[ii] > ethr_band[ii]) { - not_conv = true; - } - } -#ifdef __MPI - MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); -#endif - return not_conv; -} - -// Finally, the last one! -template -void DiagoBPCG::line_minimize( - ct::Tensor& grad_in, - ct::Tensor& hgrad_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out) -{ - line_minimize_with_block_op()(grad_in.data(), - hgrad_in.data(), - psi_out.data(), - hpsi_out.data(), - this->n_dim, - this->n_basis, - this->n_band_l); -} - - -// Finally, the last two! -template -void DiagoBPCG::orth_cholesky( - ct::Tensor& workspace_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out) -{ - // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band) - this->pmmcn.multiply(1.0, psi_out.data(), psi_out.data(), 0.0, hsub_out.data()); - - // set hsub matrix to lower format; - ct::kernels::set_matrix()( - 'L', hsub_out.data(), this->n_band); - - ct::kernels::lapack_potrf()( - 'U', this->n_band, hsub_out.data(), this->n_band); - ct::kernels::lapack_trtri()( - 'U', 'N', this->n_band, hsub_out.data(), this->n_band); - - this->rotate_wf(hsub_out, psi_out, workspace_in); - this->rotate_wf(hsub_out, hpsi_out, workspace_in); -} - -template -void DiagoBPCG::calc_grad_with_block( - const ct::Tensor& prec_in, - ct::Tensor& err_out, - ct::Tensor& beta_out, - ct::Tensor& psi_in, - ct::Tensor& hpsi_in, - ct::Tensor& grad_out, - ct::Tensor& grad_old_out) -{ - calc_grad_with_block_op()(prec_in.data(), - err_out.data(), - beta_out.data(), - psi_in.data(), - hpsi_in.data(), - grad_out.data(), - grad_old_out.data(), - this->n_dim, - this->n_basis, - this->n_band_l); -} - -template -void DiagoBPCG::calc_prec() -{ - syncmem_var_h2d_op()(this->prec.template data(), this->h_prec.template data(), this->n_basis); -} - -template -void DiagoBPCG::orth_projection( - const ct::Tensor& psi_in, - ct::Tensor& hsub_in, - ct::Tensor& grad_out) -{ - // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band) - this->pmmcn.multiply(1.0, psi_in.data(), grad_out.data(), 0.0, hsub_in.data()); - - // grad_out(n_basis x n_band) = 1.0 * grad_out(n_basis x n_band) - psi_in(n_basis x n_band) * hsub_in(n_band x - // n_band) - this->plintrans.act(-1.0, psi_in.data(), hsub_in.data(), 1.0, grad_out.data()); - return; -} - -template -void DiagoBPCG::rotate_wf( - const ct::Tensor& hsub_in, - ct::Tensor& psi_out, - ct::Tensor& workspace_in) -{ - // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) - this->plintrans.act(1.0, psi_out.data(), hsub_in.data(), 0.0, workspace_in.data()); - syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band_l * this->n_basis); - - return; -} - -template -void DiagoBPCG::calc_hpsi_with_block( - const HPsiFunc& hpsi_func, - T *psi_in, - ct::Tensor& hpsi_out) -{ - // calculate all-band hpsi - hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); -} - -template -void DiagoBPCG::diag_hsub( - const ct::Tensor& psi_in, - const ct::Tensor& hpsi_in, - ct::Tensor& hsub_out, - ct::Tensor& eigenvalue_out) -{ - // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band) - this->pmmcn.multiply(1.0, hpsi_in.data(), psi_in.data(), 0.0, hsub_out.data()); - - // ct::kernels::lapack_heevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); - ct::kernels::lapack_heevd()(this->n_band, hsub_out.data(), this->n_band, eigenvalue_out.data()); - - return; -} - -template -void DiagoBPCG::calc_hsub_with_block( - const HPsiFunc& hpsi_func, - T *psi_in, - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out, - ct::Tensor& workspace_in, - ct::Tensor& eigenvalue_out) -{ - // Apply the H operator to psi and obtain the hpsi matrix. - this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out); - - // Diagonalization of the subspace matrix. - this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out); - - // inplace matmul to get the initial guessed wavefunction psi. - // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] - // hpsi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] - this->rotate_wf(hsub_out, psi_out, workspace_in); - this->rotate_wf(hsub_out, hpsi_out, workspace_in); - - return; -} - -template -void DiagoBPCG::calc_hsub_with_block_exit( - ct::Tensor& psi_out, - ct::Tensor& hpsi_out, - ct::Tensor& hsub_out, - ct::Tensor& workspace_in, - ct::Tensor& eigenvalue_out) -{ - // Diagonalization of the subspace matrix. - this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out); - - // inplace matmul to get the initial guessed wavefunction psi. - // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] - this->rotate_wf(hsub_out, psi_out, workspace_in); - - return; -} - -template -void DiagoBPCG::diag(const HPsiFunc& hpsi_func, - T* psi_in, - Real* eigenvalue_in, - const std::vector& ethr_band) -{ - const int current_scf_iter = hsolver::DiagoIterAssist::SCF_ITER; - // Get the pointer of the input psi - this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis})); - - // Update the precondition array - this->calc_prec(); - - // Improving the initial guess of the wave function psi through a subspace diagonalization. - this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); - - setmem_complex_op()(this->grad_old.template data(), 0, this->n_basis * this->n_band_l); - - setmem_var_op()(this->beta.template data(), std::numeric_limits::infinity(), this->n_band_l); - - int ntry = 0; - int max_iter = current_scf_iter > 1 ? - this->nline : - this->nline * 6; - - // Compute optimal frequency for subspace diagonalization based on problem size - this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline); - - do - { - ++ntry; - // Be careful here ! dangerous zone! - // 1. normalize psi - // 2. calculate the epsilo - // 3. calculate the gradient by hpsi - epsilo * psi - // 4. gradient mix with the previous gradient - // 5. Do precondition - this->calc_grad_with_block(this->prec, this->err_st, this->beta, - this->psi, this->hpsi, this->grad, this->grad_old); - - // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi - // for all 'j less or equal to i'. - // Note: hsub and work are only used to store intermediate variables of gemm operator. - this->orth_projection(this->psi, this->hsub, this->grad); - - // this->grad_old = this->grad; - syncmem_complex_op()(this->grad_old.template data(), this->grad.template data(), n_basis * n_band_l); - - // Calculate H|grad> matrix - this->calc_hpsi_with_block(hpsi_func, this->grad.template data(), /*this->grad_wrapper[0],*/ this->hgrad); - - // optimize psi as well as the hpsi - // 1. normalize grad - // 2. calculate theta - // 3. update psi as well as hpsi - this->line_minimize(this->grad, this->hgrad, this->psi, this->hpsi); - - // orthogonal psi by cholesky method - this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); - - if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) { - this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); - } - } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); - - this->calc_hsub_with_block_exit(this->psi, this->hpsi, this->hsub, this->work, this->eigen); - - int start_nband = 0; -#ifdef __MPI - if (this->plintrans.nproc_col > 1) - { - start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; - } -#endif - syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data() + start_nband, this->n_band_l); - - return; -} - -template class DiagoBPCG, base_device::DEVICE_CPU>; -template class DiagoBPCG, base_device::DEVICE_CPU>; -#if ((defined __CUDA) || (defined __ROCM)) -template class DiagoBPCG, base_device::DEVICE_GPU>; -template class DiagoBPCG, base_device::DEVICE_GPU>; -#endif - -} // namespace hsolver +#include "source_hsolver/diago_bpcg.h" + +#include "diago_iter_assist.h" +#include "source_base/global_function.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/parallel_comm.h" // different MPI worlds +#include "source_base/parallel_reduce.h" +#include "source_hsolver/kernels/bpcg_kernel_op.h" +#include "para_linear_transform.h" + +#include +#include +#include +#include + +namespace hsolver { + +template +DiagoBPCG::DiagoBPCG(const Real* precondition_in) +{ + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + this->device_type = ct::DeviceTypeToEnum::value; + + this->h_prec = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis})); + + this->one = &one_; + this->zero = &zero_; + this->neg_one = &neg_one_; +} + +template +DiagoBPCG::~DiagoBPCG() { + // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data +} + +template +void DiagoBPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) { + // Specify the problem size n_basis, n_band, while lda is n_basis + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; + + // All column major tensors + + this->beta = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); + this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + + this->hsub = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band})); + + this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hgrad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->grad_old = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + + this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); + + this->grad = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false); +#else + this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false); +#endif +} + +template +void DiagoBPCG::update_adaptive_freq(const std::vector& ethr_band) +{ + Real* _err_st = this->err_st.data(); + std::vector tmp_cpu; + if (this->err_st.device_type() == ct::DeviceType::GpuDevice) { + tmp_cpu.resize(this->n_band_l); + _err_st = tmp_cpu.data(); + syncmem_var_d2h_op()(_err_st, this->err_st.template data(), this->n_band_l); + } + + Real avg_err = 0.0; + for (int ii = 0; ii < this->n_band_l; ii++) { + avg_err += _err_st[ii]; + } + avg_err /= this->n_band_l; + +#ifdef __MPI + Parallel_Reduce::reduce_pool(avg_err); +#endif + + if (this->prev_avg_error > 0.0) { + double conv_rate = compute_convergence_rate( + static_cast(this->prev_avg_error), + static_cast(avg_err)); + + this->adaptive_freq = compute_adaptive_freq( + conv_rate, this->optimal_freq, this->nline); + } + + this->prev_avg_error = static_cast(avg_err); + this->steps_since_measure = 0; +} + +template +bool DiagoBPCG::test_error(const ct::Tensor& err_in, const std::vector& ethr_band) +{ + Real* _err_st = err_in.data(); + bool not_conv = false; + std::vector tmp_cpu; + if (err_in.device_type() == ct::DeviceType::GpuDevice) { + // ct::Tensor h_err_in = err_in.to_device(); + // _err_st = h_err_in.data(); + // qianrui change it, because it can not pass the valgrind test + tmp_cpu.resize(this->n_band_l); + _err_st = tmp_cpu.data(); + syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); + } + for (int ii = 0; ii < this->n_band_l; ii++) { + if (_err_st[ii] > ethr_band[ii]) { + not_conv = true; + } + } +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); +#endif + return not_conv; +} + +// Finally, the last one! +template +void DiagoBPCG::line_minimize( + ct::Tensor& grad_in, + ct::Tensor& hgrad_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out) +{ + line_minimize_with_block_op()(grad_in.data(), + hgrad_in.data(), + psi_out.data(), + hpsi_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); +} + + +// Finally, the last two! +template +void DiagoBPCG::orth_cholesky( + ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out) +{ + // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band) + this->pmmcn.multiply(1.0, psi_out.data(), psi_out.data(), 0.0, hsub_out.data()); + + // set hsub matrix to lower format; + ct::kernels::set_matrix()( + 'L', hsub_out.data(), this->n_band); + + ct::kernels::lapack_potrf()( + 'U', this->n_band, hsub_out.data(), this->n_band); + ct::kernels::lapack_trtri()( + 'U', 'N', this->n_band, hsub_out.data(), this->n_band); + + this->rotate_wf(hsub_out, psi_out, workspace_in); + this->rotate_wf(hsub_out, hpsi_out, workspace_in); +} + +template +void DiagoBPCG::calc_grad_with_block( + const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& beta_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + ct::Tensor& grad_old_out) +{ + calc_grad_with_block_op()(prec_in.data(), + err_out.data(), + beta_out.data(), + psi_in.data(), + hpsi_in.data(), + grad_out.data(), + grad_old_out.data(), + this->n_dim, + this->n_basis, + this->n_band_l); +} + +template +void DiagoBPCG::calc_prec() +{ + syncmem_var_h2d_op()(this->prec.template data(), this->h_prec.template data(), this->n_basis); +} + +template +void DiagoBPCG::orth_projection( + const ct::Tensor& psi_in, + ct::Tensor& hsub_in, + ct::Tensor& grad_out) +{ + // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band) + this->pmmcn.multiply(1.0, psi_in.data(), grad_out.data(), 0.0, hsub_in.data()); + + // grad_out(n_basis x n_band) = 1.0 * grad_out(n_basis x n_band) - psi_in(n_basis x n_band) * hsub_in(n_band x + // n_band) + this->plintrans.act(-1.0, psi_in.data(), hsub_in.data(), 1.0, grad_out.data()); + return; +} + +template +void DiagoBPCG::rotate_wf( + const ct::Tensor& hsub_in, + ct::Tensor& psi_out, + ct::Tensor& workspace_in) +{ + // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band) + this->plintrans.act(1.0, psi_out.data(), hsub_in.data(), 0.0, workspace_in.data()); + syncmem_complex_op()(psi_out.template data(), workspace_in.template data(), this->n_band_l * this->n_basis); + + return; +} + +template +void DiagoBPCG::calc_hpsi_with_block( + const HPsiFunc& hpsi_func, + T *psi_in, + ct::Tensor& hpsi_out) +{ + // calculate all-band hpsi + hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); +} + +template +void DiagoBPCG::diag_hsub( + const ct::Tensor& psi_in, + const ct::Tensor& hpsi_in, + ct::Tensor& hsub_out, + ct::Tensor& eigenvalue_out) +{ + // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band) + this->pmmcn.multiply(1.0, hpsi_in.data(), psi_in.data(), 0.0, hsub_out.data()); + + // ct::kernels::lapack_heevd()('V', 'U', hsub_out.data(), this->n_band, eigenvalue_out.data()); + ct::kernels::lapack_heevd()(this->n_band, hsub_out.data(), this->n_band, eigenvalue_out.data()); + + return; +} + +template +void DiagoBPCG::calc_hsub_with_block( + const HPsiFunc& hpsi_func, + T *psi_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out, + ct::Tensor& workspace_in, + ct::Tensor& eigenvalue_out) +{ + // Apply the H operator to psi and obtain the hpsi matrix. + this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out); + + // Diagonalization of the subspace matrix. + this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out); + + // inplace matmul to get the initial guessed wavefunction psi. + // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] + // hpsi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] + this->rotate_wf(hsub_out, psi_out, workspace_in); + this->rotate_wf(hsub_out, hpsi_out, workspace_in); + + return; +} + +template +void DiagoBPCG::calc_hsub_with_block_exit( + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out, + ct::Tensor& workspace_in, + ct::Tensor& eigenvalue_out) +{ + // Diagonalization of the subspace matrix. + this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out); + + // inplace matmul to get the initial guessed wavefunction psi. + // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band] + this->rotate_wf(hsub_out, psi_out, workspace_in); + + return; +} + +template +void DiagoBPCG::diag(const HPsiFunc& hpsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band) +{ + const int current_scf_iter = hsolver::DiagoIterAssist::SCF_ITER; + // Get the pointer of the input psi + this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis})); + + // Update the precondition array + this->calc_prec(); + + // Improving the initial guess of the wave function psi through a subspace diagonalization. + this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); + + setmem_complex_op()(this->grad_old.template data(), 0, this->n_basis * this->n_band_l); + + setmem_var_op()(this->beta.template data(), std::numeric_limits::infinity(), this->n_band_l); + + int ntry = 0; + int max_iter = current_scf_iter > 1 ? + this->nline : + this->nline * 6; + + // Compute optimal frequency for subspace diagonalization based on problem size + this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline); + this->adaptive_freq = this->optimal_freq; + this->prev_avg_error = 0.0; + this->steps_since_measure = 0; + + do + { + ++ntry; + // Be careful here ! dangerous zone! + // 1. normalize psi + // 2. calculate the epsilo + // 3. calculate the gradient by hpsi - epsilo * psi + // 4. gradient mix with the previous gradient + // 5. Do precondition + this->calc_grad_with_block(this->prec, this->err_st, this->beta, + this->psi, this->hpsi, this->grad, this->grad_old); + + // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi + // for all 'j less or equal to i'. + // Note: hsub and work are only used to store intermediate variables of gemm operator. + this->orth_projection(this->psi, this->hsub, this->grad); + + // this->grad_old = this->grad; + syncmem_complex_op()(this->grad_old.template data(), this->grad.template data(), n_basis * n_band_l); + + // Calculate H|grad> matrix + this->calc_hpsi_with_block(hpsi_func, this->grad.template data(), /*this->grad_wrapper[0],*/ this->hgrad); + + // optimize psi as well as the hpsi + // 1. normalize grad + // 2. calculate theta + // 3. update psi as well as hpsi + this->line_minimize(this->grad, this->hgrad, this->psi, this->hpsi); + + // orthogonal psi by cholesky method + this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); + + ++this->steps_since_measure; + if (current_scf_iter == 1 && this->steps_since_measure >= this->measure_interval) { + this->update_adaptive_freq(ethr_band); + } + + if (current_scf_iter == 1 && ntry % this->adaptive_freq == 0) { + this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); + } + } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); + + this->calc_hsub_with_block_exit(this->psi, this->hpsi, this->hsub, this->work, this->eigen); + + int start_nband = 0; +#ifdef __MPI + if (this->plintrans.nproc_col > 1) + { + start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; + } +#endif + syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data() + start_nband, this->n_band_l); + + return; +} + +template class DiagoBPCG, base_device::DEVICE_CPU>; +template class DiagoBPCG, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class DiagoBPCG, base_device::DEVICE_GPU>; +template class DiagoBPCG, base_device::DEVICE_GPU>; +#endif + +} // namespace hsolver diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h index e89a59a5b10..6bd52d902a3 100644 --- a/source/source_hsolver/diago_bpcg.h +++ b/source/source_hsolver/diago_bpcg.h @@ -86,6 +86,14 @@ class DiagoBPCG int nline = 4; /// optimal frequency for subspace diagonalization (computed dynamically) int optimal_freq = 4; + /// current adaptive frequency (may override optimal_freq based on convergence) + int adaptive_freq = 4; + /// previous average error for convergence rate tracking + double prev_avg_error = 0.0; + /// number of iterations since last error measurement + int steps_since_measure = 0; + /// measurement interval (how often to sample error for rate computation) + int measure_interval = 2; /// parallel matrix multiplication ModuleBase::PGemmCN pmmcn; PLinearTransform plintrans; @@ -326,6 +334,17 @@ class DiagoBPCG ct::Tensor& hpsi_out, ct::Tensor& hsub_out); + /** + * @brief Update adaptive frequency based on convergence rate. + * + * Computes the average error from err_st and compares it with + * the previous measurement to determine the convergence rate. + * The adaptive frequency is then computed and applied. + * + * @param ethr_band Per-band error thresholds. + */ + void update_adaptive_freq(const std::vector& ethr_band); + /** * @brief Checks if the error satisfies the given threshold. * diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index e3af30c98af..93d398037a3 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -246,8 +246,42 @@ int compute_optimal_freq(const int n_band, const int n_basis, const int nline) } else if (problem_size >= 10000) { freq = 6; } + // Ensure freq is at least 1 (call at most every step) + if (freq < 1) freq = 1; + return freq; +} + +double compute_convergence_rate(const double prev_avg_err, const double curr_avg_err) +{ + if (prev_avg_err <= 0.0) return 0.0; + if (curr_avg_err <= 0.0) return 0.0; + return curr_avg_err / prev_avg_err; +} + +int compute_adaptive_freq(const double conv_rate, const int base_freq, const int nline) +{ + int freq = base_freq; + + if (conv_rate < 0.3) + { + freq = base_freq + 1; + } + else if (conv_rate < 0.6) + { + freq = base_freq; + } + else if (conv_rate < 0.85) + { + freq = std::max(base_freq - 1, 1); + } + else + { + freq = std::max(base_freq - 2, 1); + } + if (freq < 1) freq = 1; if (freq > nline) freq = nline; + return freq; } diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h index a29819b7877..0ee7b37769f 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.h +++ b/source/source_hsolver/kernels/bpcg_kernel_op.h @@ -18,6 +18,34 @@ namespace hsolver */ int compute_optimal_freq(const int n_band, const int n_basis, const int nline); +/** + * @brief Compute the convergence rate from historical error values. + * + * The convergence rate is computed as the ratio of current average error + * to previous average error. A rate close to 1.0 indicates slow convergence + * (stagnation), while a rate close to 0.0 indicates fast convergence. + * + * @param prev_avg_err Average error from the previous measurement point. + * @param curr_avg_err Average error from the current measurement point. + * @return Convergence rate in range [0.0, 1.0+]. Values > 1.0 indicate divergence. + */ +double compute_convergence_rate(const double prev_avg_err, const double curr_avg_err); + +/** + * @brief Compute adaptive diagonalization frequency based on convergence rate. + * + * Uses the convergence rate to dynamically adjust how often subspace + * diagonalization is performed. Slow convergence triggers more frequent + * diagonalization to reset the search direction, while fast convergence + * reduces the frequency to save computation. + * + * @param conv_rate Convergence rate (curr_avg_err / prev_avg_err). + * @param base_freq Base frequency from problem-size-based computation. + * @param nline Maximum allowed frequency (CG iteration limit). + * @return Adaptive frequency in range [1, nline]. + */ +int compute_adaptive_freq(const double conv_rate, const int base_freq, const int nline); + template struct line_minimize_with_block_op