From a7c9671a093c8df5ef361d8964fc82c0462cc2c6 Mon Sep 17 00:00:00 2001
From: Missing-Hex <1304266750@qq.com>
Date: Fri, 5 Jun 2026 20:52:25 +0800
Subject: [PATCH 1/2] refactor: omtimize subspace diag freq in bpcg

---
 source/source_hsolver/diago_bpcg.cpp          |  6 ++-
 source/source_hsolver/diago_bpcg.h            |  2 +
 .../source_hsolver/kernels/bpcg_kernel_op.cpp | 44 +++++++++++++++++++
 .../source_hsolver/kernels/bpcg_kernel_op.h   | 16 ++++++-
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
index d4db3d790bc..980c9156802 100644
--- a/source/source_hsolver/diago_bpcg.cpp
+++ b/source/source_hsolver/diago_bpcg.cpp
@@ -282,6 +282,10 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
     int max_iter = current_scf_iter > 1 ?
                    this->nline :
                    this->nline * 6;
+
+    // Compute optimal frequency for subspace diagonalization based on problem size
+    this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);
+
     do
     {
         ++ntry;
@@ -314,7 +318,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
         // orthogonal psi by cholesky method
         this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
 
-        if (current_scf_iter == 1 && ntry % this->nline == 0) {
+        if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) {
             this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
         }
     } while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h
index 27f528024ba..e89a59a5b10 100644
--- a/source/source_hsolver/diago_bpcg.h
+++ b/source/source_hsolver/diago_bpcg.h
@@ -84,6 +84,8 @@ class DiagoBPCG
     int n_dim = 0;
     /// max iter steps for all-band cg loop
     int nline = 4;
+    /// optimal frequency for subspace diagonalization (computed dynamically)
+    int optimal_freq = 4;
     /// parallel matrix multiplication
     ModuleBase::PGemmCN<T, Device> pmmcn;
     PLinearTransform<T, Device> plintrans;
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index 88f94e288c6..e3af30c98af 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -207,6 +207,50 @@ struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
     }
 };
 
+
+/**
+ * @brief Compute the optimal frequency for subspace diagonalization calls
+ *        based on problem size. Uses a tiered approach:
+ *        - Small problems: less frequent calls reduce overhead
+ *        - Large problems: more frequent calls maintain orthogonality
+ *
+ * The problem size is computed as n_band * n_basis using size_t to avoid
+ * integer overflow. The frequency is clamped to [1, nline] range and
+ * should not exceed the base iteration count.
+ *
+ * Tier thresholds (problem_size = n_band * n_basis):
+ *   size < 10,000        -> freq = nline     (small, minimal overhead needed)
+ *   size < 100,000       -> freq = min(6, nline)
+ *   size < 500,000       -> freq = min(5, nline)
+ *   size < 2,000,000     -> freq = min(4, nline) (baseline)
+ *   size < 10,000,000    -> freq = min(3, nline)
+ *   size >= 10,000,000   -> freq = min(2, nline) (large, frequent calls)
+ *
+ * @param n_band Number of bands (eigenvectors).
+ * @param n_basis Number of basis functions.
+ * @param nline Base iteration count for SCF > 1.
+ * @return Optimal frequency (number of CG steps between subspace diag calls).
+ */
+int compute_optimal_freq(const int n_band, const int n_basis, const int nline)
+{
+    const size_t problem_size = static_cast<size_t>(n_band) * static_cast<size_t>(n_basis);
+    int freq = nline;
+    if (problem_size >= 10000000) {
+        freq = 2;
+    } else if (problem_size >= 2000000) {
+        freq = 3;
+    } else if (problem_size >= 500000) {
+        freq = 4;
+    } else if (problem_size >= 100000) {
+        freq = 5;
+    } else if (problem_size >= 10000) {
+        freq = 6;
+    }
+    if (freq < 1) freq = 1;
+    if (freq > nline) freq = nline;
+    return freq;
+}
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h
index 9ac7c5e2cee..a29819b7877 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.h
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.h
@@ -3,7 +3,21 @@
 #include "source_base/macros.h"
 #include "source_base/module_device/types.h"
 namespace hsolver
-{
+{/**
+ * @brief Compute the optimal frequency for subspace diagonalization calls
+ *        based on problem size (n_band * n_basis).
+ *
+ * Large problems benefit from more frequent subspace diagonalization to
+ * maintain orthogonality, while small problems can reduce overhead by
+ * calling it less frequently.
+ *
+ * @param n_band Number of bands (eigenvectors).
+ * @param n_basis Number of basis functions.
+ * @param nline Base iteration count for SCF > 1.
+ * @return Optimal frequency (number of CG steps between calc_hsub_with_block calls).
+ */
+int compute_optimal_freq(const int n_band, const int n_basis, const int nline);
+
 
 template <typename T, typename Device>
 struct line_minimize_with_block_op

From 4b5fc5009b94605895ab56cb63e8a66e6897bc4f Mon Sep 17 00:00:00 2001
From: Missing-Hex <1304266750@qq.com>
Date: Sat, 6 Jun 2026 14:09:58 +0800
Subject: [PATCH 2/2] refactor:add adaptive frequency in bpcg

---
 source/source_hsolver/diago_bpcg.cpp          | 737 +++++++++---------
 source/source_hsolver/diago_bpcg.h            |  19 +
 .../source_hsolver/kernels/bpcg_kernel_op.cpp |  34 +
 .../source_hsolver/kernels/bpcg_kernel_op.h   |  28 +
 4 files changed, 471 insertions(+), 347 deletions(-)

diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
index 980c9156802..ed557b8c9e9 100644
--- a/source/source_hsolver/diago_bpcg.cpp
+++ b/source/source_hsolver/diago_bpcg.cpp
@@ -1,347 +1,390 @@
-#include "source_hsolver/diago_bpcg.h"
-
-#include "diago_iter_assist.h"
-#include "source_base/global_function.h"
-#include "source_base/kernels/math_kernel_op.h"
-#include "source_base/parallel_comm.h" // different MPI worlds
-#include "source_hsolver/kernels/bpcg_kernel_op.h"
-#include "para_linear_transform.h"
-
-#include <ATen/kernels/blas.h>
-#include <ATen/kernels/lapack.h>
-#include <ATen/ops/einsum_op.h>
-#include <limits>
-
-namespace hsolver {
-
-template<typename T, typename Device>
-DiagoBPCG<T, Device>::DiagoBPCG(const Real* precondition_in)
-{
-    this->r_type   = ct::DataTypeToEnum<Real>::value;
-    this->t_type   = ct::DataTypeToEnum<T>::value;
-    this->device_type    = ct::DeviceTypeToEnum<Device>::value;
-
-    this->h_prec  = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis}));
-
-    this->one = &one_;
-    this->zero = &zero_;
-    this->neg_one = &neg_one_;
-}
-
-template<typename T, typename Device>
-DiagoBPCG<T, Device>::~DiagoBPCG() {
-    // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) {
-    // Specify the problem size n_basis, n_band, while lda is n_basis
-    this->n_band        = nband;
-    this->n_band_l      = nband_l;
-    this->n_basis       = nbasis;
-    this->n_dim         = ndim;
-
-    // All column major tensors
-
-    this->beta          = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
-    this->eigen         = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
-    this->err_st        = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
-
-    this->hsub          = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band}));
-
-    this->hpsi          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->work          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->hgrad         = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-    this->grad_old      = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-
-    this->prec          = std::move(ct::Tensor(r_type, device_type, {this->n_basis}));
-
-    this->grad          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
-#ifdef __MPI
-    this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
-    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false);
-#else
-    this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
-    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false);
-#endif
-}
-
-template<typename T, typename Device>
-bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
-{
-    Real* _err_st = err_in.data<Real>();
-    bool not_conv = false;
-    std::vector<Real> tmp_cpu;
-    if (err_in.device_type() == ct::DeviceType::GpuDevice) {
-        // ct::Tensor h_err_in = err_in.to_device<ct::DEVICE_CPU>();
-        // _err_st = h_err_in.data<Real>();
-        // qianrui change it, because it can not pass the valgrind test
-        tmp_cpu.resize(this->n_band_l);
-        _err_st = tmp_cpu.data();
-        syncmem_var_d2h_op()(_err_st, err_in.data<Real>(), this->n_band_l);
-    }
-    for (int ii = 0; ii < this->n_band_l; ii++) {
-        if (_err_st[ii] > ethr_band[ii]) {
-            not_conv = true;
-        }
-    }
-#ifdef __MPI
-    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
-#endif
-    return not_conv;
-}
-
-// Finally, the last one!
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::line_minimize(
-    ct::Tensor& grad_in,
-    ct::Tensor& hgrad_in,
-    ct::Tensor& psi_out,
-    ct::Tensor& hpsi_out)
-{
-    line_minimize_with_block_op<T, Device>()(grad_in.data<T>(),
-                                             hgrad_in.data<T>(),
-                                             psi_out.data<T>(),
-                                             hpsi_out.data<T>(),
-                                             this->n_dim,
-                                             this->n_basis,
-                                             this->n_band_l);
-}
-
-
-// Finally, the last two!
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::orth_cholesky(
-		ct::Tensor& workspace_in,
-		ct::Tensor& psi_out,
-		ct::Tensor& hpsi_out,
-		ct::Tensor& hsub_out)
-{
-    // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band)
-    this->pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, hsub_out.data<T>());
-
-    // set hsub matrix to lower format;
-    ct::kernels::set_matrix<T, ct_Device>()(
-        'L', hsub_out.data<T>(), this->n_band);
-
-    ct::kernels::lapack_potrf<T, ct_Device>()(
-        'U', this->n_band, hsub_out.data<T>(), this->n_band);
-    ct::kernels::lapack_trtri<T, ct_Device>()(
-        'U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);
-
-    this->rotate_wf(hsub_out, psi_out, workspace_in);
-    this->rotate_wf(hsub_out, hpsi_out, workspace_in);
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_grad_with_block(
-        const ct::Tensor& prec_in,
-        ct::Tensor& err_out,
-        ct::Tensor& beta_out,
-        ct::Tensor& psi_in,
-        ct::Tensor& hpsi_in,
-        ct::Tensor& grad_out,
-        ct::Tensor& grad_old_out)
-{
-    calc_grad_with_block_op<T, Device>()(prec_in.data<Real>(),
-                                         err_out.data<Real>(),
-                                         beta_out.data<Real>(),
-                                         psi_in.data<T>(),
-                                         hpsi_in.data<T>(),
-                                         grad_out.data<T>(),
-                                         grad_old_out.data<T>(),
-                                         this->n_dim,
-                                         this->n_basis,
-                                         this->n_band_l);
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_prec()
-{
-    syncmem_var_h2d_op()(this->prec.template data<Real>(), this->h_prec.template data<Real>(), this->n_basis);
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::orth_projection(
-        const ct::Tensor& psi_in,
-        ct::Tensor& hsub_in,
-        ct::Tensor& grad_out)
-{
-    // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
-    this->pmmcn.multiply(1.0, psi_in.data<T>(), grad_out.data<T>(), 0.0, hsub_in.data<T>());
-
-    // grad_out(n_basis x n_band) = 1.0 * grad_out(n_basis x n_band) - psi_in(n_basis x n_band) * hsub_in(n_band x
-    // n_band)
-    this->plintrans.act(-1.0, psi_in.data<T>(), hsub_in.data<T>(), 1.0, grad_out.data<T>());
-    return;
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::rotate_wf(
-        const ct::Tensor& hsub_in,
-        ct::Tensor& psi_out,
-        ct::Tensor& workspace_in)
-{
-    // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
-    this->plintrans.act(1.0, psi_out.data<T>(), hsub_in.data<T>(), 0.0, workspace_in.data<T>());
-    syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band_l * this->n_basis);
-
-    return;
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hpsi_with_block(
-        const HPsiFunc& hpsi_func,
-        T *psi_in,
-        ct::Tensor& hpsi_out)
-{
-    // calculate all-band hpsi
-    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::diag_hsub(
-        const ct::Tensor& psi_in,
-        const ct::Tensor& hpsi_in,
-        ct::Tensor& hsub_out,
-        ct::Tensor& eigenvalue_out)
-{
-    // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band)
-    this->pmmcn.multiply(1.0, hpsi_in.data<T>(), psi_in.data<T>(), 0.0, hsub_out.data<T>());
-
-    // ct::kernels::lapack_heevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
-    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band, hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
-
-    return;
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hsub_with_block(
-        const HPsiFunc& hpsi_func,
-        T *psi_in,
-        ct::Tensor& psi_out,
-        ct::Tensor& hpsi_out,
-        ct::Tensor& hsub_out,
-        ct::Tensor& workspace_in,
-        ct::Tensor& eigenvalue_out)
-{
-    // Apply the H operator to psi and obtain the hpsi matrix.
-    this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out);
-
-    // Diagonalization of the subspace matrix.
-    this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out);
-
-    // inplace matmul to get the initial guessed wavefunction psi.
-    // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
-    // hpsi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
-    this->rotate_wf(hsub_out, psi_out, workspace_in);
-    this->rotate_wf(hsub_out, hpsi_out, workspace_in);
-
-    return;
-}
-
-template<typename T, typename Device>
-void DiagoBPCG<T, Device>::calc_hsub_with_block_exit(
-        ct::Tensor& psi_out,
-        ct::Tensor& hpsi_out,
-        ct::Tensor& hsub_out,
-        ct::Tensor& workspace_in,
-        ct::Tensor& eigenvalue_out)
-{
-    // Diagonalization of the subspace matrix.
-    this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out);
-
-    // inplace matmul to get the initial guessed wavefunction psi.
-    // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
-    this->rotate_wf(hsub_out, psi_out, workspace_in);
-
-    return;
-}
-
-template <typename T, typename Device>
-void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
-                                T* psi_in,
-                                Real* eigenvalue_in,
-                                const std::vector<double>& ethr_band)
-{
-    const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
-    // Get the pointer of the input psi
-    this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));
-
-    // Update the precondition array
-    this->calc_prec();
-
-    // Improving the initial guess of the wave function psi through a subspace diagonalization.
-    this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
-
-    setmem_complex_op()(this->grad_old.template data<T>(), 0, this->n_basis * this->n_band_l);
-
-    setmem_var_op()(this->beta.template data<Real>(), std::numeric_limits<Real>::infinity(), this->n_band_l);
-
-    int ntry = 0;
-    int max_iter = current_scf_iter > 1 ?
-                   this->nline :
-                   this->nline * 6;
-
-    // Compute optimal frequency for subspace diagonalization based on problem size
-    this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);
-
-    do
-    {
-        ++ntry;
-        // Be careful here ! dangerous zone!
-        // 1. normalize psi
-        // 2. calculate the epsilo
-        // 3. calculate the gradient by hpsi - epsilo * psi
-        // 4. gradient mix with the previous gradient
-        // 5. Do precondition
-        this->calc_grad_with_block(this->prec, this->err_st, this->beta,
-                                 this->psi, this->hpsi, this->grad, this->grad_old);
-
-        // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi
-        // for all 'j less or equal to i'.
-        // Note: hsub and work are only used to store intermediate variables of gemm operator.
-        this->orth_projection(this->psi, this->hsub, this->grad);
-
-        // this->grad_old = this->grad;
-        syncmem_complex_op()(this->grad_old.template data<T>(), this->grad.template data<T>(), n_basis * n_band_l);
-
-        // Calculate H|grad> matrix
-        this->calc_hpsi_with_block(hpsi_func, this->grad.template data<T>(), /*this->grad_wrapper[0],*/ this->hgrad);
-
-        // optimize psi as well as the hpsi
-        // 1. normalize grad
-        // 2. calculate theta
-        // 3. update psi as well as hpsi
-        this->line_minimize(this->grad, this->hgrad, this->psi, this->hpsi);
-
-        // orthogonal psi by cholesky method
-        this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
-
-        if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) {
-            this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
-        }
-    } while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
-
-    this->calc_hsub_with_block_exit(this->psi, this->hpsi, this->hsub, this->work, this->eigen);
-
-    int start_nband = 0;
-#ifdef __MPI
-    if (this->plintrans.nproc_col > 1)
-    {
-        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
-    }
-#endif
-    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data<Real>() + start_nband, this->n_band_l);
-
-    return;
-}
-
-template class DiagoBPCG<std::complex<float>, base_device::DEVICE_CPU>;
-template class DiagoBPCG<std::complex<double>, base_device::DEVICE_CPU>;
-#if ((defined __CUDA) || (defined __ROCM))
-template class DiagoBPCG<std::complex<float>, base_device::DEVICE_GPU>;
-template class DiagoBPCG<std::complex<double>, base_device::DEVICE_GPU>;
-#endif
-
-} // namespace hsolver
+#include "source_hsolver/diago_bpcg.h"
+
+#include "diago_iter_assist.h"
+#include "source_base/global_function.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/parallel_comm.h" // different MPI worlds
+#include "source_base/parallel_reduce.h"
+#include "source_hsolver/kernels/bpcg_kernel_op.h"
+#include "para_linear_transform.h"
+
+#include <ATen/kernels/blas.h>
+#include <ATen/kernels/lapack.h>
+#include <ATen/ops/einsum_op.h>
+#include <limits>
+
+namespace hsolver {
+
+template<typename T, typename Device>
+DiagoBPCG<T, Device>::DiagoBPCG(const Real* precondition_in)
+{
+    this->r_type   = ct::DataTypeToEnum<Real>::value;
+    this->t_type   = ct::DataTypeToEnum<T>::value;
+    this->device_type    = ct::DeviceTypeToEnum<Device>::value;
+
+    this->h_prec  = std::move(ct::TensorMap((void *) precondition_in, r_type, device_type, {this->n_basis}));
+
+    this->one = &one_;
+    this->zero = &zero_;
+    this->neg_one = &neg_one_;
+}
+
+template<typename T, typename Device>
+DiagoBPCG<T, Device>::~DiagoBPCG() {
+    // Note, we do not need to free the h_prec and psi pointer as they are refs to the outside data
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) {
+    // Specify the problem size n_basis, n_band, while lda is n_basis
+    this->n_band        = nband;
+    this->n_band_l      = nband_l;
+    this->n_basis       = nbasis;
+    this->n_dim         = ndim;
+
+    // All column major tensors
+
+    this->beta          = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
+    this->eigen         = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
+    this->err_st        = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
+
+    this->hsub          = std::move(ct::Tensor(t_type, device_type, {this->n_band, this->n_band}));
+
+    this->hpsi          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->work          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hgrad         = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->grad_old      = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+
+    this->prec          = std::move(ct::Tensor(r_type, device_type, {this->n_basis}));
+
+    this->grad          = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false);
+#else
+    this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false);
+#endif
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::update_adaptive_freq(const std::vector<double>& ethr_band)
+{
+    Real* _err_st = this->err_st.data<Real>();
+    std::vector<Real> tmp_cpu;
+    if (this->err_st.device_type() == ct::DeviceType::GpuDevice) {
+        tmp_cpu.resize(this->n_band_l);
+        _err_st = tmp_cpu.data();
+        syncmem_var_d2h_op()(_err_st, this->err_st.template data<Real>(), this->n_band_l);
+    }
+
+    Real avg_err = 0.0;
+    for (int ii = 0; ii < this->n_band_l; ii++) {
+        avg_err += _err_st[ii];
+    }
+    avg_err /= this->n_band_l;
+
+#ifdef __MPI
+    Parallel_Reduce::reduce_pool(avg_err);
+#endif
+
+    if (this->prev_avg_error > 0.0) {
+        double conv_rate = compute_convergence_rate(
+            static_cast<double>(this->prev_avg_error),
+            static_cast<double>(avg_err));
+
+        this->adaptive_freq = compute_adaptive_freq(
+            conv_rate, this->optimal_freq, this->nline);
+    }
+
+    this->prev_avg_error = static_cast<double>(avg_err);
+    this->steps_since_measure = 0;
+}
+
+template<typename T, typename Device>
+bool DiagoBPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    Real* _err_st = err_in.data<Real>();
+    bool not_conv = false;
+    std::vector<Real> tmp_cpu;
+    if (err_in.device_type() == ct::DeviceType::GpuDevice) {
+        // ct::Tensor h_err_in = err_in.to_device<ct::DEVICE_CPU>();
+        // _err_st = h_err_in.data<Real>();
+        // qianrui change it, because it can not pass the valgrind test
+        tmp_cpu.resize(this->n_band_l);
+        _err_st = tmp_cpu.data();
+        syncmem_var_d2h_op()(_err_st, err_in.data<Real>(), this->n_band_l);
+    }
+    for (int ii = 0; ii < this->n_band_l; ii++) {
+        if (_err_st[ii] > ethr_band[ii]) {
+            not_conv = true;
+        }
+    }
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+#endif
+    return not_conv;
+}
+
+// Finally, the last one!
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::line_minimize(
+    ct::Tensor& grad_in,
+    ct::Tensor& hgrad_in,
+    ct::Tensor& psi_out,
+    ct::Tensor& hpsi_out)
+{
+    line_minimize_with_block_op<T, Device>()(grad_in.data<T>(),
+                                             hgrad_in.data<T>(),
+                                             psi_out.data<T>(),
+                                             hpsi_out.data<T>(),
+                                             this->n_dim,
+                                             this->n_basis,
+                                             this->n_band_l);
+}
+
+
+// Finally, the last two!
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::orth_cholesky(
+		ct::Tensor& workspace_in,
+		ct::Tensor& psi_out,
+		ct::Tensor& hpsi_out,
+		ct::Tensor& hsub_out)
+{
+    // gemm: hsub_out(n_band x n_band) = psi_out^T(n_band x n_basis) * psi_out(n_basis x n_band)
+    this->pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, hsub_out.data<T>());
+
+    // set hsub matrix to lower format;
+    ct::kernels::set_matrix<T, ct_Device>()(
+        'L', hsub_out.data<T>(), this->n_band);
+
+    ct::kernels::lapack_potrf<T, ct_Device>()(
+        'U', this->n_band, hsub_out.data<T>(), this->n_band);
+    ct::kernels::lapack_trtri<T, ct_Device>()(
+        'U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);
+
+    this->rotate_wf(hsub_out, psi_out, workspace_in);
+    this->rotate_wf(hsub_out, hpsi_out, workspace_in);
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_grad_with_block(
+        const ct::Tensor& prec_in,
+        ct::Tensor& err_out,
+        ct::Tensor& beta_out,
+        ct::Tensor& psi_in,
+        ct::Tensor& hpsi_in,
+        ct::Tensor& grad_out,
+        ct::Tensor& grad_old_out)
+{
+    calc_grad_with_block_op<T, Device>()(prec_in.data<Real>(),
+                                         err_out.data<Real>(),
+                                         beta_out.data<Real>(),
+                                         psi_in.data<T>(),
+                                         hpsi_in.data<T>(),
+                                         grad_out.data<T>(),
+                                         grad_old_out.data<T>(),
+                                         this->n_dim,
+                                         this->n_basis,
+                                         this->n_band_l);
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_prec()
+{
+    syncmem_var_h2d_op()(this->prec.template data<Real>(), this->h_prec.template data<Real>(), this->n_basis);
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::orth_projection(
+        const ct::Tensor& psi_in,
+        ct::Tensor& hsub_in,
+        ct::Tensor& grad_out)
+{
+    // gemm: hsub_in(n_band x n_band) = psi_in^T(n_band x n_basis) * grad_out(n_basis x n_band)
+    this->pmmcn.multiply(1.0, psi_in.data<T>(), grad_out.data<T>(), 0.0, hsub_in.data<T>());
+
+    // grad_out(n_basis x n_band) = 1.0 * grad_out(n_basis x n_band) - psi_in(n_basis x n_band) * hsub_in(n_band x
+    // n_band)
+    this->plintrans.act(-1.0, psi_in.data<T>(), hsub_in.data<T>(), 1.0, grad_out.data<T>());
+    return;
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::rotate_wf(
+        const ct::Tensor& hsub_in,
+        ct::Tensor& psi_out,
+        ct::Tensor& workspace_in)
+{
+    // gemm: workspace_in(n_basis x n_band) = psi_out(n_basis x n_band) * hsub_in(n_band x n_band)
+    this->plintrans.act(1.0, psi_out.data<T>(), hsub_in.data<T>(), 0.0, workspace_in.data<T>());
+    syncmem_complex_op()(psi_out.template data<T>(), workspace_in.template data<T>(), this->n_band_l * this->n_basis);
+
+    return;
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hpsi_with_block(
+        const HPsiFunc& hpsi_func,
+        T *psi_in,
+        ct::Tensor& hpsi_out)
+{
+    // calculate all-band hpsi
+    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::diag_hsub(
+        const ct::Tensor& psi_in,
+        const ct::Tensor& hpsi_in,
+        ct::Tensor& hsub_out,
+        ct::Tensor& eigenvalue_out)
+{
+    // gemm: hsub_out(n_band x n_band) = hpsi_in^T(n_band x n_basis) * psi_in(n_basis x n_band)
+    this->pmmcn.multiply(1.0, hpsi_in.data<T>(), psi_in.data<T>(), 0.0, hsub_out.data<T>());
+
+    // ct::kernels::lapack_heevd<T, ct_Device>()('V', 'U', hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band, hsub_out.data<T>(), this->n_band, eigenvalue_out.data<Real>());
+
+    return;
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hsub_with_block(
+        const HPsiFunc& hpsi_func,
+        T *psi_in,
+        ct::Tensor& psi_out,
+        ct::Tensor& hpsi_out,
+        ct::Tensor& hsub_out,
+        ct::Tensor& workspace_in,
+        ct::Tensor& eigenvalue_out)
+{
+    // Apply the H operator to psi and obtain the hpsi matrix.
+    this->calc_hpsi_with_block(hpsi_func, psi_in, hpsi_out);
+
+    // Diagonalization of the subspace matrix.
+    this->diag_hsub(psi_out,hpsi_out, hsub_out, eigenvalue_out);
+
+    // inplace matmul to get the initial guessed wavefunction psi.
+    // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
+    // hpsi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
+    this->rotate_wf(hsub_out, psi_out, workspace_in);
+    this->rotate_wf(hsub_out, hpsi_out, workspace_in);
+
+    return;
+}
+
+template<typename T, typename Device>
+void DiagoBPCG<T, Device>::calc_hsub_with_block_exit(
+        ct::Tensor& psi_out,
+        ct::Tensor& hpsi_out,
+        ct::Tensor& hsub_out,
+        ct::Tensor& workspace_in,
+        ct::Tensor& eigenvalue_out)
+{
+    // Diagonalization of the subspace matrix.
+    this->diag_hsub(psi_out, hpsi_out, hsub_out, eigenvalue_out);
+
+    // inplace matmul to get the initial guessed wavefunction psi.
+    // psi_out[n_basis, n_band] = psi_out[n_basis, n_band] x hsub_out[n_band, n_band]
+    this->rotate_wf(hsub_out, psi_out, workspace_in);
+
+    return;
+}
+
+template <typename T, typename Device>
+void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
+                                T* psi_in,
+                                Real* eigenvalue_in,
+                                const std::vector<double>& ethr_band)
+{
+    const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
+    // Get the pointer of the input psi
+    this->psi = std::move(ct::TensorMap(psi_in /*psi_in.get_pointer()*/, t_type, device_type, {this->n_band_l, this->n_basis}));
+
+    // Update the precondition array
+    this->calc_prec();
+
+    // Improving the initial guess of the wave function psi through a subspace diagonalization.
+    this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
+
+    setmem_complex_op()(this->grad_old.template data<T>(), 0, this->n_basis * this->n_band_l);
+
+    setmem_var_op()(this->beta.template data<Real>(), std::numeric_limits<Real>::infinity(), this->n_band_l);
+
+    int ntry = 0;
+    int max_iter = current_scf_iter > 1 ?
+                   this->nline :
+                   this->nline * 6;
+
+    // Compute optimal frequency for subspace diagonalization based on problem size
+    this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);
+    this->adaptive_freq = this->optimal_freq;
+    this->prev_avg_error = 0.0;
+    this->steps_since_measure = 0;
+
+    do
+    {
+        ++ntry;
+        // Be careful here ! dangerous zone!
+        // 1. normalize psi
+        // 2. calculate the epsilo
+        // 3. calculate the gradient by hpsi - epsilo * psi
+        // 4. gradient mix with the previous gradient
+        // 5. Do precondition
+        this->calc_grad_with_block(this->prec, this->err_st, this->beta,
+                                 this->psi, this->hpsi, this->grad, this->grad_old);
+
+        // Orthogonalize column vectors g_i in matrix grad to column vectors p_j in matrix psi
+        // for all 'j less or equal to i'.
+        // Note: hsub and work are only used to store intermediate variables of gemm operator.
+        this->orth_projection(this->psi, this->hsub, this->grad);
+
+        // this->grad_old = this->grad;
+        syncmem_complex_op()(this->grad_old.template data<T>(), this->grad.template data<T>(), n_basis * n_band_l);
+
+        // Calculate H|grad> matrix
+        this->calc_hpsi_with_block(hpsi_func, this->grad.template data<T>(), /*this->grad_wrapper[0],*/ this->hgrad);
+
+        // optimize psi as well as the hpsi
+        // 1. normalize grad
+        // 2. calculate theta
+        // 3. update psi as well as hpsi
+        this->line_minimize(this->grad, this->hgrad, this->psi, this->hpsi);
+
+        // orthogonal psi by cholesky method
+        this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
+
+        ++this->steps_since_measure;
+        if (current_scf_iter == 1 && this->steps_since_measure >= this->measure_interval) {
+            this->update_adaptive_freq(ethr_band);
+        }
+
+        if (current_scf_iter == 1 && ntry % this->adaptive_freq == 0) {
+            this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
+        }
+    } while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
+
+    this->calc_hsub_with_block_exit(this->psi, this->hpsi, this->hsub, this->work, this->eigen);
+
+    int start_nband = 0;
+#ifdef __MPI
+    if (this->plintrans.nproc_col > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.template data<Real>() + start_nband, this->n_band_l);
+
+    return;
+}
+
+template class DiagoBPCG<std::complex<float>, base_device::DEVICE_CPU>;
+template class DiagoBPCG<std::complex<double>, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class DiagoBPCG<std::complex<float>, base_device::DEVICE_GPU>;
+template class DiagoBPCG<std::complex<double>, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hsolver
diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h
index e89a59a5b10..6bd52d902a3 100644
--- a/source/source_hsolver/diago_bpcg.h
+++ b/source/source_hsolver/diago_bpcg.h
@@ -86,6 +86,14 @@ class DiagoBPCG
     int nline = 4;
     /// optimal frequency for subspace diagonalization (computed dynamically)
     int optimal_freq = 4;
+    /// current adaptive frequency (may override optimal_freq based on convergence)
+    int adaptive_freq = 4;
+    /// previous average error for convergence rate tracking
+    double prev_avg_error = 0.0;
+    /// number of iterations since last error measurement
+    int steps_since_measure = 0;
+    /// measurement interval (how often to sample error for rate computation)
+    int measure_interval = 2;
     /// parallel matrix multiplication
     ModuleBase::PGemmCN<T, Device> pmmcn;
     PLinearTransform<T, Device> plintrans;
@@ -326,6 +334,17 @@ class DiagoBPCG
         ct::Tensor& hpsi_out, 
         ct::Tensor& hsub_out);
 
+    /**
+     * @brief Update adaptive frequency based on convergence rate.
+     *
+     * Computes the average error from err_st and compares it with
+     * the previous measurement to determine the convergence rate.
+     * The adaptive frequency is then computed and applied.
+     *
+     * @param ethr_band Per-band error thresholds.
+     */
+    void update_adaptive_freq(const std::vector<double>& ethr_band);
+
     /**
      * @brief Checks if the error satisfies the given threshold.
      *
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index e3af30c98af..93d398037a3 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -246,8 +246,42 @@ int compute_optimal_freq(const int n_band, const int n_basis, const int nline)
     } else if (problem_size >= 10000) {
         freq = 6;
     }
+    // Ensure freq is at least 1 (call at most every step)
+    if (freq < 1) freq = 1;
+    return freq;
+}
+
+double compute_convergence_rate(const double prev_avg_err, const double curr_avg_err)
+{
+    if (prev_avg_err <= 0.0) return 0.0;
+    if (curr_avg_err <= 0.0) return 0.0;
+    return curr_avg_err / prev_avg_err;
+}
+
+int compute_adaptive_freq(const double conv_rate, const int base_freq, const int nline)
+{
+    int freq = base_freq;
+
+    if (conv_rate < 0.3)
+    {
+        freq = base_freq + 1;
+    }
+    else if (conv_rate < 0.6)
+    {
+        freq = base_freq;
+    }
+    else if (conv_rate < 0.85)
+    {
+        freq = std::max(base_freq - 1, 1);
+    }
+    else
+    {
+        freq = std::max(base_freq - 2, 1);
+    }
+
     if (freq < 1) freq = 1;
     if (freq > nline) freq = nline;
+
     return freq;
 }
 
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h
index a29819b7877..0ee7b37769f 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.h
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.h
@@ -18,6 +18,34 @@ namespace hsolver
  */
 int compute_optimal_freq(const int n_band, const int n_basis, const int nline);
 
+/**
+ * @brief Compute the convergence rate from historical error values.
+ *
+ * The convergence rate is computed as the ratio of current average error
+ * to previous average error. A rate close to 1.0 indicates slow convergence
+ * (stagnation), while a rate close to 0.0 indicates fast convergence.
+ *
+ * @param prev_avg_err Average error from the previous measurement point.
+ * @param curr_avg_err Average error from the current measurement point.
+ * @return Convergence rate in range [0.0, 1.0+]. Values > 1.0 indicate divergence.
+ */
+double compute_convergence_rate(const double prev_avg_err, const double curr_avg_err);
+
+/**
+ * @brief Compute adaptive diagonalization frequency based on convergence rate.
+ *
+ * Uses the convergence rate to dynamically adjust how often subspace
+ * diagonalization is performed. Slow convergence triggers more frequent
+ * diagonalization to reset the search direction, while fast convergence
+ * reduces the frequency to save computation.
+ *
+ * @param conv_rate Convergence rate (curr_avg_err / prev_avg_err).
+ * @param base_freq Base frequency from problem-size-based computation.
+ * @param nline Maximum allowed frequency (CG iteration limit).
+ * @return Adaptive frequency in range [1, nline].
+ */
+int compute_adaptive_freq(const double conv_rate, const int base_freq, const int nline);
+
 
 template <typename T, typename Device>
 struct line_minimize_with_block_op