diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp
index d4db3d790bc..980c9156802 100644
--- a/source/source_hsolver/diago_bpcg.cpp
+++ b/source/source_hsolver/diago_bpcg.cpp
@@ -282,6 +282,10 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
     int max_iter = current_scf_iter > 1 ?
                    this->nline :
                    this->nline * 6;
+
+    // Compute optimal frequency for subspace diagonalization based on problem size
+    this->optimal_freq = compute_optimal_freq(this->n_band, this->n_basis, this->nline);
+
     do
     {
         ++ntry;
@@ -314,7 +318,7 @@ void DiagoBPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
         // orthogonal psi by cholesky method
         this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub);
 
-        if (current_scf_iter == 1 && ntry % this->nline == 0) {
+        if (current_scf_iter == 1 && ntry % this->optimal_freq == 0) {
             this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen);
         }
     } while (ntry < max_iter && this->test_error(this->err_st, ethr_band));
diff --git a/source/source_hsolver/diago_bpcg.h b/source/source_hsolver/diago_bpcg.h
index 27f528024ba..e89a59a5b10 100644
--- a/source/source_hsolver/diago_bpcg.h
+++ b/source/source_hsolver/diago_bpcg.h
@@ -84,6 +84,8 @@ class DiagoBPCG
     int n_dim = 0;
     /// max iter steps for all-band cg loop
     int nline = 4;
+    /// optimal frequency for subspace diagonalization (computed dynamically)
+    int optimal_freq = 4;
     /// parallel matrix multiplication
     ModuleBase::PGemmCN<T, Device> pmmcn;
     PLinearTransform<T, Device> plintrans;
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index 88f94e288c6..ce9f1d62962 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -18,29 +18,75 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
                     const int& n_basis_max,
                     const int& n_band)
     {
+        // OpenMP parallelization over bands: each band accesses a disjoint memory region
+        // [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur.
+        // MPI collective calls (reduce_pool) use per-band scalar reductions executed serially
+        // outside parallel regions to avoid thread-safety issues with MPI.
+        // Static scheduling is used since each band has equal workload (n_basis).
+        std::vector<Real> norms(n_band);
+        std::vector<Real> epsilo_0_arr(n_band);
+        std::vector<Real> epsilo_1_arr(n_band);
+        std::vector<Real> epsilo_2_arr(n_band);
+
+        // Phase 1: parallel computation of per-band norms via BLAS dot
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
         for (int band_idx = 0; band_idx < n_band; band_idx++)
         {
-            Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0;
-            Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0;
             auto A = reinterpret_cast<const Real*>(grad_out + band_idx * n_basis_max);
-            Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
-            Parallel_Reduce::reduce_pool(norm);
-            norm = 1.0 / sqrt(norm);
+            norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+        }
+
+        // Phase 2: MPI reduction of norms across pool, then invert
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Parallel_Reduce::reduce_pool(norms[band_idx]);
+            norms[band_idx] = 1.0 / sqrt(norms[band_idx]);
+        }
+
+        // Phase 3: parallel normalization of grad/hgrad and accumulation of epsilons
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real eps_0 = 0.0, eps_1 = 0.0, eps_2 = 0.0;
+            Real norm = norms[band_idx];
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
                 grad_out[item] *= norm;
                 hgrad_out[item] *= norm;
-                epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
-                epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
-                epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
+                eps_0 += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+                eps_1 += std::real(grad_out[item] * std::conj(hpsi_out[item]));
+                eps_2 += std::real(grad_out[item] * std::conj(hgrad_out[item]));
             }
-            Parallel_Reduce::reduce_pool(epsilo_0);
-            Parallel_Reduce::reduce_pool(epsilo_1);
-            Parallel_Reduce::reduce_pool(epsilo_2);
-            theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
-            cos_theta = std::cos(theta);
-            sin_theta = std::sin(theta);
+            epsilo_0_arr[band_idx] = eps_0;
+            epsilo_1_arr[band_idx] = eps_1;
+            epsilo_2_arr[band_idx] = eps_2;
+        }
+
+        // Phase 4: MPI reduction of epsilons across pool
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Parallel_Reduce::reduce_pool(epsilo_0_arr[band_idx]);
+            Parallel_Reduce::reduce_pool(epsilo_1_arr[band_idx]);
+            Parallel_Reduce::reduce_pool(epsilo_2_arr[band_idx]);
+        }
+
+        // Phase 5: parallel application of rotation to psi and hpsi
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real epsilo_0 = epsilo_0_arr[band_idx];
+            Real epsilo_1 = epsilo_1_arr[band_idx];
+            Real epsilo_2 = epsilo_2_arr[band_idx];
+            Real theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
+            Real cos_theta = std::cos(theta);
+            Real sin_theta = std::sin(theta);
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -66,43 +112,101 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                     const int& n_basis_max,
                     const int& n_band)
     {
+        // OpenMP parallelization over bands: each band accesses a disjoint memory region
+        // [band_idx * n_basis_max, (band_idx+1) * n_basis_max), so no data races occur.
+        // MPI collective calls (reduce_pool) use per-band scalar reductions executed serially
+        // outside parallel regions to avoid thread-safety issues with MPI.
+        // Static scheduling is used since each band has equal workload (n_basis).
+        std::vector<Real> norms(n_band);
+        std::vector<Real> epsilo_arr(n_band);
+        std::vector<Real> err_arr(n_band);
+        std::vector<Real> beta_arr(n_band);
+
+        // Phase 1: parallel computation of per-band norms via BLAS dot
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
         for (int band_idx = 0; band_idx < n_band; band_idx++)
         {
-            Real err = 0.0;
-            Real beta = 0.0;
-            Real epsilo = 0.0;
-            Real grad_2 = {0.0};
-            T grad_1 = {0.0, 0.0};
             auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
-            Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
-            Parallel_Reduce::reduce_pool(norm);
-            norm = 1.0 / sqrt(norm);
+            norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
+        }
+
+        // Phase 2: MPI reduction of norms across pool, then invert
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Parallel_Reduce::reduce_pool(norms[band_idx]);
+            norms[band_idx] = 1.0 / sqrt(norms[band_idx]);
+        }
+
+        // Phase 3: parallel normalization of psi/hpsi and accumulation of epsilo
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real eps = 0.0;
+            Real norm = norms[band_idx];
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
                 psi_out[item] *= norm;
                 hpsi_out[item] *= norm;
-                epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
+                eps += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
-            Parallel_Reduce::reduce_pool(epsilo);
+            epsilo_arr[band_idx] = eps;
+        }
+
+        // Phase 4: MPI reduction of epsilons across pool
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Parallel_Reduce::reduce_pool(epsilo_arr[band_idx]);
+        }
+
+        // Phase 5: parallel computation of per-band err and beta
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real err = 0.0;
+            Real beta = 0.0;
+            Real epsilo = epsilo_arr[band_idx];
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
-                grad_2 = std::norm(grad_1);
+                T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                Real grad_2 = std::norm(grad_1);
                 err += grad_2;
-                beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
+                beta += grad_2 / prec_in[basis_idx];
             }
-            Parallel_Reduce::reduce_pool(err);
-            Parallel_Reduce::reduce_pool(beta);
+            err_arr[band_idx] = err;
+            beta_arr[band_idx] = beta;
+        }
+
+        // Phase 6: MPI reduction of err and beta across pool
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Parallel_Reduce::reduce_pool(err_arr[band_idx]);
+            Parallel_Reduce::reduce_pool(beta_arr[band_idx]);
+        }
+
+        // Phase 7: parallel update of gradient and write output arrays
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int band_idx = 0; band_idx < n_band; band_idx++)
+        {
+            Real epsilo = epsilo_arr[band_idx];
+            Real beta = beta_arr[band_idx];
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
                 grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
             }
             beta_out[band_idx] = beta;
-            err_out[band_idx] = sqrt(err);
+            err_out[band_idx] = sqrt(err_arr[band_idx]);
         }
     }
 };
@@ -207,6 +311,50 @@ struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
     }
 };
 
+
+/**
+ * @brief Compute the optimal frequency for subspace diagonalization calls
+ *        based on problem size. Uses a tiered approach:
+ *        - Small problems: less frequent calls reduce overhead
+ *        - Large problems: more frequent calls maintain orthogonality
+ *
+ * The problem size is computed as n_band * n_basis using size_t to avoid
+ * integer overflow. The frequency is clamped to [1, nline] range and
+ * should not exceed the base iteration count.
+ *
+ * Tier thresholds (problem_size = n_band * n_basis):
+ *   size < 10,000        -> freq = nline     (small, minimal overhead needed)
+ *   size < 100,000       -> freq = min(6, nline)
+ *   size < 500,000       -> freq = min(5, nline)
+ *   size < 2,000,000     -> freq = min(4, nline) (baseline)
+ *   size < 10,000,000    -> freq = min(3, nline)
+ *   size >= 10,000,000   -> freq = min(2, nline) (large, frequent calls)
+ *
+ * @param n_band Number of bands (eigenvectors).
+ * @param n_basis Number of basis functions.
+ * @param nline Base iteration count for SCF > 1.
+ * @return Optimal frequency (number of CG steps between subspace diag calls).
+ */
+int compute_optimal_freq(const int n_band, const int n_basis, const int nline)
+{
+    const size_t problem_size = static_cast<size_t>(n_band) * static_cast<size_t>(n_basis);
+    int freq = nline;
+    if (problem_size >= 10000000) {
+        freq = 2;
+    } else if (problem_size >= 2000000) {
+        freq = 3;
+    } else if (problem_size >= 500000) {
+        freq = 4;
+    } else if (problem_size >= 100000) {
+        freq = 5;
+    } else if (problem_size >= 10000) {
+        freq = 6;
+    }
+    if (freq < 1) freq = 1;
+    if (freq > nline) freq = nline;
+    return freq;
+}
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h
index 9ac7c5e2cee..a29819b7877 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.h
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.h
@@ -3,7 +3,21 @@
 #include "source_base/macros.h"
 #include "source_base/module_device/types.h"
 namespace hsolver
-{
+{/**
+ * @brief Compute the optimal frequency for subspace diagonalization calls
+ *        based on problem size (n_band * n_basis).
+ *
+ * Large problems benefit from more frequent subspace diagonalization to
+ * maintain orthogonality, while small problems can reduce overhead by
+ * calling it less frequently.
+ *
+ * @param n_band Number of bands (eigenvectors).
+ * @param n_basis Number of basis functions.
+ * @param nline Base iteration count for SCF > 1.
+ * @return Optimal frequency (number of CG steps between calc_hsub_with_block calls).
+ */
+int compute_optimal_freq(const int n_band, const int n_basis, const int nline);
+
 
 template <typename T, typename Device>
 struct line_minimize_with_block_op