From fe5f33397c3ab64f8104ad19b73a49c8b4c94ade Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Fri, 6 Feb 2026 15:58:06 +0100
Subject: [PATCH] Scheduler-Worker Model + Reliability Branching (#766)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR implements the schedule-worker model for B&B, where one thread (the scheduler) is responsible for scheduling tasks to the other threads (the workers). With this model, the CPU resources can be shared efficiently among the different tasks within the MIP solver. It also allows the scheduling policy to be changed in runtime (e.g., in the future, the solver can dynamically set the number of workers allocated to each diving heuristics at runtime).

This also implements a parallel reliability branching (Section 5.7 from [1]). This also fixes the incorrect pseudocost update in each node in the B&B tree.

Closes #526.
Closes #445.
Closes #700.

Performance over the MIPLIB2017 dataset (GH200):

```
================================================================================
main - 046eafd (1) vs reliability-branching-cuts (2)
================================================================================
Feasible solutions = 225 vs 224 (-1)
Optimal solutions = 44 vs 68 (+24)
Solutions with <0.1% primal gap = 109 vs 125 (+16)
Average nodes explored = 7298973 vs 4300451 (-2998522, -41.081%)
Shifted geomean for MIP gap = 0.2979 vs 0.2374 (-0.0605, -20.302%)
Average primal gap = 11.9935 vs 11.0833 (-0.9102, -7.589%)
Average primal integral per time = 19.3326 vs 30.7054 (+11.3728, +37.038%)
================================================================================

```

## Reference
[1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, Berlin, 2007. doi: [10.14279/depositonce-1634](https://doi.org/10.14279/depositonce-1634).

Authors:
  - Nicolas L. Guidotti (https://github.com/nguidotti)
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Chris Maes (https://github.com/chris-maes)

URL: https://github.com/NVIDIA/cuopt/pull/766
---
 .../linear_programming/cuopt/run_mip.cpp      |  25 +-
 .../cuopt/linear_programming/constants.h      |   1 +
 .../mip/solver_settings.hpp                   |   1 +
 cpp/src/dual_simplex/basis_updates.hpp        |   5 +
 cpp/src/dual_simplex/branch_and_bound.cpp     | 844 +++++++-----------
 cpp/src/dual_simplex/branch_and_bound.hpp     | 124 +--
 .../dual_simplex/branch_and_bound_worker.hpp  | 280 ++++++
 cpp/src/dual_simplex/diving_heuristics.cpp    |  15 +-
 cpp/src/dual_simplex/mip_node.hpp             |  16 +-
 cpp/src/dual_simplex/node_queue.hpp           |  11 +-
 cpp/src/dual_simplex/presolve.cpp             |   2 +-
 cpp/src/dual_simplex/pseudo_costs.cpp         | 353 ++++++--
 cpp/src/dual_simplex/pseudo_costs.hpp         |  74 +-
 .../dual_simplex/simplex_solver_settings.hpp  |  30 +-
 cpp/src/dual_simplex/solution.hpp             |   4 +-
 cpp/src/math_optimization/solver_settings.cu  |   1 +
 cpp/src/mip/diversity/lns/rins.cu             |  21 +-
 cpp/src/mip/diversity/recombiners/sub_mip.cuh |  19 +-
 cpp/src/mip/solver.cu                         |  19 +-
 cpp/src/utilities/omp_helpers.hpp             |  42 +-
 cpp/src/utilities/pcgenerator.hpp             | 159 ++++
 21 files changed, 1318 insertions(+), 728 deletions(-)
 create mode 100644 cpp/src/dual_simplex/branch_and_bound_worker.hpp
 create mode 100644 cpp/src/utilities/pcgenerator.hpp

diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index 213e38e5e..308c7087b 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -147,6 +147,7 @@ int run_single_file(std::string file_path,
                     int num_cpu_threads,
                     bool write_log_file,
                     bool log_to_console,
+                    int reliability_branching,
                     double time_limit)
 {
   const raft::handle_t handle_{};
@@ -196,7 +197,6 @@ int run_single_file(std::string file_path,
       }
     }
   }
-
   settings.time_limit                    = time_limit;
   settings.heuristics_only               = heuristics_only;
   settings.num_cpu_threads               = num_cpu_threads;
@@ -204,6 +204,7 @@ int run_single_file(std::string file_path,
   settings.tolerances.relative_tolerance = 1e-12;
   settings.tolerances.absolute_tolerance = 1e-6;
   settings.presolve                      = true;
+  settings.reliability_branching         = reliability_branching;
   cuopt::linear_programming::benchmark_info_t benchmark_info;
   settings.benchmark_info_ptr = &benchmark_info;
   auto start_run_solver       = std::chrono::high_resolution_clock::now();
@@ -256,6 +257,7 @@ void run_single_file_mp(std::string file_path,
                         int num_cpu_threads,
                         bool write_log_file,
                         bool log_to_console,
+                        int reliability_branching,
                         double time_limit)
 {
   std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
@@ -271,6 +273,7 @@ void run_single_file_mp(std::string file_path,
                                   num_cpu_threads,
                                   write_log_file,
                                   log_to_console,
+                                  reliability_branching,
                                   time_limit);
   // this is a bad design to communicate the result but better than adding complexity of IPC or
   // pipes
@@ -354,6 +357,11 @@ int main(int argc, char* argv[])
     .help("track allocations (t/f)")
     .default_value(std::string("f"));
 
+  program.add_argument("--reliability-branching")
+    .help("reliability branching: -1 (automatic), 0 (disable) or k > 0 (use k)")
+    .scan<'i', int>()
+    .default_value(-1);
+
   // Parse arguments
   try {
     program.parse_args(argc, argv);
@@ -376,12 +384,13 @@ int main(int argc, char* argv[])
   std::string result_file;
   int batch_num = -1;
 
-  bool heuristics_only   = program.get<std::string>("--heuristics-only")[0] == 't';
-  int num_cpu_threads    = program.get<int>("--num-cpu-threads");
-  bool write_log_file    = program.get<std::string>("--write-log-file")[0] == 't';
-  bool log_to_console    = program.get<std::string>("--log-to-console")[0] == 't';
-  double memory_limit    = program.get<double>("--memory-limit");
-  bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
+  bool heuristics_only      = program.get<std::string>("--heuristics-only")[0] == 't';
+  int num_cpu_threads       = program.get<int>("--num-cpu-threads");
+  bool write_log_file       = program.get<std::string>("--write-log-file")[0] == 't';
+  bool log_to_console       = program.get<std::string>("--log-to-console")[0] == 't';
+  double memory_limit       = program.get<double>("--memory-limit");
+  bool track_allocations    = program.get<std::string>("--track-allocations")[0] == 't';
+  int reliability_branching = program.get<int>("--reliability-branching");
 
   if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; }
 
@@ -469,6 +478,7 @@ int main(int argc, char* argv[])
                                num_cpu_threads,
                                write_log_file,
                                log_to_console,
+                               reliability_branching,
                                time_limit);
           } else if (sys_pid < 0) {
             std::cerr << "Fork failed!" << std::endl;
@@ -509,6 +519,7 @@ int main(int argc, char* argv[])
                     num_cpu_threads,
                     write_log_file,
                     log_to_console,
+                    reliability_branching,
                     time_limit);
   }
 
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 7caf7aeeb..c52c22c85 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -57,6 +57,7 @@
 #define CUOPT_MIP_HEURISTICS_ONLY             "mip_heuristics_only"
 #define CUOPT_MIP_SCALING                     "mip_scaling"
 #define CUOPT_MIP_PRESOLVE                    "mip_presolve"
+#define CUOPT_MIP_RELIABILITY_BRANCHING       "mip_reliability_branching"
 #define CUOPT_MIP_CUT_PASSES                  "mip_cut_passes"
 #define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts"
 #define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS   "mip_mixed_integer_gomory_cuts"
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
index 863e5d66d..c5c26884f 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
@@ -86,6 +86,7 @@ class mip_solver_settings_t {
   f_t time_limit                = std::numeric_limits<f_t>::infinity();
   i_t node_limit                = std::numeric_limits<i_t>::max();
   bool heuristics_only          = false;
+  i_t reliability_branching     = -1;
   i_t num_cpu_threads           = -1;  // -1 means use default number of threads in branch and bound
   i_t max_cut_passes            = 10;  // number of cut passes to make
   i_t mir_cuts                  = -1;
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index 8eca3ba8a..d9783053f 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -223,6 +223,9 @@ class basis_update_mpf_t {
     reset_stats();
   }
 
+  basis_update_mpf_t(const basis_update_mpf_t& other)            = default;
+  basis_update_mpf_t& operator=(const basis_update_mpf_t& other) = default;
+
   void print_stats() const
   {
     i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_;
@@ -381,6 +384,8 @@ class basis_update_mpf_t {
                      std::vector<i_t>& nonbasic_list,
                      std::vector<variable_status_t>& vstatus);
 
+  void set_refactor_frequency(i_t new_frequency) { refactor_frequency_ = new_frequency; }
+
  private:
   void clear()
   {
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index d35415904..c6e8b0d17 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -201,26 +201,26 @@ std::string user_mip_gap(f_t obj_value, f_t lower_bound)
 }
 
 #ifdef SHOW_DIVING_TYPE
-inline char feasible_solution_symbol(bnb_worker_type_t type)
+inline char feasible_solution_symbol(search_strategy_t strategy)
 {
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST: return 'B';
-    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'C';
-    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'L';
-    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'P';
-    case bnb_worker_type_t::GUIDED_DIVING: return 'G';
+  switch (strategy) {
+    case search_strategy_t::BEST_FIRST: return 'B';
+    case search_strategy_t::COEFFICIENT_DIVING: return 'C';
+    case search_strategy_t::LINE_SEARCH_DIVING: return 'L';
+    case search_strategy_t::PSEUDOCOST_DIVING: return 'P';
+    case search_strategy_t::GUIDED_DIVING: return 'G';
     default: return 'U';
   }
 }
 #else
-inline char feasible_solution_symbol(bnb_worker_type_t type)
+inline char feasible_solution_symbol(search_strategy_t strategy)
 {
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST: return 'B';
-    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'D';
-    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'D';
-    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'D';
-    case bnb_worker_type_t::GUIDED_DIVING: return 'D';
+  switch (strategy) {
+    case search_strategy_t::BEST_FIRST: return 'B';
+    case search_strategy_t::COEFFICIENT_DIVING: return 'D';
+    case search_strategy_t::LINE_SEARCH_DIVING: return 'D';
+    case search_strategy_t::PSEUDOCOST_DIVING: return 'D';
+    case search_strategy_t::GUIDED_DIVING: return 'D';
     default: return 'U';
   }
 }
@@ -273,7 +273,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   }
 #endif
 
-  upper_bound_ = inf;
+  upper_bound_    = inf;
+  root_objective_ = std::numeric_limits<f_t>::quiet_NaN();
 }
 
 template <typename i_t, typename f_t>
@@ -282,18 +283,21 @@ f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
   f_t lower_bound      = lower_bound_ceiling_.load();
   f_t heap_lower_bound = node_queue_.get_lower_bound();
   lower_bound          = std::min(heap_lower_bound, lower_bound);
+  lower_bound          = std::min(worker_pool_.get_lower_bound(), lower_bound);
 
-  for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
-    lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
+  if (std::isfinite(lower_bound)) {
+    return lower_bound;
+  } else if (std::isfinite(root_objective_)) {
+    return root_objective_;
+  } else {
+    return -inf;
   }
-
-  return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 {
-  if (is_running) {
+  if (is_running_) {
     f_t user_obj         = compute_user_objective(original_lp_, obj);
     f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
     std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
@@ -671,7 +675,7 @@ template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
                                                          const std::vector<f_t>& leaf_solution,
                                                          i_t leaf_depth,
-                                                         bnb_worker_type_t thread_type)
+                                                         search_strategy_t thread_type)
 {
   bool send_solution = false;
 
@@ -720,91 +724,64 @@ template <typename i_t, typename f_t>
 branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   mip_node_t<i_t, f_t>* node_ptr,
   const std::vector<i_t>& fractional,
-  const std::vector<f_t>& solution,
-  bnb_worker_type_t type)
+  branch_and_bound_worker_t<i_t, f_t>* worker)
 {
   logger_t log;
   log.log                        = false;
   i_t branch_var                 = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   std::vector<f_t> current_incumbent;
+  std::vector<f_t>& solution = worker->leaf_solution.x;
+
+  switch (worker->search_strategy) {
+    case search_strategy_t::BEST_FIRST:
+
+      if (settings_.reliability_branching != 0) {
+        branch_var = pc_.reliable_variable_selection(node_ptr,
+                                                     fractional,
+                                                     solution,
+                                                     settings_,
+                                                     var_types_,
+                                                     worker,
+                                                     exploration_stats_,
+                                                     upper_bound_,
+                                                     worker_pool_.num_idle_workers(),
+                                                     log);
+      } else {
+        branch_var = pc_.variable_selection(fractional, solution, log);
+      }
 
-  // If there is no incumbent, use pseudocost diving instead of guided diving
-  if (upper_bound_ == inf && type == bnb_worker_type_t::GUIDED_DIVING) {
-    type = bnb_worker_type_t::PSEUDOCOST_DIVING;
-  }
+      round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
 
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST:
-      branch_var = pc_.variable_selection(fractional, solution, log);
-      round_dir  = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
       return {branch_var, round_dir};
 
-    case bnb_worker_type_t::COEFFICIENT_DIVING:
+    case search_strategy_t::COEFFICIENT_DIVING:
       return coefficient_diving(
         original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log);
 
-    case bnb_worker_type_t::LINE_SEARCH_DIVING:
+    case search_strategy_t::LINE_SEARCH_DIVING:
       return line_search_diving(fractional, solution, root_relax_soln_.x, log);
 
-    case bnb_worker_type_t::PSEUDOCOST_DIVING:
+    case search_strategy_t::PSEUDOCOST_DIVING:
       return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log);
 
-    case bnb_worker_type_t::GUIDED_DIVING:
+    case search_strategy_t::GUIDED_DIVING:
       mutex_upper_.lock();
       current_incumbent = incumbent_.x;
       mutex_upper_.unlock();
       return guided_diving(pc_, fractional, solution, current_incumbent, log);
 
     default:
-      log.debug("Unknown variable selection method: %d\n", type);
+      log.debug("Unknown variable selection method: %d\n", worker->search_strategy);
       return {-1, rounding_direction_t::NONE};
   }
 }
 
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::initialize_diving_heuristics_settings(
-  std::vector<bnb_worker_type_t>& diving_strategies)
-{
-  diving_strategies.reserve(4);
-
-  if (settings_.diving_settings.pseudocost_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::PSEUDOCOST_DIVING);
-  }
-
-  if (settings_.diving_settings.line_search_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::LINE_SEARCH_DIVING);
-  }
-
-  if (settings_.diving_settings.guided_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::GUIDED_DIVING);
-  }
-
-  if (settings_.diving_settings.coefficient_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::COEFFICIENT_DIVING);
-    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
-  }
-
-  if (diving_strategies.empty()) {
-    settings_.log.printf("Warning: All diving heuristics are disabled!\n");
-  }
-}
-
 template <typename i_t, typename f_t>
 dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   mip_node_t<i_t, f_t>* node_ptr,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  std::vector<f_t>& leaf_edge_norms,
-  basis_update_mpf_t<i_t, f_t>& basis_factors,
-  std::vector<i_t>& basic_list,
-  std::vector<i_t>& nonbasic_list,
-  bounds_strengthening_t<i_t, f_t>& node_presolver,
-  bnb_worker_type_t thread_type,
-  bool recompute_bounds_and_basis,
-  const std::vector<f_t>& root_lower,
-  const std::vector<f_t>& root_upper,
-  bnb_stats_t<i_t, f_t>& stats,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
+  branch_and_bound_stats_t<i_t, f_t>& stats,
   logger_t& log)
 {
 #ifdef DEBUG_BRANCHING
@@ -843,7 +820,7 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 #endif
 
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  assert(leaf_vstatus.size() == leaf_problem.num_cols);
+  assert(leaf_vstatus.size() == worker->leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
@@ -852,10 +829,10 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
-  if (thread_type != bnb_worker_type_t::BEST_FIRST) {
-    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+  if (worker->search_strategy != search_strategy_t::BEST_FIRST) {
+    int64_t bnb_lp_iters        = exploration_stats_.total_lp_iters;
     f_t factor                  = settings_.diving_settings.iteration_limit_factor;
-    i_t max_iter                = factor * bnb_lp_iters;
+    int64_t max_iter            = factor * bnb_lp_iters;
     lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
     if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; }
   }
@@ -882,24 +859,9 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
     node_ptr->vstatus[node_ptr->branch_var]);
 #endif
 
-  // Reset the bound_changed markers
-  std::vector<bool> bounds_changed(original_lp_.num_cols, false);
-
-  // Set the correct bounds for the leaf problem
-  if (recompute_bounds_and_basis) {
-    leaf_problem.lower = root_lower;
-    leaf_problem.upper = root_upper;
-    node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
-
-  } else {
-    node_ptr->update_branched_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, bounds_changed);
-  }
-
-  bool feasible = node_presolver.bounds_strengthening(
-    lp_settings, bounds_changed, leaf_problem.lower, leaf_problem.upper);
-
+  bool feasible            = worker->set_lp_variable_bounds(node_ptr, settings_);
   dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
+  worker->leaf_edge_norms  = edge_norms_;
 
   if (feasible) {
     i_t node_iter     = 0;
@@ -907,29 +869,29 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
     lp_status = dual_phase2_with_advanced_basis(2,
                                                 0,
-                                                recompute_bounds_and_basis,
+                                                worker->recompute_basis,
                                                 lp_start_time,
-                                                leaf_problem,
+                                                worker->leaf_problem,
                                                 lp_settings,
                                                 leaf_vstatus,
-                                                basis_factors,
-                                                basic_list,
-                                                nonbasic_list,
-                                                leaf_solution,
+                                                worker->basis_factors,
+                                                worker->basic_list,
+                                                worker->nonbasic_list,
+                                                worker->leaf_solution,
                                                 node_iter,
-                                                leaf_edge_norms);
+                                                worker->leaf_edge_norms);
 
     if (lp_status == dual::status_t::NUMERICAL) {
       log.debug("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id);
-      lp_status_t second_status = solve_linear_program_with_advanced_basis(leaf_problem,
+      lp_status_t second_status = solve_linear_program_with_advanced_basis(worker->leaf_problem,
                                                                            lp_start_time,
                                                                            lp_settings,
-                                                                           leaf_solution,
-                                                                           basis_factors,
-                                                                           basic_list,
-                                                                           nonbasic_list,
+                                                                           worker->leaf_solution,
+                                                                           worker->basis_factors,
+                                                                           worker->basic_list,
+                                                                           worker->nonbasic_list,
                                                                            leaf_vstatus,
-                                                                           leaf_edge_norms);
+                                                                           worker->leaf_edge_norms);
 
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
@@ -944,20 +906,18 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   return lp_status;
 }
-
 template <typename i_t, typename f_t>
 std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
   mip_node_t<i_t, f_t>* node_ptr,
   search_tree_t<i_t, f_t>& search_tree,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  std::vector<f_t>& leaf_edge_norms,
-  bnb_worker_type_t thread_type,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
   dual::status_t lp_status,
   logger_t& log)
 {
   const f_t abs_fathom_tol                     = settings_.absolute_mip_gap_tol / 10;
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+  lp_problem_t<i_t, f_t>& leaf_problem         = worker->leaf_problem;
+  lp_solution_t<i_t, f_t>& leaf_solution       = worker->leaf_solution;
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
@@ -997,12 +957,12 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
     }
 #endif
 
-    f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
-    node_ptr->lower_bound = leaf_objective;
+    f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x);
     search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
     pc_.update_pseudo_costs(node_ptr, leaf_objective);
+    node_ptr->lower_bound = leaf_objective;
 
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
       if (settings_.node_processed_callback != nullptr) {
         std::vector<f_t> original_x;
         uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
@@ -1012,15 +972,15 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 
     if (leaf_num_fractional == 0) {
       // Found a integer feasible solution
-      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
+      add_feasible_solution(
+        leaf_objective, leaf_solution.x, node_ptr->depth, worker->search_strategy);
       search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
       search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
       return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
 
     } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      auto [branch_var, round_dir] =
-        variable_selection(node_ptr, leaf_fractional, leaf_solution.x, thread_type);
+      auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
       assert(branch_var >= 0);
@@ -1029,7 +989,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       // Note that the exploration thread is the only one that can insert new nodes into the heap,
       // and thus, we only need to calculate the objective estimate here (it is used for
       // sorting the nodes for diving).
-      if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+      if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
         logger_t pc_log;
         pc_log.log = false;
         node_ptr->objective_estimate =
@@ -1052,7 +1012,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       return {node_status_t::FATHOMED, rounding_direction_t::NONE};
     }
   } else {
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
       log.printf(
         "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
@@ -1070,136 +1030,19 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                                                       i_t initial_heap_size)
+void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
-  if (solver_status_ != mip_status_t::UNSET) { return; }
-
-  // Note that we do not know which thread will execute the
-  // `exploration_ramp_up` task, so we allow to any thread
-  // to repair the heuristic solution.
-  repair_heuristic_solutions();
-
-  f_t lower_bound = node->lower_bound;
-  f_t upper_bound = upper_bound_;
-  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  f_t abs_gap     = upper_bound - lower_bound;
-
-  if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-    search_tree_.graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
-    search_tree_.update(node, node_status_t::FATHOMED);
-    --exploration_stats_.nodes_unexplored;
-    return;
-  }
-
-  f_t now = toc(exploration_stats_.start_time);
-  f_t time_since_last_log =
-    exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-  if (((exploration_stats_.nodes_since_last_log >= 10 ||
-        abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-       (time_since_last_log >= 1)) ||
-      (time_since_last_log > 30) || now > settings_.time_limit) {
-    bool should_report = should_report_.exchange(false);
-
-    if (should_report) {
-      report(' ', upper_bound, root_objective_, node->depth, node->integer_infeasible);
-      exploration_stats_.nodes_since_last_log = 0;
-      exploration_stats_.last_log             = tic();
-      should_report_                          = true;
-    }
-  }
-
-  if (now > settings_.time_limit) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-  std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-  dual::status_t lp_status         = solve_node_lp(node,
-                                           leaf_problem,
-                                           leaf_solution,
-                                           leaf_edge_norms,
-                                           basis_factors,
-                                           basic_list,
-                                           nonbasic_list,
-                                           node_presolver,
-                                           bnb_worker_type_t::BEST_FIRST,
-                                           true,
-                                           original_lp_.lower,
-                                           original_lp_.upper,
-                                           exploration_stats_,
-                                           settings_.log);
-  if (lp_status == dual::status_t::TIME_LIMIT) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  ++exploration_stats_.nodes_since_last_log;
-  ++exploration_stats_.nodes_explored;
-  --exploration_stats_.nodes_unexplored;
-
-  auto [node_status, round_dir] = update_tree(node,
-                                              search_tree_,
-                                              leaf_problem,
-                                              leaf_solution,
-                                              leaf_edge_norms,
-                                              bnb_worker_type_t::BEST_FIRST,
-                                              lp_status,
-                                              settings_.log);
-
-  if (node_status == node_status_t::HAS_CHILDREN) {
-    exploration_stats_.nodes_unexplored += 2;
-
-    // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
-    if (exploration_stats_.nodes_unexplored < initial_heap_size) {
-#pragma omp task
-      exploration_ramp_up(node->get_down_child(), initial_heap_size);
-
-#pragma omp task
-      exploration_ramp_up(node->get_up_child(), initial_heap_size);
-
-    } else {
-      // We've generated enough nodes, push further nodes onto the heap
-      node_queue_.push(node->get_down_child());
-      node_queue_.push(node->get_up_child());
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
-                                               mip_node_t<i_t, f_t>* start_node,
-                                               lp_problem_t<i_t, f_t>& leaf_problem,
-                                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                               std::vector<i_t>& basic_list,
-                                               std::vector<i_t>& nonbasic_list)
-{
-  bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(start_node);
-
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
-    if (task_id == 0) { repair_heuristic_solutions(); }
+  stack.push_front(worker->start_node);
+  worker->recompute_basis  = true;
+  worker->recompute_bounds = true;
 
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
     f_t lower_bound = node_ptr->lower_bound;
     f_t upper_bound = upper_bound_;
-    f_t abs_gap     = upper_bound - lower_bound;
     f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
     // This is based on three assumptions:
@@ -1208,58 +1051,28 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     // - The current node and its siblings uses the lower bound of the parent before solving the LP
     // relaxation
     // - The lower bound of the parent is lower or equal to its children
-    assert(task_id < local_lower_bounds_.size());
-    local_lower_bounds_[task_id] = lower_bound;
+    worker->lower_bound = lower_bound;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
       search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree_.update(node_ptr, node_status_t::FATHOMED);
-      recompute_bounds_and_basis = true;
+      worker->recompute_basis  = true;
+      worker->recompute_bounds = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
 
-    f_t now = toc(exploration_stats_.start_time);
-
-    if (task_id == 0) {
-      f_t time_since_last_log =
-        exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-      if (((exploration_stats_.nodes_since_last_log >= 1000 ||
-            abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-           time_since_last_log >= 1) ||
-          (time_since_last_log > 30) || now > settings_.time_limit) {
-        report(' ', upper_bound, get_lower_bound(), node_ptr->depth, node_ptr->integer_infeasible);
-        exploration_stats_.last_log             = tic();
-        exploration_stats_.nodes_since_last_log = 0;
-      }
-    }
-
-    if (now > settings_.time_limit) {
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
       solver_status_ = mip_status_t::TIME_LIMIT;
       break;
     }
+
     if (exploration_stats_.nodes_explored >= settings_.node_limit) {
       solver_status_ = mip_status_t::NODE_LIMIT;
       break;
     }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-    dual::status_t lp_status         = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             leaf_edge_norms,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             bnb_worker_type_t::BEST_FIRST,
-                                             recompute_bounds_and_basis,
-                                             original_lp_.lower,
-                                             original_lp_.upper,
-                                             exploration_stats_,
-                                             settings_.log);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker, exploration_stats_, settings_.log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1272,16 +1085,11 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     ++exploration_stats_.nodes_explored;
     --exploration_stats_.nodes_unexplored;
 
-    auto [node_status, round_dir] = update_tree(node_ptr,
-                                                search_tree_,
-                                                leaf_problem,
-                                                leaf_solution,
-                                                leaf_edge_norms,
-                                                bnb_worker_type_t::BEST_FIRST,
-                                                lp_status,
-                                                settings_.log);
+    auto [node_status, round_dir] =
+      update_tree(node_ptr, search_tree_, worker, lp_status, settings_.log);
 
-    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       // The stack should only contain the children of the current parent.
@@ -1297,143 +1105,73 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
       exploration_stats_.nodes_unexplored += 2;
 
       if (round_dir == rounding_direction_t::UP) {
-        stack.push_front(node_ptr->get_down_child());
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_down_child());
+        } else {
+          stack.push_front(node_ptr->get_down_child());
+        }
+
         stack.push_front(node_ptr->get_up_child());
       } else {
-        stack.push_front(node_ptr->get_up_child());
-        stack.push_front(node_ptr->get_down_child());
-      }
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id)
-{
-  f_t lower_bound = -inf;
-  f_t abs_gap     = inf;
-  f_t rel_gap     = inf;
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_up_child());
+        } else {
+          stack.push_front(node_ptr->get_up_child());
+        }
 
-  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
-         rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    // In the current implementation, we are use the active number of subtree to decide
-    // when to stop the execution. We need to increment the counter at the same
-    // time as we pop a node from the queue to avoid some threads exiting
-    // the main loop thinking that the solver has already finished.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    // If there any node left in the heap, we pop the top node and explore it.
-    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
-    if (start_node.has_value()) { active_subtrees_++; };
-    node_queue_.unlock();
-
-    if (start_node.has_value()) {
-      if (upper_bound_ < start_node.value()->lower_bound) {
-        // This node was put on the heap earlier but its lower bound is now greater than the
-        // current upper bound
-        search_tree_.graphviz_node(
-          settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-        search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-        active_subtrees_--;
-        continue;
+        stack.push_front(node_ptr->get_down_child());
       }
-
-      // Best-first search with plunging
-      plunge_from(task_id,
-                  start_node.value(),
-                  leaf_problem,
-                  node_presolver,
-                  basis_factors,
-                  basic_list,
-                  nonbasic_list);
-
-      active_subtrees_--;
     }
-
-    lower_bound = get_lower_bound();
-    abs_gap     = upper_bound_ - lower_bound;
-    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   }
 
-  is_running = false;
-
-  // Check if it is the last thread that exited the loop and no
-  // timeout or numerical error has happen.
-  if (solver_status_ == mip_status_t::UNSET) {
-    if (active_subtrees_ > 0) { local_lower_bounds_[task_id] = inf; }
+  if (settings_.num_threads > 1) {
+    worker_pool_.return_worker_to_pool(worker);
+    active_workers_per_strategy_[BEST_FIRST]--;
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
-                                             const std::vector<f_t>& start_lower,
-                                             const std::vector<f_t>& start_upper,
-                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                             bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                             basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                             std::vector<i_t>& basic_list,
-                                             std::vector<i_t>& nonbasic_list,
-                                             bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
   logger_t log;
   log.log = false;
 
-  const i_t diving_node_limit      = settings_.diving_settings.node_limit;
-  const i_t diving_backtrack_limit = settings_.diving_settings.backtrack_limit;
-  bool recompute_bounds_and_basis  = true;
-  search_tree_t<i_t, f_t> dive_tree(std::move(start_node));
+  search_strategy_t search_strategy = worker->search_strategy;
+  const i_t diving_node_limit       = settings_.diving_settings.node_limit;
+  const i_t diving_backtrack_limit  = settings_.diving_settings.backtrack_limit;
+
+  worker->recompute_basis  = true;
+  worker->recompute_bounds = true;
+
+  search_tree_t<i_t, f_t> dive_tree(std::move(*worker->start_node));
   std::deque<mip_node_t<i_t, f_t>*> stack;
   stack.push_front(&dive_tree.root);
 
-  bnb_stats_t<i_t, f_t> dive_stats;
+  branch_and_bound_stats_t<i_t, f_t> dive_stats;
   dive_stats.total_lp_iters      = 0;
   dive_stats.total_lp_solve_time = 0;
   dive_stats.nodes_explored      = 0;
   dive_stats.nodes_unexplored    = 1;
 
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running_) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
-    f_t lower_bound = node_ptr->lower_bound;
-    f_t upper_bound = upper_bound_;
-    f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    f_t lower_bound     = node_ptr->lower_bound;
+    f_t upper_bound     = upper_bound_;
+    f_t rel_gap         = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    worker->lower_bound = lower_bound;
 
     if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-      recompute_bounds_and_basis = true;
+      worker->recompute_basis  = true;
+      worker->recompute_bounds = true;
       continue;
     }
 
     if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
     if (dive_stats.nodes_explored > diving_node_limit) { break; }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-    dual::status_t lp_status         = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             leaf_edge_norms,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             diving_type,
-                                             recompute_bounds_and_basis,
-                                             start_lower,
-                                             start_upper,
-                                             dive_stats,
-                                             log);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker, dive_stats, log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1444,15 +1182,10 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
 
     ++dive_stats.nodes_explored;
 
-    auto [node_status, round_dir] = update_tree(node_ptr,
-                                                dive_tree,
-                                                leaf_problem,
-                                                leaf_solution,
-                                                leaf_edge_norms,
-                                                diving_type,
-                                                lp_status,
-                                                log);
-    recompute_bounds_and_basis    = node_status != node_status_t::HAS_CHILDREN;
+    auto [node_status, round_dir] = update_tree(node_ptr, dive_tree, worker, lp_status, log);
+
+    worker->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       if (round_dir == rounding_direction_t::UP) {
@@ -1470,68 +1203,207 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
       stack.pop_back();
     }
   }
+
+  worker_pool_.return_worker_to_pool(worker);
+  active_workers_per_strategy_[search_strategy]--;
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::diving_thread(bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::run_scheduler()
 {
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-  std::vector<bool> bounds_changed(original_lp_.num_cols, false);
+  diving_heuristics_settings_t<i_t, f_t> diving_settings = settings_.diving_settings;
+  const i_t num_workers                                  = 2 * settings_.num_threads;
 
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
+  if (!std::isfinite(upper_bound_)) { diving_settings.guided_diving = false; }
+  std::vector<search_strategy_t> strategies = get_search_strategies(diving_settings);
+  std::array<i_t, num_search_strategies> max_num_workers_per_type =
+    get_max_workers(num_workers, strategies);
+
+  worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_);
+  active_workers_per_strategy_.fill(0);
+
+#ifdef CUOPT_LOG_DEBUG
+  for (auto strategy : strategies) {
+    settings_.log.debug("%c%d: max num of workers = %d",
+                        feasible_solution_symbol(strategy),
+                        strategy,
+                        max_num_workers_per_type[strategy]);
+  }
+#endif
+
+  f_t lower_bound     = get_lower_bound();
+  f_t abs_gap         = upper_bound_ - lower_bound;
+  f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+  i_t last_node_depth = 0;
+  i_t last_int_infeas = 0;
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol &&
+         (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    // If the guided diving was disabled previously due to the lack of an incumbent solution,
+    // re-enable as soon as a new incumbent is found.
+    if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) {
+      if (std::isfinite(upper_bound_)) {
+        diving_settings.guided_diving = settings_.diving_settings.guided_diving;
+        strategies                    = get_search_strategies(diving_settings);
+        max_num_workers_per_type      = get_max_workers(num_workers, strategies);
+
+#ifdef CUOPT_LOG_DEBUG
+        for (auto type : strategies) {
+          settings_.log.debug("%c%d: max num of workers = %d",
+                              feasible_solution_symbol(type),
+                              type,
+                              max_num_workers_per_type[type]);
+        }
+#endif
+      }
+    }
 
-  std::vector<f_t> start_lower;
-  std::vector<f_t> start_upper;
-  bool reset_starting_bounds = true;
-
-  while (solver_status_ == mip_status_t::UNSET && is_running &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    if (reset_starting_bounds) {
-      start_lower = original_lp_.lower;
-      start_upper = original_lp_.upper;
-      std::fill(bounds_changed.begin(), bounds_changed.end(), false);
-      reset_starting_bounds = false;
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t queue_size = node_queue_.best_first_queue_size();
+      i_t depth      = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth;
+      i_t int_infeas = queue_size > 0 ? node_queue_.bfs_top()->integer_infeasible : last_int_infeas;
+      report(' ', upper_bound_, lower_bound, depth, int_infeas);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
     }
 
-    // In the current implementation, multiple threads can pop the nodes
-    // from the queue, so we need to initialize the lower and upper bound here
-    // to avoid other thread fathoming the node (i.e., deleting) before we can read
-    // the variable bounds from the tree.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    std::optional<mip_node_t<i_t, f_t>*> node_ptr  = node_queue_.pop_diving();
-    std::optional<mip_node_t<i_t, f_t>> start_node = std::nullopt;
-
-    if (node_ptr.has_value()) {
-      node_ptr.value()->get_variable_bounds(start_lower, start_upper, bounds_changed);
-      start_node = node_ptr.value()->detach_copy();
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
     }
-    node_queue_.unlock();
 
-    if (start_node.has_value()) {
-      reset_starting_bounds = true;
+    for (auto strategy : strategies) {
+      if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) {
+        continue;
+      }
+
+      // Get an idle worker.
+      branch_and_bound_worker_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
+      if (worker == nullptr) { break; }
+
+      if (strategy == BEST_FIRST) {
+        // If there any node left in the heap, we pop the top node and explore it.
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound) {
+          // This node was put on the heap earlier but its lower bound is now greater than the
+          // current upper bound
+          search_tree_.graphviz_node(
+            settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+          search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+          continue;
+        }
 
-      if (upper_bound_ < start_node->lower_bound) { continue; }
-      bool is_feasible =
-        node_presolver.bounds_strengthening(settings_, bounds_changed, start_lower, start_upper);
-      if (!is_feasible) { continue; }
-
-      dive_from(start_node.value(),
-                start_lower,
-                start_upper,
-                leaf_problem,
-                node_presolver,
-                basis_factors,
-                basic_list,
-                nonbasic_list,
-                diving_type);
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        worker->init_best_first(start_node.value(), original_lp_);
+        last_node_depth = start_node.value()->depth;
+        last_int_infeas = start_node.value()->integer_infeasible;
+        active_workers_per_strategy_[strategy]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        plunge_with(worker);
+
+      } else {
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound ||
+            start_node.value()->depth < diving_settings.min_node_depth) {
+          continue;
+        }
+
+        bool is_feasible =
+          worker->init_diving(start_node.value(), strategy, original_lp_, settings_);
+        if (!is_feasible) { continue; }
+
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        active_workers_per_strategy_[strategy]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        dive_with(worker);
+      }
     }
+
+    // If no new task was launched in this iteration, suspend temporarily the
+    // execution of the scheduler. As of 8/Jan/2026, GCC does not
+    // implement taskyield, but LLVM does.
+    if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
+{
+  branch_and_bound_worker_t<i_t, f_t> worker(0, original_lp_, Arow_, var_types_, settings_);
+
+  f_t lower_bound = get_lower_bound();
+  f_t abs_gap     = upper_bound_ - lower_bound;
+  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t depth      = node_queue_.bfs_top()->depth;
+      i_t int_infeas = node_queue_.bfs_top()->integer_infeasible;
+      report(' ', upper_bound_, lower_bound, depth, int_infeas);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
+    }
+
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    }
+
+    // If there any node left in the heap, we pop the top node and explore it.
+    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+    if (!start_node.has_value()) { continue; }
+    if (upper_bound_ < start_node.value()->lower_bound) {
+      // This node was put on the heap earlier but its lower bound is now greater than the
+      // current upper bound
+      search_tree_.graphviz_node(
+        settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+      search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+      continue;
+    }
+
+    worker.init_best_first(start_node.value(), original_lp_);
+    plunge_with(&worker);
   }
 }
 
@@ -1675,7 +1547,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                          solver_name.c_str());
     settings_.log.printf("Root relaxation objective %+.8e\n", user_objective);
   } else {
-    settings_.log.printf("Root relaxation returned status: %s\n",
+    settings_.log.printf("Root relaxation returned: %s\n",
                          lp_status_to_string(root_status).c_str());
   }
 
@@ -1692,14 +1564,11 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   log.log                             = false;
   log.log_prefix                      = settings_.log.log_prefix;
   solver_status_                      = mip_status_t::UNSET;
-  is_running                          = false;
+  is_running_                         = false;
   exploration_stats_.nodes_unexplored = 0;
   exploration_stats_.nodes_explored   = 0;
   original_lp_.A.to_compressed_row(Arow_);
 
-  std::vector<bnb_worker_type_t> diving_strategies;
-  initialize_diving_heuristics_settings(diving_strategies);
-
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
     crush_primal_solution(original_problem_, original_lp_, guess_, new_slacks_, crushed_guess);
@@ -1728,6 +1597,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   std::vector<i_t> nonbasic_list;
   basis_update_mpf_t<i_t, f_t> basis_update(original_lp_.num_rows, settings_.refactor_frequency);
   lp_status_t root_status;
+
   if (!enable_concurrent_lp_root_solve()) {
     // RINS/SUBMIP path
     settings_.log.printf("\nSolving LP root relaxation with dual simplex\n");
@@ -1775,6 +1645,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     set_final_solution(solution, -inf);
     return solver_status_;
   }
+
   if (root_status == lp_status_t::NUMERICAL_ISSUES) {
     solver_status_ = mip_status_t::NUMERICAL;
     set_final_solution(solution, -inf);
@@ -1785,7 +1656,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   set_uninitialized_steepest_edge_norms<i_t, f_t>(original_lp_, basic_list, edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
-  local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -1806,12 +1676,13 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   i_t num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional);
 
   cut_info_t<i_t, f_t> cut_info;
+
   if (num_fractional == 0) {
     set_solution_at_root(solution, cut_info);
     return mip_status_t::OPTIMAL;
   }
 
-  is_running           = true;
+  is_running_          = true;
   lower_bound_ceiling_ = inf;
 
   if (num_fractional != 0 && settings_.max_cut_passes > 0) {
@@ -2026,8 +1897,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         }
       }
 
-      local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
-
       f_t remove_cuts_start_time = tic();
       mutex_original_lp_.lock();
       remove_cuts(original_lp_,
@@ -2171,58 +2040,39 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       root_vstatus_,
                       original_lp_,
                       log);
+  node_queue_.push(search_tree_.root.get_down_child());
+  node_queue_.push(search_tree_.root.get_up_child());
 
-  settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n\n",
-                       settings_.num_threads,
-                       settings_.num_bfs_workers,
-                       settings_.num_threads - settings_.num_bfs_workers);
+  settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads);
 
   exploration_stats_.nodes_explored       = 0;
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
-  active_subtrees_                        = 0;
-  lower_bound_ceiling_                    = inf;
-  should_report_                          = true;
+  min_node_queue_size_                    = 2 * settings_.num_threads;
+
+  if (settings_.diving_settings.coefficient_diving != 0) {
+    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
+  }
 
   settings_.log.printf(
     " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node |   "
     "Gap    "
     "|  Time  |\n");
+
+  if (settings_.num_threads > 1) {
 #pragma omp parallel num_threads(settings_.num_threads)
-  {
-#pragma omp master
     {
-      auto down_child          = search_tree_.root.get_down_child();
-      auto up_child            = search_tree_.root.get_up_child();
-      i_t initial_size         = 2 * settings_.num_threads;
-      const i_t num_strategies = diving_strategies.size();
-
-#pragma omp taskgroup
-      {
-#pragma omp task
-        exploration_ramp_up(down_child, initial_size);
-
-#pragma omp task
-        exploration_ramp_up(up_child, initial_size);
-      }
-
-      for (i_t i = 0; i < settings_.num_bfs_workers; i++) {
-#pragma omp task
-        best_first_thread(i);
-      }
-
-      if (!diving_strategies.empty()) {
-        for (i_t k = 0; k < settings_.diving_settings.num_diving_workers; k++) {
-          const bnb_worker_type_t diving_type = diving_strategies[k % num_strategies];
-#pragma omp task
-          diving_thread(diving_type);
-        }
-      }
+#pragma omp master
+      run_scheduler();
     }
+
+  } else {
+    single_threaded_solve();
   }
 
-  is_running      = false;
+  is_running_ = false;
+
   f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
                                                             : search_tree_.root.lower_bound;
   set_final_solution(solution, lower_bound);
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 6db45e153..1d6947681 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <dual_simplex/branch_and_bound_worker.hpp>
 #include <dual_simplex/cuts.hpp>
 #include <dual_simplex/diving_heuristics.hpp>
 #include <dual_simplex/initial_basis.hpp>
@@ -37,38 +38,9 @@ enum class mip_status_t {
   UNSET      = 6,  // The status is not set
 };
 
-// Indicate the search and variable selection algorithms used by each thread
-// in B&B (See [1]).
-//
-// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
-// Berlin, 2007. doi: 10.14279/depositonce-1634.
-enum class bnb_worker_type_t {
-  BEST_FIRST         = 0,  // Best-First + Plunging.
-  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
-  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
-  GUIDED_DIVING = 3,  // Guided diving (9.2.3). If no incumbent is found yet, use pseudocost diving.
-  COEFFICIENT_DIVING = 4  // Coefficient diving (9.2.1)
-};
-
-template <typename i_t, typename f_t>
-class bounds_strengthening_t;
-
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
-template <typename i_t, typename f_t>
-struct bnb_stats_t {
-  f_t start_time                        = 0.0;
-  omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-  omp_atomic_t<i_t> nodes_explored      = 0;
-  omp_atomic_t<i_t> nodes_unexplored    = 0;
-  omp_atomic_t<f_t> total_lp_iters      = 0;
-
-  // This should only be used by the main thread
-  omp_atomic_t<f_t> last_log             = 0.0;
-  omp_atomic_t<i_t> nodes_since_last_log = 0;
-};
-
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
@@ -153,9 +125,6 @@ class branch_and_bound_t {
   std::vector<i_t> var_up_locks_;
   std::vector<i_t> var_down_locks_;
 
-  // Local lower bounds for each thread
-  std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
-
   // Mutex for the original LP
   // The heuristics threads look at the original LP. But the main thread modifies the
   // size of the original LP by adding slacks for cuts. Heuristic threads should lock
@@ -173,7 +142,7 @@ class branch_and_bound_t {
   mip_solution_t<i_t, f_t> incumbent_;
 
   // Structure with the general info of the solver.
-  bnb_stats_t<i_t, f_t> exploration_stats_;
+  branch_and_bound_stats_t<i_t, f_t> exploration_stats_;
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
@@ -200,14 +169,21 @@ class branch_and_bound_t {
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
 
-  // Count the number of subtrees that are currently being explored.
-  omp_atomic_t<i_t> active_subtrees_;
+  // Count the number of workers per type that either are being executed or
+  // are waiting to be executed.
+  std::array<omp_atomic_t<i_t>, num_search_strategies> active_workers_per_strategy_;
+
+  // Worker pool
+  branch_and_bound_worker_pool_t<i_t, f_t> worker_pool_;
 
   // Global status of the solver.
   omp_atomic_t<mip_status_t> solver_status_;
-  omp_atomic_t<bool> is_running{false};
+  omp_atomic_t<bool> is_running_{false};
 
-  omp_atomic_t<bool> should_report_;
+  // Minimum number of node in the queue. When the queue size is less than
+  // this variable, the nodes are added directly to the queue instead of
+  // the local stack. This also determines the end of the ramp-up phase.
+  i_t min_node_queue_size_;
 
   // In case, a best-first thread encounters a numerical issue when solving a node,
   // its blocks the progression of the lower bound.
@@ -230,82 +206,48 @@ class branch_and_bound_t {
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
                              i_t leaf_depth,
-                             bnb_worker_type_t thread_type);
+                             search_strategy_t thread_type);
 
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
-  // Initialize diving heuristics settings
-  void initialize_diving_heuristics_settings(std::vector<bnb_worker_type_t>& diving_strategies);
-
-  // Ramp-up phase of the solver, where we greedily expand the tree until
-  // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
-  void exploration_ramp_up(mip_node_t<i_t, f_t>* node, i_t initial_heap_size);
-
   // We use best-first to pick the `start_node` and then perform a depth-first search
   // from this node (i.e., a plunge). It can only backtrack to a sibling node.
   // Unexplored nodes in the subtree are inserted back into the global heap.
-  void plunge_from(i_t task_id,
-                   mip_node_t<i_t, f_t>* start_node,
-                   lp_problem_t<i_t, f_t>& leaf_problem,
-                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                   basis_update_mpf_t<i_t, f_t>& basis_update,
-                   std::vector<i_t>& basic_list,
-                   std::vector<i_t>& nonbasic_list);
-
-  // Each "main" thread pops a node from the global heap and then performs a plunge
-  // (i.e., a shallow dive) into the subtree determined by the node.
-  void best_first_thread(i_t task_id);
+  void plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker);
 
   // Perform a deep dive in the subtree determined by the `start_node` in order
   // to find integer feasible solutions.
-  void dive_from(mip_node_t<i_t, f_t>& start_node,
-                 const std::vector<f_t>& start_lower,
-                 const std::vector<f_t>& start_upper,
-                 lp_problem_t<i_t, f_t>& leaf_problem,
-                 bounds_strengthening_t<i_t, f_t>& node_presolver,
-                 basis_update_mpf_t<i_t, f_t>& basis_update,
-                 std::vector<i_t>& basic_list,
-                 std::vector<i_t>& nonbasic_list,
-                 bnb_worker_type_t diving_type);
-
-  // Each diving thread pops the first node from the dive queue and then performs
-  // a deep dive into the subtree determined by the node.
-  void diving_thread(bnb_worker_type_t diving_type);
+  void dive_with(branch_and_bound_worker_t<i_t, f_t>* worker);
+
+  // Run the scheduler whose will schedule and manage
+  // all the other workers.
+  void run_scheduler();
+
+  // Run the branch-and-bound algorithm in single threaded mode.
+  // This disable all diving heuristics.
+  void single_threaded_solve();
 
   // Solve the LP relaxation of a leaf node
   dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
-                               lp_problem_t<i_t, f_t>& leaf_problem,
-                               lp_solution_t<i_t, f_t>& leaf_solution,
-                               std::vector<f_t>& leaf_edge_norms,
-                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                               std::vector<i_t>& basic_list,
-                               std::vector<i_t>& nonbasic_list,
-                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                               bnb_worker_type_t thread_type,
-                               bool recompute_bounds_and_basis,
-                               const std::vector<f_t>& root_lower,
-                               const std::vector<f_t>& root_upper,
-                               bnb_stats_t<i_t, f_t>& stats,
+                               branch_and_bound_worker_t<i_t, f_t>* worker,
+                               branch_and_bound_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
   // Update the tree based on the LP relaxation. Returns the status
   // of the node and, if appropriated, the preferred rounding direction
   // when visiting the children.
-  std::pair<node_status_t, rounding_direction_t> update_tree(mip_node_t<i_t, f_t>* node_ptr,
-                                                             search_tree_t<i_t, f_t>& search_tree,
-                                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                                             lp_solution_t<i_t, f_t>& leaf_solution,
-                                                             std::vector<f_t>& leaf_edge_norms,
-                                                             bnb_worker_type_t thread_type,
-                                                             dual::status_t lp_status,
-                                                             logger_t& log);
+  std::pair<node_status_t, rounding_direction_t> update_tree(
+    mip_node_t<i_t, f_t>* node_ptr,
+    search_tree_t<i_t, f_t>& search_tree,
+    branch_and_bound_worker_t<i_t, f_t>* worker,
+    dual::status_t lp_status,
+    logger_t& log);
 
   // Selects the variable to branch on.
   branch_variable_t<i_t> variable_selection(mip_node_t<i_t, f_t>* node_ptr,
                                             const std::vector<i_t>& fractional,
-                                            const std::vector<f_t>& solution,
-                                            bnb_worker_type_t type);
+                                            branch_and_bound_worker_t<i_t, f_t>* worker);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/branch_and_bound_worker.hpp b/cpp/src/dual_simplex/branch_and_bound_worker.hpp
new file mode 100644
index 000000000..99a59a871
--- /dev/null
+++ b/cpp/src/dual_simplex/branch_and_bound_worker.hpp
@@ -0,0 +1,280 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/phase2.hpp>
+
+#include <utilities/pcgenerator.hpp>
+
+#include <array>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+constexpr int num_search_strategies = 5;
+
+// Indicate the search and variable selection algorithms used by each thread
+// in B&B (See [1]).
+//
+// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
+// Berlin, 2007. doi: 10.14279/depositonce-1634.
+enum search_strategy_t : int {
+  BEST_FIRST         = 0,  // Best-First + Plunging.
+  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
+  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
+  GUIDED_DIVING      = 3,  // Guided diving (9.2.3).
+  COEFFICIENT_DIVING = 4   // Coefficient diving (9.2.1)
+};
+
+template <typename i_t, typename f_t>
+struct branch_and_bound_stats_t {
+  f_t start_time                         = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time  = 0.0;
+  omp_atomic_t<int64_t> nodes_explored   = 0;
+  omp_atomic_t<int64_t> nodes_unexplored = 0;
+  omp_atomic_t<int64_t> total_lp_iters   = 0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+  omp_atomic_t<f_t> last_log             = 0.0;
+};
+
+template <typename i_t, typename f_t>
+class branch_and_bound_worker_t {
+ public:
+  const i_t worker_id;
+  omp_atomic_t<search_strategy_t> search_strategy;
+  omp_atomic_t<bool> is_active;
+  omp_atomic_t<f_t> lower_bound;
+
+  lp_problem_t<i_t, f_t> leaf_problem;
+  lp_solution_t<i_t, f_t> leaf_solution;
+  std::vector<f_t> leaf_edge_norms;
+
+  basis_update_mpf_t<i_t, f_t> basis_factors;
+  std::vector<i_t> basic_list;
+  std::vector<i_t> nonbasic_list;
+
+  bounds_strengthening_t<i_t, f_t> node_presolver;
+  std::vector<bool> bounds_changed;
+
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  mip_node_t<i_t, f_t>* start_node;
+
+  pcgenerator_t rng;
+
+  bool recompute_basis  = true;
+  bool recompute_bounds = true;
+
+  branch_and_bound_worker_t(i_t worker_id,
+                            const lp_problem_t<i_t, f_t>& original_lp,
+                            const csr_matrix_t<i_t, f_t>& Arow,
+                            const std::vector<variable_type_t>& var_type,
+                            const simplex_solver_settings_t<i_t, f_t>& settings)
+    : worker_id(worker_id),
+      search_strategy(BEST_FIRST),
+      is_active(false),
+      lower_bound(-std::numeric_limits<f_t>::infinity()),
+      leaf_problem(original_lp),
+      leaf_solution(original_lp.num_rows, original_lp.num_cols),
+      basis_factors(original_lp.num_rows, settings.refactor_frequency),
+      basic_list(original_lp.num_rows),
+      nonbasic_list(),
+      node_presolver(leaf_problem, Arow, {}, var_type),
+      bounds_changed(original_lp.num_cols, false),
+      rng(settings.random_seed + pcgenerator_t::default_seed + worker_id,
+          pcgenerator_t::default_stream ^ worker_id)
+  {
+  }
+
+  // Set the `start_node` for best-first search.
+  void init_best_first(mip_node_t<i_t, f_t>* node, const lp_problem_t<i_t, f_t>& original_lp)
+  {
+    start_node      = node;
+    start_lower     = original_lp.lower;
+    start_upper     = original_lp.upper;
+    search_strategy = BEST_FIRST;
+    lower_bound     = node->lower_bound;
+    is_active       = true;
+  }
+
+  // Initialize the worker for diving, setting the `start_node`, `start_lower` and
+  // `start_upper`. Returns `true` if the starting node is feasible via
+  // bounds propagation.
+  bool init_diving(mip_node_t<i_t, f_t>* node,
+                   search_strategy_t type,
+                   const lp_problem_t<i_t, f_t>& original_lp,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    internal_node = node->detach_copy();
+    start_node    = &internal_node;
+
+    start_lower     = original_lp.lower;
+    start_upper     = original_lp.upper;
+    search_strategy = type;
+    lower_bound     = node->lower_bound;
+    is_active       = true;
+
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
+    return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper);
+  }
+
+  // Set the variables bounds for the LP relaxation of the current node.
+  bool set_lp_variable_bounds(mip_node_t<i_t, f_t>* node_ptr,
+                              const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    // Reset the bound_changed markers
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+
+    // Set the correct bounds for the leaf problem
+    if (recompute_bounds) {
+      leaf_problem.lower = start_lower;
+      leaf_problem.upper = start_upper;
+      node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
+
+    } else {
+      node_ptr->update_branched_variable_bounds(
+        leaf_problem.lower, leaf_problem.upper, bounds_changed);
+    }
+
+    return node_presolver.bounds_strengthening(
+      settings, bounds_changed, leaf_problem.lower, leaf_problem.upper);
+  }
+
+ private:
+  // For diving, we need to store the full node instead of
+  // of just a pointer, since it is not stored in the tree anymore.
+  // To keep the same interface across all worker types,
+  // this will be used as a temporary storage and
+  // will be pointed by `start_node`.
+  // For exploration, this will not be used.
+  mip_node_t<i_t, f_t> internal_node;
+};
+
+template <typename i_t, typename f_t>
+class branch_and_bound_worker_pool_t {
+ public:
+  void init(i_t num_workers,
+            const lp_problem_t<i_t, f_t>& original_lp,
+            const csr_matrix_t<i_t, f_t>& Arow,
+            const std::vector<variable_type_t>& var_type,
+            const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    workers_.resize(num_workers);
+    num_idle_workers_ = num_workers;
+    for (i_t i = 0; i < num_workers; ++i) {
+      workers_[i] = std::make_unique<branch_and_bound_worker_t<i_t, f_t>>(
+        i, original_lp, Arow, var_type, settings);
+      idle_workers_.push_front(i);
+    }
+
+    is_initialized = true;
+  }
+
+  // Here, we are assuming that the scheduler is the only
+  // thread that can retrieve/pop an idle worker.
+  branch_and_bound_worker_t<i_t, f_t>* get_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (idle_workers_.empty()) {
+      return nullptr;
+    } else {
+      i_t idx = idle_workers_.front();
+      return workers_[idx].get();
+    }
+  }
+
+  // Here, we are assuming that the scheduler is the only
+  // thread that can retrieve/pop an idle worker.
+  void pop_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (!idle_workers_.empty()) {
+      idle_workers_.pop_front();
+      num_idle_workers_--;
+    }
+  }
+
+  void return_worker_to_pool(branch_and_bound_worker_t<i_t, f_t>* worker)
+  {
+    worker->is_active = false;
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    idle_workers_.push_back(worker->worker_id);
+    num_idle_workers_++;
+  }
+
+  f_t get_lower_bound()
+  {
+    f_t lower_bound = std::numeric_limits<f_t>::infinity();
+
+    if (is_initialized) {
+      for (i_t i = 0; i < workers_.size(); ++i) {
+        if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) {
+          lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound);
+        }
+      }
+    }
+
+    return lower_bound;
+  }
+
+  i_t num_idle_workers() { return num_idle_workers_; }
+
+ private:
+  // Worker pool
+  std::vector<std::unique_ptr<branch_and_bound_worker_t<i_t, f_t>>> workers_;
+  bool is_initialized = false;
+
+  omp_mutex_t mutex_;
+  std::deque<i_t> idle_workers_;
+  omp_atomic_t<i_t> num_idle_workers_;
+};
+
+template <typename f_t, typename i_t>
+std::vector<search_strategy_t> get_search_strategies(
+  diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  std::vector<search_strategy_t> types;
+  types.reserve(num_search_strategies);
+  types.push_back(BEST_FIRST);
+  if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); }
+  if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); }
+  if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); }
+  if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); }
+  return types;
+}
+
+template <typename i_t>
+std::array<i_t, num_search_strategies> get_max_workers(
+  i_t num_workers, const std::vector<search_strategy_t>& strategies)
+{
+  std::array<i_t, num_search_strategies> max_num_workers;
+  max_num_workers.fill(0);
+
+  i_t bfs_workers             = std::max(strategies.size() == 1 ? num_workers : num_workers / 4, 1);
+  max_num_workers[BEST_FIRST] = bfs_workers;
+
+  i_t diving_workers = (num_workers - bfs_workers);
+  i_t m              = strategies.size() - 1;
+
+  for (size_t i = 1, k = 0; i < strategies.size(); ++i) {
+    i_t start                      = (double)k * diving_workers / m;
+    i_t end                        = (double)(k + 1) * diving_workers / m;
+    max_num_workers[strategies[i]] = end - start;
+    ++k;
+  }
+
+  return max_num_workers;
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp
index a56b4cce3..b3e91bc09 100644
--- a/cpp/src/dual_simplex/diving_heuristics.cpp
+++ b/cpp/src/dual_simplex/diving_heuristics.cpp
@@ -71,9 +71,8 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
                                          const std::vector<f_t>& root_solution,
                                          logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
-  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  f_t max_score                  = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   constexpr f_t eps              = 1e-6;
 
@@ -127,6 +126,15 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
     }
   }
 
+  // If we cannot choose the variable, then arbitrarily pick the first
+  // fractional variable and round it down. This only happens when
+  // there is only one fractional variable and its the pseudocost is
+  // infinite for both direction.
+  if (round_dir == rounding_direction_t::NONE) {
+    branch_var = fractional[0];
+    round_dir  = rounding_direction_t::DOWN;
+  }
+
   assert(round_dir != rounding_direction_t::NONE);
   assert(branch_var >= 0);
 
@@ -146,9 +154,8 @@ branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
                                      const std::vector<f_t>& incumbent,
                                      logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
-  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  f_t max_score                  = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   constexpr f_t eps              = 1e-6;
 
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 5ee4f49d1..8b2211c1d 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -227,18 +227,26 @@ class mip_node_t {
   }
 
   // This method creates a copy of the current node
-  // with its parent set to `nullptr` and `depth = 0`.
+  // with its parent set to `nullptr`
   // This detaches the node from the tree.
   mip_node_t<i_t, f_t> detach_copy() const
   {
-    mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
+    mip_node_t<i_t, f_t> copy;
+    copy.lower_bound        = lower_bound;
+    copy.objective_estimate = objective_estimate;
+    copy.depth              = depth;
+    copy.node_id            = node_id;
+    copy.integer_infeasible = integer_infeasible;
+    copy.vstatus            = vstatus;
     copy.branch_var         = branch_var;
     copy.branch_dir         = branch_dir;
     copy.branch_var_lower   = branch_var_lower;
     copy.branch_var_upper   = branch_var_upper;
     copy.fractional_val     = fractional_val;
-    copy.objective_estimate = objective_estimate;
-    copy.node_id            = node_id;
+    copy.parent             = nullptr;
+    copy.children[0]        = nullptr;
+    copy.children[1]        = nullptr;
+    copy.status             = node_status_t::PENDING;
     return copy;
   }
 
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
index 28072795a..b5c9a9ea7 100644
--- a/cpp/src/dual_simplex/node_queue.hpp
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -114,26 +114,29 @@ class node_queue_t {
 
   std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
     auto entry = best_first_heap.pop();
+
     if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
+
     return std::nullopt;
   }
 
   std::optional<mip_node_t<i_t, f_t>*> pop_diving()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+
     while (!diving_heap.empty()) {
       auto entry = diving_heap.pop();
+
       if (entry.has_value()) {
         if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
       }
     }
+
     return std::nullopt;
   }
 
-  void lock() { mutex.lock(); }
-
-  void unlock() { mutex.unlock(); }
-
   i_t diving_queue_size()
   {
     std::lock_guard<omp_mutex_t> lock(mutex);
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index 978896887..d104036ea 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -619,7 +619,7 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
   }
 
   constexpr bool run_bounds_strengthening = false;
-  if (run_bounds_strengthening) {
+  if constexpr (run_bounds_strengthening) {
     csr_matrix_t<i_t, f_t> Arow(1, 1, 1);
     problem.A.to_compressed_row(Arow);
 
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 015034b77..682bdaa6f 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -88,7 +88,7 @@ void strong_branch_helper(i_t start,
       }
 
       if (branch == 0) {
-        pc.strong_branch_down[k] = obj - root_obj;
+        pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0);
         if (verbose) {
           settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n",
                               thread_id,
@@ -99,7 +99,7 @@ void strong_branch_helper(i_t start,
                               toc(start_time));
         }
       } else {
-        pc.strong_branch_up[k] = obj - root_obj;
+        pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0);
         if (verbose) {
           settings.log.printf(
             "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e "
@@ -135,6 +135,83 @@ void strong_branch_helper(i_t start,
   }
 }
 
+template <typename i_t, typename f_t>
+f_t trial_branching(const lp_problem_t<i_t, f_t>& original_lp,
+                    const simplex_solver_settings_t<i_t, f_t>& settings,
+                    const std::vector<variable_type_t>& var_types,
+                    const std::vector<variable_status_t>& vstatus,
+                    const std::vector<f_t>& edge_norms,
+                    const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                    const std::vector<i_t>& basic_list,
+                    const std::vector<i_t>& nonbasic_list,
+                    i_t branch_var,
+                    f_t branch_var_lower,
+                    f_t branch_var_upper,
+                    f_t upper_bound,
+                    i_t bnb_lp_iter_per_node,
+                    f_t start_time,
+                    i_t upper_max_lp_iter,
+                    i_t lower_max_lp_iter,
+                    omp_atomic_t<int64_t>& total_lp_iter)
+{
+  lp_problem_t child_problem      = original_lp;
+  child_problem.lower[branch_var] = branch_var_lower;
+  child_problem.upper[branch_var] = branch_var_upper;
+
+  const bool initialize_basis                        = false;
+  simplex_solver_settings_t<i_t, f_t> child_settings = settings;
+  child_settings.set_log(false);
+  i_t lp_iter_upper              = upper_max_lp_iter;
+  i_t lp_iter_lower              = lower_max_lp_iter;
+  child_settings.iteration_limit = std::clamp(bnb_lp_iter_per_node, lp_iter_lower, lp_iter_upper);
+  child_settings.cut_off         = upper_bound + settings.dual_tol;
+  child_settings.inside_mip      = 2;
+  child_settings.scale_columns   = false;
+
+  lp_solution_t<i_t, f_t> solution(original_lp.num_rows, original_lp.num_cols);
+  i_t iter                                         = 0;
+  std::vector<variable_status_t> child_vstatus     = vstatus;
+  std::vector<f_t> child_edge_norms                = edge_norms;
+  std::vector<i_t> child_basic_list                = basic_list;
+  std::vector<i_t> child_nonbasic_list             = nonbasic_list;
+  basis_update_mpf_t<i_t, f_t> child_basis_factors = basis_factors;
+
+  // Only refactor the basis if we encounter numerical issues.
+  child_basis_factors.set_refactor_frequency(upper_max_lp_iter);
+
+  dual::status_t status = dual_phase2_with_advanced_basis(2,
+                                                          0,
+                                                          initialize_basis,
+                                                          start_time,
+                                                          child_problem,
+                                                          child_settings,
+                                                          child_vstatus,
+                                                          child_basis_factors,
+                                                          child_basic_list,
+                                                          child_nonbasic_list,
+                                                          solution,
+                                                          iter,
+                                                          child_edge_norms);
+  total_lp_iter += iter;
+  settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n",
+                     branch_var,
+                     child_problem.lower[branch_var],
+                     child_problem.upper[branch_var],
+                     iter,
+                     dual::status_to_string(status).c_str(),
+                     compute_objective(child_problem, solution.x));
+
+  if (status == dual::status_t::DUAL_UNBOUNDED) {
+    // LP was infeasible
+    return std::numeric_limits<f_t>::infinity();
+  } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
+             status == dual::status_t::CUTOFF) {
+    return compute_objective(child_problem, solution.x);
+  } else {
+    return std::numeric_limits<f_t>::quiet_NaN();
+  }
+}
+
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -350,15 +427,31 @@ void strong_branching(const user_problem_t<i_t, f_t>& original_problem,
   pc.update_pseudo_costs_from_strong_branching(fractional, root_soln);
 }
 
+template <typename i_t, typename f_t>
+f_t pseudo_costs_t<i_t, f_t>::calculate_pseudocost_score(i_t j,
+                                                         const std::vector<f_t>& solution,
+                                                         f_t pseudo_cost_up_avg,
+                                                         f_t pseudo_cost_down_avg) const
+{
+  constexpr f_t eps = 1e-6;
+  i_t num_up        = pseudo_cost_num_up[j];
+  i_t num_down      = pseudo_cost_num_down[j];
+  f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+  f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+  f_t f_down        = solution[j] - std::floor(solution[j]);
+  f_t f_up          = std::ceil(solution[j]) - solution[j];
+  return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+}
+
 template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-  const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
+  const f_t change_in_obj = std::max(leaf_objective - node_ptr->lower_bound, 0.0);
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
                               : std::ceil(node_ptr->fractional_val) - node_ptr->fractional_val;
+
   if (node_ptr->branch_dir == rounding_direction_t::DOWN) {
     pseudo_cost_sum_down[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_down[node_ptr->branch_var]++;
@@ -412,13 +505,55 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
+  i_t branch_var = fractional[0];
+  f_t max_score  = -1;
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
 
-  const i_t num_fractional = fractional.size();
-  std::vector<f_t> pseudo_cost_up(num_fractional);
-  std::vector<f_t> pseudo_cost_down(num_fractional);
-  std::vector<f_t> score(num_fractional);
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
+  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
+             num_initialized_down,
+             num_initialized_up,
+             pseudo_cost_down_avg,
+             pseudo_cost_up_avg);
+
+  for (i_t j : fractional) {
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+  }
+
+  log.debug("Pseudocost branching on %d. Value %e. Score %e.\n",
+            branch_var,
+            solution[branch_var],
+            max_score);
+
+  return branch_var;
+}
+
+template <typename i_t, typename f_t>
+i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
+  mip_node_t<i_t, f_t>* node_ptr,
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<variable_type_t>& var_types,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
+  const branch_and_bound_stats_t<i_t, f_t>& bnb_stats,
+  f_t upper_bound,
+  int max_num_tasks,
+  logger_t& log)
+{
+  constexpr f_t eps = 1e-6;
+  f_t start_time    = bnb_stats.start_time;
+  i_t branch_var    = fractional[0];
+  f_t max_score     = -1;
   i_t num_initialized_down;
   i_t num_initialized_up;
   f_t pseudo_cost_down_avg;
@@ -432,39 +567,162 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
              pseudo_cost_down_avg,
              pseudo_cost_up_avg);
 
-  for (i_t k = 0; k < num_fractional; k++) {
-    const i_t j = fractional[k];
-    if (pseudo_cost_num_down[j] != 0) {
-      pseudo_cost_down[k] = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
-    } else {
-      pseudo_cost_down[k] = pseudo_cost_down_avg;
+  const int64_t branch_and_bound_lp_iters = bnb_stats.total_lp_iters;
+  const int64_t branch_and_bound_explored = bnb_stats.nodes_explored;
+  const i_t branch_and_bound_lp_iter_per_node =
+    branch_and_bound_lp_iters / bnb_stats.nodes_explored;
+
+  i_t reliable_threshold = settings.reliability_branching;
+  if (reliable_threshold < 0) {
+    const i_t max_threshold            = reliability_branching_settings.max_reliable_threshold;
+    const i_t min_threshold            = reliability_branching_settings.min_reliable_threshold;
+    const f_t iter_factor              = reliability_branching_settings.bnb_lp_factor;
+    const i_t iter_offset              = reliability_branching_settings.bnb_lp_offset;
+    const int64_t alpha                = iter_factor * branch_and_bound_lp_iters;
+    const int64_t max_reliability_iter = alpha + reliability_branching_settings.bnb_lp_offset;
+
+    f_t iter_fraction =
+      (max_reliability_iter - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0);
+    iter_fraction = std::min(1.0, iter_fraction);
+    iter_fraction = std::max((alpha - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0),
+                             iter_fraction);
+    reliable_threshold = (1 - iter_fraction) * min_threshold + iter_fraction * max_threshold;
+    reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0;
+  }
+
+  std::vector<i_t> unreliable_list;
+  omp_mutex_t score_mutex;
+
+  for (i_t j : fractional) {
+    if (pseudo_cost_num_down[j] < reliable_threshold ||
+        pseudo_cost_num_up[j] < reliable_threshold) {
+      unreliable_list.push_back(j);
+      continue;
     }
 
-    if (pseudo_cost_num_up[j] != 0) {
-      pseudo_cost_up[k] = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
-    } else {
-      pseudo_cost_up[k] = pseudo_cost_up_avg;
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
     }
-    constexpr f_t eps = 1e-6;
-    const f_t f_down  = solution[j] - std::floor(solution[j]);
-    const f_t f_up    = std::ceil(solution[j]) - solution[j];
-    score[k] =
-      std::max(f_down * pseudo_cost_down[k], eps) * std::max(f_up * pseudo_cost_up[k], eps);
   }
 
-  i_t branch_var = fractional[0];
-  f_t max_score  = -1;
-  i_t select     = -1;
-  for (i_t k = 0; k < num_fractional; k++) {
-    if (score[k] > max_score) {
-      max_score  = score[k];
-      branch_var = fractional[k];
-      select     = k;
+  if (unreliable_list.empty()) {
+    log.printf(
+      "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
+
+    return branch_var;
+  }
+
+  const int num_tasks          = std::max(max_num_tasks, 1);
+  const int task_priority      = reliability_branching_settings.task_priority;
+  const i_t max_num_candidates = reliability_branching_settings.max_num_candidates;
+  const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
+
+  assert(task_priority > 0);
+  assert(max_num_candidates > 0);
+  assert(num_candidates > 0);
+  assert(num_tasks > 0);
+
+  log.printf(
+    "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n",
+    strong_branching_lp_iter.load(),
+    branch_and_bound_lp_iters,
+    unreliable_list.size(),
+    num_tasks,
+    reliable_threshold);
+
+  // Shuffle the unreliable list so every variable has the same chance to be selected.
+  if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); }
+
+  if (toc(start_time) > settings.time_limit) {
+    log.printf("Time limit reached");
+    return branch_var;
+  }
+
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \
+  shared(score_mutex)
+  for (i_t i = 0; i < num_candidates; ++i) {
+    const i_t j = unreliable_list[i];
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    pseudo_cost_mutex_down[j].lock();
+    if (pseudo_cost_num_down[j] < reliable_threshold) {
+      // Do trial branching on the down branch
+      f_t obj = trial_branching(worker->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker->leaf_edge_norms,
+                                worker->basis_factors,
+                                worker->basic_list,
+                                worker->nonbasic_list,
+                                j,
+                                worker->leaf_problem.lower[j],
+                                std::floor(solution[j]),
+                                upper_bound,
+                                branch_and_bound_lp_iter_per_node,
+                                start_time,
+                                reliability_branching_settings.upper_max_lp_iter,
+                                reliability_branching_settings.lower_max_lp_iter,
+                                strong_branching_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+        f_t change_in_x   = solution[j] - std::floor(solution[j]);
+        pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_down[j]++;
+      }
     }
+    pseudo_cost_mutex_down[j].unlock();
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    pseudo_cost_mutex_up[j].lock();
+    if (pseudo_cost_num_up[j] < reliable_threshold) {
+      f_t obj = trial_branching(worker->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker->leaf_edge_norms,
+                                worker->basis_factors,
+                                worker->basic_list,
+                                worker->nonbasic_list,
+                                j,
+                                std::ceil(solution[j]),
+                                worker->leaf_problem.upper[j],
+                                upper_bound,
+                                branch_and_bound_lp_iter_per_node,
+                                start_time,
+                                reliability_branching_settings.upper_max_lp_iter,
+                                reliability_branching_settings.lower_max_lp_iter,
+                                strong_branching_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+        f_t change_in_x   = std::ceil(solution[j]) - solution[j];
+        pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_up[j]++;
+      }
+    }
+    pseudo_cost_mutex_up[j].unlock();
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    score_mutex.lock();
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+    score_mutex.unlock();
   }
 
   log.printf(
-    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
+    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
 
   return branch_var;
 }
@@ -475,8 +733,6 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
                                            f_t lower_bound,
                                            logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-
   const i_t num_fractional = fractional.size();
   f_t estimate             = lower_bound;
 
@@ -487,26 +743,15 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
 
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
-  for (i_t k = 0; k < num_fractional; k++) {
-    const i_t j          = fractional[k];
-    f_t pseudo_cost_down = 0;
-    f_t pseudo_cost_up   = 0;
-
-    if (pseudo_cost_num_down[j] != 0) {
-      pseudo_cost_down = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
-    } else {
-      pseudo_cost_down = pseudo_cost_down_avg;
-    }
-
-    if (pseudo_cost_num_up[j] != 0) {
-      pseudo_cost_up = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
-    } else {
-      pseudo_cost_up = pseudo_cost_up_avg;
-    }
+  for (i_t j : fractional) {
     constexpr f_t eps = 1e-6;
-    const f_t f_down  = solution[j] - std::floor(solution[j]);
-    const f_t f_up    = std::ceil(solution[j]) - solution[j];
-    estimate += std::min(pseudo_cost_down * f_down, pseudo_cost_up * f_up);
+    i_t num_up        = pseudo_cost_num_up[j];
+    i_t num_down      = pseudo_cost_num_down[j];
+    f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+    f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+    f_t f_down        = solution[j] - std::floor(solution[j]);
+    f_t f_up          = std::ceil(solution[j]) - solution[j];
+    estimate += std::min(pc_down * f_down, pc_up * f_up);
   }
 
   log.printf("pseudocost estimate = %e\n", estimate);
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 4bdde710f..1d3b0deef 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -7,16 +7,52 @@
 
 #pragma once
 
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/branch_and_bound_worker.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/types.hpp>
 #include <utilities/omp_helpers.hpp>
+#include <utilities/pcgenerator.hpp>
 
 #include <omp.h>
 
 namespace cuopt::linear_programming::dual_simplex {
 
+template <typename i_t, typename f_t>
+struct reliability_branching_settings_t {
+  // Lower bound for the maximum number of LP iterations for a single trial branching
+  i_t lower_max_lp_iter = 10;
+
+  // Upper bound for the maximum number of LP iterations for a single trial branching
+  i_t upper_max_lp_iter = 500;
+
+  // Priority of the tasks created when running the trial branching in parallel.
+  // Set to 1 to have the same priority as the other tasks.
+  i_t task_priority = 5;
+
+  // The maximum number of candidates initialized by strong branching in a single
+  // node
+  i_t max_num_candidates = 100;
+
+  // Define the maximum number of iteration spent in strong branching.
+  // Let `bnb_lp_iter` = total number of iterations in B&B, then
+  // `max iter in strong branching = bnb_lp_factor * bnb_lp_iter + bnb_lp_offset`.
+  // This is used for determining the `reliable_threshold`.
+  f_t bnb_lp_factor = 0.5;
+  i_t bnb_lp_offset = 100000;
+
+  // Maximum and minimum points in curve to determine the value
+  // of the `reliable_threshold` based on the current number of LP
+  // iterations in strong branching and B&B. Since it is a
+  // a curve, the actual value of `reliable_threshold` may be
+  // higher than `max_reliable_threshold`.
+  // Only used when `reliable_threshold` is negative
+  i_t max_reliable_threshold = 5;
+  i_t min_reliable_threshold = 1;
+};
+
 template <typename i_t, typename f_t>
 class pseudo_costs_t {
  public:
@@ -24,7 +60,9 @@ class pseudo_costs_t {
     : pseudo_cost_sum_down(num_variables),
       pseudo_cost_sum_up(num_variables),
       pseudo_cost_num_down(num_variables),
-      pseudo_cost_num_up(num_variables)
+      pseudo_cost_num_up(num_variables),
+      pseudo_cost_mutex_up(num_variables),
+      pseudo_cost_mutex_down(num_variables)
   {
   }
 
@@ -36,6 +74,8 @@ class pseudo_costs_t {
     pseudo_cost_sum_up.assign(num_variables, 0);
     pseudo_cost_num_down.assign(num_variables, 0);
     pseudo_cost_num_up.assign(num_variables, 0);
+    pseudo_cost_mutex_up.resize(num_variables);
+    pseudo_cost_mutex_down.resize(num_variables);
   }
 
   void initialized(i_t& num_initialized_down,
@@ -52,17 +92,37 @@ class pseudo_costs_t {
                          const std::vector<f_t>& solution,
                          logger_t& log);
 
+  i_t reliable_variable_selection(mip_node_t<i_t, f_t>* node_ptr,
+                                  const std::vector<i_t>& fractional,
+                                  const std::vector<f_t>& solution,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                                  const std::vector<variable_type_t>& var_types,
+                                  branch_and_bound_worker_t<i_t, f_t>* worker,
+                                  const branch_and_bound_stats_t<i_t, f_t>& bnb_stats,
+                                  f_t upper_bound,
+                                  int max_num_tasks,
+                                  logger_t& log);
+
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);
-  std::vector<f_t> pseudo_cost_sum_up;
-  std::vector<f_t> pseudo_cost_sum_down;
-  std::vector<i_t> pseudo_cost_num_up;
-  std::vector<i_t> pseudo_cost_num_down;
+
+  f_t calculate_pseudocost_score(i_t j,
+                                 const std::vector<f_t>& solution,
+                                 f_t pseudo_cost_up_avg,
+                                 f_t pseudo_cost_down_avg) const;
+
+  reliability_branching_settings_t<i_t, f_t> reliability_branching_settings;
+
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_up;
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_down;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_up;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_down;
   std::vector<f_t> strong_branch_down;
   std::vector<f_t> strong_branch_up;
-
-  omp_mutex_t mutex;
+  std::vector<omp_mutex_t> pseudo_cost_mutex_up;
+  std::vector<omp_mutex_t> pseudo_cost_mutex_down;
   omp_atomic_t<i_t> num_strong_branches_completed = 0;
+  omp_atomic_t<int64_t> strong_branching_lp_iter  = 0;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index f9911ee53..34c384949 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -22,18 +22,25 @@ namespace cuopt::linear_programming::dual_simplex {
 
 template <typename i_t, typename f_t>
 struct diving_heuristics_settings_t {
-  i_t num_diving_workers = -1;
-
   // -1 automatic, 0 disabled, 1 enabled
   i_t line_search_diving = -1;
   i_t pseudocost_diving  = -1;
   i_t guided_diving      = -1;
   i_t coefficient_diving = -1;
 
-  i_t min_node_depth         = 10;
-  i_t node_limit             = 500;
+  // The minimum depth to start diving from.
+  i_t min_node_depth = 10;
+
+  // The maximum number of nodes when performing a dive.
+  i_t node_limit = 500;
+
+  // The maximum number of dual simplex iteration allowed
+  // in a single dive. This set in terms of the total number of
+  // iterations in the best-first threads.
   f_t iteration_limit_factor = 0.05;
-  i_t backtrack_limit        = 5;
+
+  // The maximum backtracking allowed.
+  i_t backtrack_limit = 5;
 };
 
 template <typename i_t, typename f_t>
@@ -95,16 +102,14 @@ struct simplex_solver_settings_t {
       reduced_cost_strengthening(-1),
       cut_change_threshold(1e-3),
       cut_min_orthogonality(0.5),
-      num_bfs_workers(std::max(num_threads / 4, 1)),
       random_seed(0),
+      reliability_branching(-1),
       inside_mip(0),
       sub_mip(0),
-      reliability_branching(-1),
       solution_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       concurrent_halt(nullptr)
   {
-    diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1);
   }
 
   void set_log(bool logging) const { log.log = logging; }
@@ -175,15 +180,20 @@ struct simplex_solver_settings_t {
                                    // strengthening
   f_t cut_change_threshold;        // threshold for cut change
   f_t cut_min_orthogonality;       // minimum orthogonality for cuts
-  i_t num_bfs_workers;             // number of threads dedicated to the best-first search
   i_t mip_batch_pdlp_strong_branching{0};  // 0 if not using batch PDLP for strong branching, 1 if
                                            // using batch PDLP for strong branching
 
   diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
 
+  // Settings for the reliability branching.
+  // - -1: automatic
+  // - 0: disable (use pseudocost branching instead)
+  // - k > 0, a variable is considered reliable if it has been branched on k times.
+  i_t reliability_branching;
+
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   i_t sub_mip;     // 0 if in regular MIP solve, 1 if in sub-MIP solve
-  i_t reliability_branching;  // -1 automatic, 0 to disable, >0 to enable reliability branching
+
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
   std::function<void()> heuristic_preemption_callback;
diff --git a/cpp/src/dual_simplex/solution.hpp b/cpp/src/dual_simplex/solution.hpp
index a678e2fd7..5739cedae 100644
--- a/cpp/src/dual_simplex/solution.hpp
+++ b/cpp/src/dual_simplex/solution.hpp
@@ -72,8 +72,8 @@ class mip_solution_t {
   std::vector<f_t> x;
   f_t objective;
   f_t lower_bound;
-  i_t nodes_explored;
-  i_t simplex_iterations;
+  int64_t nodes_explored;
+  int64_t simplex_iterations;
   bool has_incumbent;
 };
 
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 9dc6ac9c5..0858bb75a 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -98,6 +98,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0},
+    {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits<i_t>::max(), -1}
   };
 
     // Bool parameters
diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu
index cb086e145..7b05f1964 100644
--- a/cpp/src/mip/diversity/lns/rins.cu
+++ b/cpp/src/mip/diversity/lns/rins.cu
@@ -259,20 +259,13 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
   branch_and_bound_settings.relative_mip_gap_tol =
     std::min(current_mip_gap, (f_t)settings.target_mip_gap);
-  branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-  branch_and_bound_settings.num_threads     = 2;
-  branch_and_bound_settings.num_bfs_workers = 1;
-  branch_and_bound_settings.max_cut_passes  = 0;
-  branch_and_bound_settings.sub_mip         = 1;
-
-  // In the future, let RINS use all the diving heuristics. For now,
-  // restricting to guided diving.
-  branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-  branch_and_bound_settings.diving_settings.line_search_diving = 0;
-  branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-  branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
-  branch_and_bound_settings.log.log                            = false;
-  branch_and_bound_settings.log.log_prefix                     = "[RINS] ";
+  branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+  branch_and_bound_settings.num_threads = 1;
+  branch_and_bound_settings.reliability_branching = 0;
+  branch_and_bound_settings.max_cut_passes        = 0;
+  branch_and_bound_settings.sub_mip               = 1;
+  branch_and_bound_settings.log.log               = false;
+  branch_and_bound_settings.log.log_prefix        = "[RINS] ";
   branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector<f_t>& solution,
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index e636e7471..6e1c4d4ae 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -104,19 +104,12 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.print_presolve_stats = false;
       branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-      branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-      branch_and_bound_settings.num_threads     = 2;
-      branch_and_bound_settings.num_bfs_workers = 1;
-      branch_and_bound_settings.max_cut_passes  = 0;
-      branch_and_bound_settings.sub_mip         = 1;
-
-      // In the future, let SubMIP use all the diving heuristics. For now,
-      // restricting to guided diving.
-      branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-      branch_and_bound_settings.diving_settings.line_search_diving = 0;
-      branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-      branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
-      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
+      branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+      branch_and_bound_settings.num_threads = 1;
+      branch_and_bound_settings.reliability_branching = 0;
+      branch_and_bound_settings.max_cut_passes        = 0;
+      branch_and_bound_settings.sub_mip               = 1;
+      branch_and_bound_settings.solution_callback     = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
       };
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 2829babc9..8583a76f8 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -180,9 +180,10 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     branch_and_bound_settings.print_presolve_stats = false;
     branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
     branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-    branch_and_bound_settings.integer_tol    = context.settings.tolerances.integrality_tolerance;
-    branch_and_bound_settings.max_cut_passes = context.settings.max_cut_passes;
-    branch_and_bound_settings.mir_cuts       = context.settings.mir_cuts;
+    branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+    branch_and_bound_settings.reliability_branching = solver_settings_.reliability_branching;
+    branch_and_bound_settings.max_cut_passes        = context.settings.max_cut_passes;
+    branch_and_bound_settings.mir_cuts              = context.settings.mir_cuts;
     branch_and_bound_settings.mixed_integer_gomory_cuts =
       context.settings.mixed_integer_gomory_cuts;
     branch_and_bound_settings.knapsack_cuts = context.settings.knapsack_cuts;
@@ -192,21 +193,15 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       context.settings.reduced_cost_strengthening;
     branch_and_bound_settings.cut_change_threshold  = context.settings.cut_change_threshold;
     branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality;
+    branch_and_bound_settings.mip_batch_pdlp_strong_branching =
+      context.settings.mip_batch_pdlp_strong_branching;
 
     if (context.settings.num_cpu_threads < 0) {
-      branch_and_bound_settings.num_threads = omp_get_max_threads() - 1;
+      branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1);
     } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
 
-    i_t num_threads                           = branch_and_bound_settings.num_threads;
-    i_t num_bfs_workers                       = std::max(1, num_threads / 4);
-    i_t num_diving_workers                    = std::max(1, num_threads - num_bfs_workers);
-    branch_and_bound_settings.num_bfs_workers = num_bfs_workers;
-    branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers;
-    branch_and_bound_settings.mip_batch_pdlp_strong_branching =
-      context.settings.mip_batch_pdlp_strong_branching;
-
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
       std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index e1b68bf88..f6e66472d 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -10,7 +10,8 @@
 #ifdef _OPENMP
 
 #include <omp.h>
-#include <type_traits>
+#include <memory>
+#include <utility>
 
 namespace cuopt {
 
@@ -18,14 +19,39 @@ namespace cuopt {
 // https://www.openmp.org/spec-html/5.1/openmpse39.html#x224-2570003.9
 class omp_mutex_t {
  public:
-  omp_mutex_t() { omp_init_lock(&mutex); }
-  virtual ~omp_mutex_t() { omp_destroy_lock(&mutex); }
-  void lock() { omp_set_lock(&mutex); }
-  void unlock() { omp_unset_lock(&mutex); }
-  bool try_lock() { return omp_test_lock(&mutex); }
+  omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); }
+
+  omp_mutex_t(const omp_mutex_t&) = delete;
+
+  omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); }
+
+  omp_mutex_t& operator=(const omp_mutex_t&) = delete;
+
+  omp_mutex_t& operator=(omp_mutex_t&& other)
+  {
+    if (&other != this) {
+      if (mutex) { omp_destroy_lock(mutex.get()); }
+      mutex = std::move(other.mutex);
+    }
+    return *this;
+  }
+
+  virtual ~omp_mutex_t()
+  {
+    if (mutex) {
+      omp_destroy_lock(mutex.get());
+      mutex.reset();
+    }
+  }
+
+  void lock() { omp_set_lock(mutex.get()); }
+
+  void unlock() { omp_unset_lock(mutex.get()); }
+
+  bool try_lock() { return omp_test_lock(mutex.get()); }
 
  private:
-  omp_lock_t mutex;
+  std::unique_ptr<omp_lock_t> mutex;
 };
 
 // Wrapper for omp atomic operations. See
@@ -42,7 +68,7 @@ class omp_atomic_t {
     return new_val;
   }
 
-  operator T() { return load(); }
+  operator T() const { return load(); }
   T operator+=(T inc) { return fetch_add(inc) + inc; }
   T operator-=(T inc) { return fetch_sub(inc) - inc; }
 
diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp
new file mode 100644
index 000000000..29a865f02
--- /dev/null
+++ b/cpp/src/utilities/pcgenerator.hpp
@@ -0,0 +1,159 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <stdint.h>
+#include <utility>
+#include <vector>
+
+// Copied from raft/PCGenerator (rng_device.cuh). We cannot use raft implementation
+// on the CPU (.cpp file) since the raft header includes CUDA code.
+// The original code is from https://www.pcg-random.org/.
+namespace cuopt {
+class pcgenerator_t {
+ public:
+  static constexpr uint64_t default_seed   = 0x853c49e6748fea9bULL;
+  static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL;
+
+  /**
+   * @brief ctor. Initializes the PCG
+   * @param rng_state is the generator state used for initializing the generator
+   * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences
+   * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a
+   * unique set of random numbers. This can be achieved by initializing the generator with same
+   * rng_state for all the threads and diststreamt values for subsequence.
+   */
+  pcgenerator_t(const uint64_t seed        = default_seed,
+                const uint64_t subsequence = default_stream,
+                uint64_t offset            = 0)
+  {
+    set_seed(seed, subsequence, offset);
+  }
+
+  // Set the seed, subsequence and offset of the PCG
+  void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0)
+  {
+    state  = uint64_t(0);
+    stream = (subsequence << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    state += seed;
+    next(discard);
+    skipahead(offset);
+  }
+
+  // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
+  // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
+  void skipahead(uint64_t offset)
+  {
+    uint64_t G = 1;
+    uint64_t h = 6364136223846793005ULL;
+    uint64_t C = 0;
+    uint64_t f = stream;
+    while (offset) {
+      if (offset & 1) {
+        G = G * h;
+        C = C * h + f;
+      }
+      f = f * (h + 1);
+      h = h * h;
+      offset >>= 1;
+    }
+    state = state * G + C;
+  }
+
+  /**
+   * @defgroup NextRand Generate the next random number
+   * @brief This code is derived from PCG basic code
+   * @{
+   */
+  uint32_t next_u32()
+  {
+    uint32_t ret;
+    uint64_t oldstate   = state;
+    state               = oldstate * 6364136223846793005ULL + stream;
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot        = oldstate >> 59u;
+    ret                 = (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+    return ret;
+  }
+
+  uint64_t next_u64()
+  {
+    uint64_t ret;
+    uint32_t a, b;
+    a   = next_u32();
+    b   = next_u32();
+    ret = uint64_t(a) | (uint64_t(b) << 32);
+    return ret;
+  }
+
+  int32_t next_i32()
+  {
+    int32_t ret;
+    uint32_t val;
+    val = next_u32();
+    ret = int32_t(val & 0x7fffffff);
+    return ret;
+  }
+
+  int64_t next_i64()
+  {
+    int64_t ret;
+    uint64_t val;
+    val = next_u64();
+    ret = int64_t(val & 0x7fffffffffffffff);
+    return ret;
+  }
+
+  float next_float() { return static_cast<float>((next_u32() >> 8) * 0x1.0p-24); }
+
+  double next_double() { return static_cast<double>((next_u64() >> 11) * 0x1.0p-53); }
+
+  template <typename T>
+  T next()
+  {
+    T val;
+    next(val);
+    return val;
+  }
+
+  void next(uint32_t& ret) { ret = next_u32(); }
+  void next(uint64_t& ret) { ret = next_u64(); }
+  void next(int32_t& ret) { ret = next_i32(); }
+  void next(int64_t& ret) { ret = next_i64(); }
+  void next(float& ret) { ret = next_float(); }
+  void next(double& ret) { ret = next_double(); }
+
+  /// Draws a sample from a uniform distribution. The samples are uniformly distributed over
+  /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward
+  /// some numbers in the range (scaling by floating-point).
+  template <typename T>
+  T uniform(T low, T high)
+  {
+    double val = next_double();
+    T range    = high - low;
+    return low + (val * range);
+  }
+
+  // Shuffles the contents of a sequence using the Fisher–Yates algorithm.
+  template <typename T>
+  void shuffle(std::vector<T>& seq)
+  {
+    if (seq.empty()) { return; }
+    for (size_t i = 0; i < seq.size() - 1; ++i) {
+      size_t j = uniform(i, seq.size());
+      if (j != i) std::swap(seq[i], seq[j]);
+    }
+  }
+
+ private:
+  uint64_t state;
+  uint64_t stream;
+};
+}  // namespace cuopt