diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index 213e38e5e..308c7087b 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -147,6 +147,7 @@ int run_single_file(std::string file_path,
                     int num_cpu_threads,
                     bool write_log_file,
                     bool log_to_console,
+                    int reliability_branching,
                     double time_limit)
 {
   const raft::handle_t handle_{};
@@ -196,7 +197,6 @@ int run_single_file(std::string file_path,
       }
     }
   }
-
   settings.time_limit                    = time_limit;
   settings.heuristics_only               = heuristics_only;
   settings.num_cpu_threads               = num_cpu_threads;
@@ -204,6 +204,7 @@ int run_single_file(std::string file_path,
   settings.tolerances.relative_tolerance = 1e-12;
   settings.tolerances.absolute_tolerance = 1e-6;
   settings.presolve                      = true;
+  settings.reliability_branching         = reliability_branching;
   cuopt::linear_programming::benchmark_info_t benchmark_info;
   settings.benchmark_info_ptr = &benchmark_info;
   auto start_run_solver       = std::chrono::high_resolution_clock::now();
@@ -256,6 +257,7 @@ void run_single_file_mp(std::string file_path,
                         int num_cpu_threads,
                         bool write_log_file,
                         bool log_to_console,
+                        int reliability_branching,
                         double time_limit)
 {
   std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
@@ -271,6 +273,7 @@ void run_single_file_mp(std::string file_path,
                                   num_cpu_threads,
                                   write_log_file,
                                   log_to_console,
+                                  reliability_branching,
                                   time_limit);
   // this is a bad design to communicate the result but better than adding complexity of IPC or
   // pipes
@@ -354,6 +357,11 @@ int main(int argc, char* argv[])
     .help("track allocations (t/f)")
     .default_value(std::string("f"));
 
+  program.add_argument("--reliability-branching")
+    .help("reliability branching: -1 (automatic), 0 (disable) or k > 0 (use k)")
+    .scan<'i', int>()
+    .default_value(-1);
+
   // Parse arguments
   try {
     program.parse_args(argc, argv);
@@ -376,12 +384,13 @@ int main(int argc, char* argv[])
   std::string result_file;
   int batch_num = -1;
 
-  bool heuristics_only   = program.get<std::string>("--heuristics-only")[0] == 't';
-  int num_cpu_threads    = program.get<int>("--num-cpu-threads");
-  bool write_log_file    = program.get<std::string>("--write-log-file")[0] == 't';
-  bool log_to_console    = program.get<std::string>("--log-to-console")[0] == 't';
-  double memory_limit    = program.get<double>("--memory-limit");
-  bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
+  bool heuristics_only      = program.get<std::string>("--heuristics-only")[0] == 't';
+  int num_cpu_threads       = program.get<int>("--num-cpu-threads");
+  bool write_log_file       = program.get<std::string>("--write-log-file")[0] == 't';
+  bool log_to_console       = program.get<std::string>("--log-to-console")[0] == 't';
+  double memory_limit       = program.get<double>("--memory-limit");
+  bool track_allocations    = program.get<std::string>("--track-allocations")[0] == 't';
+  int reliability_branching = program.get<int>("--reliability-branching");
 
   if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; }
 
@@ -469,6 +478,7 @@ int main(int argc, char* argv[])
                                num_cpu_threads,
                                write_log_file,
                                log_to_console,
+                               reliability_branching,
                                time_limit);
           } else if (sys_pid < 0) {
             std::cerr << "Fork failed!" << std::endl;
@@ -509,6 +519,7 @@ int main(int argc, char* argv[])
                     num_cpu_threads,
                     write_log_file,
                     log_to_console,
+                    reliability_branching,
                     time_limit);
   }
 
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 7caf7aeeb..c52c22c85 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -57,6 +57,7 @@
 #define CUOPT_MIP_HEURISTICS_ONLY             "mip_heuristics_only"
 #define CUOPT_MIP_SCALING                     "mip_scaling"
 #define CUOPT_MIP_PRESOLVE                    "mip_presolve"
+#define CUOPT_MIP_RELIABILITY_BRANCHING       "mip_reliability_branching"
 #define CUOPT_MIP_CUT_PASSES                  "mip_cut_passes"
 #define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts"
 #define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS   "mip_mixed_integer_gomory_cuts"
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
index 863e5d66d..c5c26884f 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
@@ -86,6 +86,7 @@ class mip_solver_settings_t {
   f_t time_limit                = std::numeric_limits<f_t>::infinity();
   i_t node_limit                = std::numeric_limits<i_t>::max();
   bool heuristics_only          = false;
+  i_t reliability_branching     = -1;
   i_t num_cpu_threads           = -1;  // -1 means use default number of threads in branch and bound
   i_t max_cut_passes            = 10;  // number of cut passes to make
   i_t mir_cuts                  = -1;
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index 8eca3ba8a..d9783053f 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -223,6 +223,9 @@ class basis_update_mpf_t {
     reset_stats();
   }
 
+  basis_update_mpf_t(const basis_update_mpf_t& other)            = default;
+  basis_update_mpf_t& operator=(const basis_update_mpf_t& other) = default;
+
   void print_stats() const
   {
     i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_;
@@ -381,6 +384,8 @@ class basis_update_mpf_t {
                      std::vector<i_t>& nonbasic_list,
                      std::vector<variable_status_t>& vstatus);
 
+  void set_refactor_frequency(i_t new_frequency) { refactor_frequency_ = new_frequency; }
+
  private:
   void clear()
   {
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index d35415904..c6e8b0d17 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -201,26 +201,26 @@ std::string user_mip_gap(f_t obj_value, f_t lower_bound)
 }
 
 #ifdef SHOW_DIVING_TYPE
-inline char feasible_solution_symbol(bnb_worker_type_t type)
+inline char feasible_solution_symbol(search_strategy_t strategy)
 {
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST: return 'B';
-    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'C';
-    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'L';
-    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'P';
-    case bnb_worker_type_t::GUIDED_DIVING: return 'G';
+  switch (strategy) {
+    case search_strategy_t::BEST_FIRST: return 'B';
+    case search_strategy_t::COEFFICIENT_DIVING: return 'C';
+    case search_strategy_t::LINE_SEARCH_DIVING: return 'L';
+    case search_strategy_t::PSEUDOCOST_DIVING: return 'P';
+    case search_strategy_t::GUIDED_DIVING: return 'G';
     default: return 'U';
   }
 }
 #else
-inline char feasible_solution_symbol(bnb_worker_type_t type)
+inline char feasible_solution_symbol(search_strategy_t strategy)
 {
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST: return 'B';
-    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'D';
-    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'D';
-    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'D';
-    case bnb_worker_type_t::GUIDED_DIVING: return 'D';
+  switch (strategy) {
+    case search_strategy_t::BEST_FIRST: return 'B';
+    case search_strategy_t::COEFFICIENT_DIVING: return 'D';
+    case search_strategy_t::LINE_SEARCH_DIVING: return 'D';
+    case search_strategy_t::PSEUDOCOST_DIVING: return 'D';
+    case search_strategy_t::GUIDED_DIVING: return 'D';
     default: return 'U';
   }
 }
@@ -273,7 +273,8 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   }
 #endif
 
-  upper_bound_ = inf;
+  upper_bound_    = inf;
+  root_objective_ = std::numeric_limits<f_t>::quiet_NaN();
 }
 
 template <typename i_t, typename f_t>
@@ -282,18 +283,21 @@ f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
   f_t lower_bound      = lower_bound_ceiling_.load();
   f_t heap_lower_bound = node_queue_.get_lower_bound();
   lower_bound          = std::min(heap_lower_bound, lower_bound);
+  lower_bound          = std::min(worker_pool_.get_lower_bound(), lower_bound);
 
-  for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
-    lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
+  if (std::isfinite(lower_bound)) {
+    return lower_bound;
+  } else if (std::isfinite(root_objective_)) {
+    return root_objective_;
+  } else {
+    return -inf;
   }
-
-  return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 {
-  if (is_running) {
+  if (is_running_) {
     f_t user_obj         = compute_user_objective(original_lp_, obj);
     f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
     std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
@@ -671,7 +675,7 @@ template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
                                                          const std::vector<f_t>& leaf_solution,
                                                          i_t leaf_depth,
-                                                         bnb_worker_type_t thread_type)
+                                                         search_strategy_t thread_type)
 {
   bool send_solution = false;
 
@@ -720,91 +724,64 @@ template <typename i_t, typename f_t>
 branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   mip_node_t<i_t, f_t>* node_ptr,
   const std::vector<i_t>& fractional,
-  const std::vector<f_t>& solution,
-  bnb_worker_type_t type)
+  branch_and_bound_worker_t<i_t, f_t>* worker)
 {
   logger_t log;
   log.log                        = false;
   i_t branch_var                 = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   std::vector<f_t> current_incumbent;
+  std::vector<f_t>& solution = worker->leaf_solution.x;
+
+  switch (worker->search_strategy) {
+    case search_strategy_t::BEST_FIRST:
+
+      if (settings_.reliability_branching != 0) {
+        branch_var = pc_.reliable_variable_selection(node_ptr,
+                                                     fractional,
+                                                     solution,
+                                                     settings_,
+                                                     var_types_,
+                                                     worker,
+                                                     exploration_stats_,
+                                                     upper_bound_,
+                                                     worker_pool_.num_idle_workers(),
+                                                     log);
+      } else {
+        branch_var = pc_.variable_selection(fractional, solution, log);
+      }
 
-  // If there is no incumbent, use pseudocost diving instead of guided diving
-  if (upper_bound_ == inf && type == bnb_worker_type_t::GUIDED_DIVING) {
-    type = bnb_worker_type_t::PSEUDOCOST_DIVING;
-  }
+      round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
 
-  switch (type) {
-    case bnb_worker_type_t::BEST_FIRST:
-      branch_var = pc_.variable_selection(fractional, solution, log);
-      round_dir  = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
       return {branch_var, round_dir};
 
-    case bnb_worker_type_t::COEFFICIENT_DIVING:
+    case search_strategy_t::COEFFICIENT_DIVING:
       return coefficient_diving(
         original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log);
 
-    case bnb_worker_type_t::LINE_SEARCH_DIVING:
+    case search_strategy_t::LINE_SEARCH_DIVING:
       return line_search_diving(fractional, solution, root_relax_soln_.x, log);
 
-    case bnb_worker_type_t::PSEUDOCOST_DIVING:
+    case search_strategy_t::PSEUDOCOST_DIVING:
       return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log);
 
-    case bnb_worker_type_t::GUIDED_DIVING:
+    case search_strategy_t::GUIDED_DIVING:
       mutex_upper_.lock();
       current_incumbent = incumbent_.x;
       mutex_upper_.unlock();
       return guided_diving(pc_, fractional, solution, current_incumbent, log);
 
     default:
-      log.debug("Unknown variable selection method: %d\n", type);
+      log.debug("Unknown variable selection method: %d\n", worker->search_strategy);
       return {-1, rounding_direction_t::NONE};
   }
 }
 
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::initialize_diving_heuristics_settings(
-  std::vector<bnb_worker_type_t>& diving_strategies)
-{
-  diving_strategies.reserve(4);
-
-  if (settings_.diving_settings.pseudocost_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::PSEUDOCOST_DIVING);
-  }
-
-  if (settings_.diving_settings.line_search_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::LINE_SEARCH_DIVING);
-  }
-
-  if (settings_.diving_settings.guided_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::GUIDED_DIVING);
-  }
-
-  if (settings_.diving_settings.coefficient_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::COEFFICIENT_DIVING);
-    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
-  }
-
-  if (diving_strategies.empty()) {
-    settings_.log.printf("Warning: All diving heuristics are disabled!\n");
-  }
-}
-
 template <typename i_t, typename f_t>
 dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   mip_node_t<i_t, f_t>* node_ptr,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  std::vector<f_t>& leaf_edge_norms,
-  basis_update_mpf_t<i_t, f_t>& basis_factors,
-  std::vector<i_t>& basic_list,
-  std::vector<i_t>& nonbasic_list,
-  bounds_strengthening_t<i_t, f_t>& node_presolver,
-  bnb_worker_type_t thread_type,
-  bool recompute_bounds_and_basis,
-  const std::vector<f_t>& root_lower,
-  const std::vector<f_t>& root_upper,
-  bnb_stats_t<i_t, f_t>& stats,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
+  branch_and_bound_stats_t<i_t, f_t>& stats,
   logger_t& log)
 {
 #ifdef DEBUG_BRANCHING
@@ -843,7 +820,7 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 #endif
 
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  assert(leaf_vstatus.size() == leaf_problem.num_cols);
+  assert(leaf_vstatus.size() == worker->leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
@@ -852,10 +829,10 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
-  if (thread_type != bnb_worker_type_t::BEST_FIRST) {
-    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+  if (worker->search_strategy != search_strategy_t::BEST_FIRST) {
+    int64_t bnb_lp_iters        = exploration_stats_.total_lp_iters;
     f_t factor                  = settings_.diving_settings.iteration_limit_factor;
-    i_t max_iter                = factor * bnb_lp_iters;
+    int64_t max_iter            = factor * bnb_lp_iters;
     lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
     if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; }
   }
@@ -882,24 +859,9 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
     node_ptr->vstatus[node_ptr->branch_var]);
 #endif
 
-  // Reset the bound_changed markers
-  std::vector<bool> bounds_changed(original_lp_.num_cols, false);
-
-  // Set the correct bounds for the leaf problem
-  if (recompute_bounds_and_basis) {
-    leaf_problem.lower = root_lower;
-    leaf_problem.upper = root_upper;
-    node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
-
-  } else {
-    node_ptr->update_branched_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, bounds_changed);
-  }
-
-  bool feasible = node_presolver.bounds_strengthening(
-    lp_settings, bounds_changed, leaf_problem.lower, leaf_problem.upper);
-
+  bool feasible            = worker->set_lp_variable_bounds(node_ptr, settings_);
   dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
+  worker->leaf_edge_norms  = edge_norms_;
 
   if (feasible) {
     i_t node_iter     = 0;
@@ -907,29 +869,29 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
     lp_status = dual_phase2_with_advanced_basis(2,
                                                 0,
-                                                recompute_bounds_and_basis,
+                                                worker->recompute_basis,
                                                 lp_start_time,
-                                                leaf_problem,
+                                                worker->leaf_problem,
                                                 lp_settings,
                                                 leaf_vstatus,
-                                                basis_factors,
-                                                basic_list,
-                                                nonbasic_list,
-                                                leaf_solution,
+                                                worker->basis_factors,
+                                                worker->basic_list,
+                                                worker->nonbasic_list,
+                                                worker->leaf_solution,
                                                 node_iter,
-                                                leaf_edge_norms);
+                                                worker->leaf_edge_norms);
 
     if (lp_status == dual::status_t::NUMERICAL) {
       log.debug("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id);
-      lp_status_t second_status = solve_linear_program_with_advanced_basis(leaf_problem,
+      lp_status_t second_status = solve_linear_program_with_advanced_basis(worker->leaf_problem,
                                                                            lp_start_time,
                                                                            lp_settings,
-                                                                           leaf_solution,
-                                                                           basis_factors,
-                                                                           basic_list,
-                                                                           nonbasic_list,
+                                                                           worker->leaf_solution,
+                                                                           worker->basis_factors,
+                                                                           worker->basic_list,
+                                                                           worker->nonbasic_list,
                                                                            leaf_vstatus,
-                                                                           leaf_edge_norms);
+                                                                           worker->leaf_edge_norms);
 
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
@@ -944,20 +906,18 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   return lp_status;
 }
-
 template <typename i_t, typename f_t>
 std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
   mip_node_t<i_t, f_t>* node_ptr,
   search_tree_t<i_t, f_t>& search_tree,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  std::vector<f_t>& leaf_edge_norms,
-  bnb_worker_type_t thread_type,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
   dual::status_t lp_status,
   logger_t& log)
 {
   const f_t abs_fathom_tol                     = settings_.absolute_mip_gap_tol / 10;
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+  lp_problem_t<i_t, f_t>& leaf_problem         = worker->leaf_problem;
+  lp_solution_t<i_t, f_t>& leaf_solution       = worker->leaf_solution;
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
@@ -997,12 +957,12 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
     }
 #endif
 
-    f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
-    node_ptr->lower_bound = leaf_objective;
+    f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x);
     search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
     pc_.update_pseudo_costs(node_ptr, leaf_objective);
+    node_ptr->lower_bound = leaf_objective;
 
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
       if (settings_.node_processed_callback != nullptr) {
         std::vector<f_t> original_x;
         uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
@@ -1012,15 +972,15 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 
     if (leaf_num_fractional == 0) {
       // Found a integer feasible solution
-      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
+      add_feasible_solution(
+        leaf_objective, leaf_solution.x, node_ptr->depth, worker->search_strategy);
       search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
       search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
       return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
 
     } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      auto [branch_var, round_dir] =
-        variable_selection(node_ptr, leaf_fractional, leaf_solution.x, thread_type);
+      auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
       assert(branch_var >= 0);
@@ -1029,7 +989,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       // Note that the exploration thread is the only one that can insert new nodes into the heap,
       // and thus, we only need to calculate the objective estimate here (it is used for
       // sorting the nodes for diving).
-      if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+      if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
         logger_t pc_log;
         pc_log.log = false;
         node_ptr->objective_estimate =
@@ -1052,7 +1012,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       return {node_status_t::FATHOMED, rounding_direction_t::NONE};
     }
   } else {
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
       log.printf(
         "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
@@ -1070,136 +1030,19 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                                                       i_t initial_heap_size)
+void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
-  if (solver_status_ != mip_status_t::UNSET) { return; }
-
-  // Note that we do not know which thread will execute the
-  // `exploration_ramp_up` task, so we allow to any thread
-  // to repair the heuristic solution.
-  repair_heuristic_solutions();
-
-  f_t lower_bound = node->lower_bound;
-  f_t upper_bound = upper_bound_;
-  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  f_t abs_gap     = upper_bound - lower_bound;
-
-  if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-    search_tree_.graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
-    search_tree_.update(node, node_status_t::FATHOMED);
-    --exploration_stats_.nodes_unexplored;
-    return;
-  }
-
-  f_t now = toc(exploration_stats_.start_time);
-  f_t time_since_last_log =
-    exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-  if (((exploration_stats_.nodes_since_last_log >= 10 ||
-        abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-       (time_since_last_log >= 1)) ||
-      (time_since_last_log > 30) || now > settings_.time_limit) {
-    bool should_report = should_report_.exchange(false);
-
-    if (should_report) {
-      report(' ', upper_bound, root_objective_, node->depth, node->integer_infeasible);
-      exploration_stats_.nodes_since_last_log = 0;
-      exploration_stats_.last_log             = tic();
-      should_report_                          = true;
-    }
-  }
-
-  if (now > settings_.time_limit) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-  std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-  dual::status_t lp_status         = solve_node_lp(node,
-                                           leaf_problem,
-                                           leaf_solution,
-                                           leaf_edge_norms,
-                                           basis_factors,
-                                           basic_list,
-                                           nonbasic_list,
-                                           node_presolver,
-                                           bnb_worker_type_t::BEST_FIRST,
-                                           true,
-                                           original_lp_.lower,
-                                           original_lp_.upper,
-                                           exploration_stats_,
-                                           settings_.log);
-  if (lp_status == dual::status_t::TIME_LIMIT) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  ++exploration_stats_.nodes_since_last_log;
-  ++exploration_stats_.nodes_explored;
-  --exploration_stats_.nodes_unexplored;
-
-  auto [node_status, round_dir] = update_tree(node,
-                                              search_tree_,
-                                              leaf_problem,
-                                              leaf_solution,
-                                              leaf_edge_norms,
-                                              bnb_worker_type_t::BEST_FIRST,
-                                              lp_status,
-                                              settings_.log);
-
-  if (node_status == node_status_t::HAS_CHILDREN) {
-    exploration_stats_.nodes_unexplored += 2;
-
-    // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
-    if (exploration_stats_.nodes_unexplored < initial_heap_size) {
-#pragma omp task
-      exploration_ramp_up(node->get_down_child(), initial_heap_size);
-
-#pragma omp task
-      exploration_ramp_up(node->get_up_child(), initial_heap_size);
-
-    } else {
-      // We've generated enough nodes, push further nodes onto the heap
-      node_queue_.push(node->get_down_child());
-      node_queue_.push(node->get_up_child());
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
-                                               mip_node_t<i_t, f_t>* start_node,
-                                               lp_problem_t<i_t, f_t>& leaf_problem,
-                                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                               std::vector<i_t>& basic_list,
-                                               std::vector<i_t>& nonbasic_list)
-{
-  bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(start_node);
-
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
-    if (task_id == 0) { repair_heuristic_solutions(); }
+  stack.push_front(worker->start_node);
+  worker->recompute_basis  = true;
+  worker->recompute_bounds = true;
 
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
     f_t lower_bound = node_ptr->lower_bound;
     f_t upper_bound = upper_bound_;
-    f_t abs_gap     = upper_bound - lower_bound;
     f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
     // This is based on three assumptions:
@@ -1208,58 +1051,28 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     // - The current node and its siblings uses the lower bound of the parent before solving the LP
     // relaxation
     // - The lower bound of the parent is lower or equal to its children
-    assert(task_id < local_lower_bounds_.size());
-    local_lower_bounds_[task_id] = lower_bound;
+    worker->lower_bound = lower_bound;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
       search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree_.update(node_ptr, node_status_t::FATHOMED);
-      recompute_bounds_and_basis = true;
+      worker->recompute_basis  = true;
+      worker->recompute_bounds = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
 
-    f_t now = toc(exploration_stats_.start_time);
-
-    if (task_id == 0) {
-      f_t time_since_last_log =
-        exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-      if (((exploration_stats_.nodes_since_last_log >= 1000 ||
-            abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-           time_since_last_log >= 1) ||
-          (time_since_last_log > 30) || now > settings_.time_limit) {
-        report(' ', upper_bound, get_lower_bound(), node_ptr->depth, node_ptr->integer_infeasible);
-        exploration_stats_.last_log             = tic();
-        exploration_stats_.nodes_since_last_log = 0;
-      }
-    }
-
-    if (now > settings_.time_limit) {
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
       solver_status_ = mip_status_t::TIME_LIMIT;
       break;
     }
+
     if (exploration_stats_.nodes_explored >= settings_.node_limit) {
       solver_status_ = mip_status_t::NODE_LIMIT;
       break;
     }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-    dual::status_t lp_status         = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             leaf_edge_norms,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             bnb_worker_type_t::BEST_FIRST,
-                                             recompute_bounds_and_basis,
-                                             original_lp_.lower,
-                                             original_lp_.upper,
-                                             exploration_stats_,
-                                             settings_.log);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker, exploration_stats_, settings_.log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1272,16 +1085,11 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     ++exploration_stats_.nodes_explored;
     --exploration_stats_.nodes_unexplored;
 
-    auto [node_status, round_dir] = update_tree(node_ptr,
-                                                search_tree_,
-                                                leaf_problem,
-                                                leaf_solution,
-                                                leaf_edge_norms,
-                                                bnb_worker_type_t::BEST_FIRST,
-                                                lp_status,
-                                                settings_.log);
+    auto [node_status, round_dir] =
+      update_tree(node_ptr, search_tree_, worker, lp_status, settings_.log);
 
-    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       // The stack should only contain the children of the current parent.
@@ -1297,143 +1105,73 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
       exploration_stats_.nodes_unexplored += 2;
 
       if (round_dir == rounding_direction_t::UP) {
-        stack.push_front(node_ptr->get_down_child());
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_down_child());
+        } else {
+          stack.push_front(node_ptr->get_down_child());
+        }
+
         stack.push_front(node_ptr->get_up_child());
       } else {
-        stack.push_front(node_ptr->get_up_child());
-        stack.push_front(node_ptr->get_down_child());
-      }
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id)
-{
-  f_t lower_bound = -inf;
-  f_t abs_gap     = inf;
-  f_t rel_gap     = inf;
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_up_child());
+        } else {
+          stack.push_front(node_ptr->get_up_child());
+        }
 
-  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
-         rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    // In the current implementation, we are use the active number of subtree to decide
-    // when to stop the execution. We need to increment the counter at the same
-    // time as we pop a node from the queue to avoid some threads exiting
-    // the main loop thinking that the solver has already finished.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    // If there any node left in the heap, we pop the top node and explore it.
-    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
-    if (start_node.has_value()) { active_subtrees_++; };
-    node_queue_.unlock();
-
-    if (start_node.has_value()) {
-      if (upper_bound_ < start_node.value()->lower_bound) {
-        // This node was put on the heap earlier but its lower bound is now greater than the
-        // current upper bound
-        search_tree_.graphviz_node(
-          settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-        search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-        active_subtrees_--;
-        continue;
+        stack.push_front(node_ptr->get_down_child());
       }
-
-      // Best-first search with plunging
-      plunge_from(task_id,
-                  start_node.value(),
-                  leaf_problem,
-                  node_presolver,
-                  basis_factors,
-                  basic_list,
-                  nonbasic_list);
-
-      active_subtrees_--;
     }
-
-    lower_bound = get_lower_bound();
-    abs_gap     = upper_bound_ - lower_bound;
-    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   }
 
-  is_running = false;
-
-  // Check if it is the last thread that exited the loop and no
-  // timeout or numerical error has happen.
-  if (solver_status_ == mip_status_t::UNSET) {
-    if (active_subtrees_ > 0) { local_lower_bounds_[task_id] = inf; }
+  if (settings_.num_threads > 1) {
+    worker_pool_.return_worker_to_pool(worker);
+    active_workers_per_strategy_[BEST_FIRST]--;
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
-                                             const std::vector<f_t>& start_lower,
-                                             const std::vector<f_t>& start_upper,
-                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                             bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                             basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                             std::vector<i_t>& basic_list,
-                                             std::vector<i_t>& nonbasic_list,
-                                             bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>* worker)
 {
   logger_t log;
   log.log = false;
 
-  const i_t diving_node_limit      = settings_.diving_settings.node_limit;
-  const i_t diving_backtrack_limit = settings_.diving_settings.backtrack_limit;
-  bool recompute_bounds_and_basis  = true;
-  search_tree_t<i_t, f_t> dive_tree(std::move(start_node));
+  search_strategy_t search_strategy = worker->search_strategy;
+  const i_t diving_node_limit       = settings_.diving_settings.node_limit;
+  const i_t diving_backtrack_limit  = settings_.diving_settings.backtrack_limit;
+
+  worker->recompute_basis  = true;
+  worker->recompute_bounds = true;
+
+  search_tree_t<i_t, f_t> dive_tree(std::move(*worker->start_node));
   std::deque<mip_node_t<i_t, f_t>*> stack;
   stack.push_front(&dive_tree.root);
 
-  bnb_stats_t<i_t, f_t> dive_stats;
+  branch_and_bound_stats_t<i_t, f_t> dive_stats;
   dive_stats.total_lp_iters      = 0;
   dive_stats.total_lp_solve_time = 0;
   dive_stats.nodes_explored      = 0;
   dive_stats.nodes_unexplored    = 1;
 
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running_) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
-    f_t lower_bound = node_ptr->lower_bound;
-    f_t upper_bound = upper_bound_;
-    f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    f_t lower_bound     = node_ptr->lower_bound;
+    f_t upper_bound     = upper_bound_;
+    f_t rel_gap         = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    worker->lower_bound = lower_bound;
 
     if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-      recompute_bounds_and_basis = true;
+      worker->recompute_basis  = true;
+      worker->recompute_bounds = true;
       continue;
     }
 
     if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
     if (dive_stats.nodes_explored > diving_node_limit) { break; }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-    dual::status_t lp_status         = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             leaf_edge_norms,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             diving_type,
-                                             recompute_bounds_and_basis,
-                                             start_lower,
-                                             start_upper,
-                                             dive_stats,
-                                             log);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker, dive_stats, log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1444,15 +1182,10 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
 
     ++dive_stats.nodes_explored;
 
-    auto [node_status, round_dir] = update_tree(node_ptr,
-                                                dive_tree,
-                                                leaf_problem,
-                                                leaf_solution,
-                                                leaf_edge_norms,
-                                                diving_type,
-                                                lp_status,
-                                                log);
-    recompute_bounds_and_basis    = node_status != node_status_t::HAS_CHILDREN;
+    auto [node_status, round_dir] = update_tree(node_ptr, dive_tree, worker, lp_status, log);
+
+    worker->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       if (round_dir == rounding_direction_t::UP) {
@@ -1470,68 +1203,207 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
       stack.pop_back();
     }
   }
+
+  worker_pool_.return_worker_to_pool(worker);
+  active_workers_per_strategy_[search_strategy]--;
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::diving_thread(bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::run_scheduler()
 {
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-  std::vector<bool> bounds_changed(original_lp_.num_cols, false);
+  diving_heuristics_settings_t<i_t, f_t> diving_settings = settings_.diving_settings;
+  const i_t num_workers                                  = 2 * settings_.num_threads;
 
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
+  if (!std::isfinite(upper_bound_)) { diving_settings.guided_diving = false; }
+  std::vector<search_strategy_t> strategies = get_search_strategies(diving_settings);
+  std::array<i_t, num_search_strategies> max_num_workers_per_type =
+    get_max_workers(num_workers, strategies);
+
+  worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_);
+  active_workers_per_strategy_.fill(0);
+
+#ifdef CUOPT_LOG_DEBUG
+  for (auto strategy : strategies) {
+    settings_.log.debug("%c%d: max num of workers = %d",
+                        feasible_solution_symbol(strategy),
+                        strategy,
+                        max_num_workers_per_type[strategy]);
+  }
+#endif
+
+  f_t lower_bound     = get_lower_bound();
+  f_t abs_gap         = upper_bound_ - lower_bound;
+  f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+  i_t last_node_depth = 0;
+  i_t last_int_infeas = 0;
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol &&
+         (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    // If the guided diving was disabled previously due to the lack of an incumbent solution,
+    // re-enable as soon as a new incumbent is found.
+    if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) {
+      if (std::isfinite(upper_bound_)) {
+        diving_settings.guided_diving = settings_.diving_settings.guided_diving;
+        strategies                    = get_search_strategies(diving_settings);
+        max_num_workers_per_type      = get_max_workers(num_workers, strategies);
+
+#ifdef CUOPT_LOG_DEBUG
+        for (auto type : strategies) {
+          settings_.log.debug("%c%d: max num of workers = %d",
+                              feasible_solution_symbol(type),
+                              type,
+                              max_num_workers_per_type[type]);
+        }
+#endif
+      }
+    }
 
-  std::vector<f_t> start_lower;
-  std::vector<f_t> start_upper;
-  bool reset_starting_bounds = true;
-
-  while (solver_status_ == mip_status_t::UNSET && is_running &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    if (reset_starting_bounds) {
-      start_lower = original_lp_.lower;
-      start_upper = original_lp_.upper;
-      std::fill(bounds_changed.begin(), bounds_changed.end(), false);
-      reset_starting_bounds = false;
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t queue_size = node_queue_.best_first_queue_size();
+      i_t depth      = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth;
+      i_t int_infeas = queue_size > 0 ? node_queue_.bfs_top()->integer_infeasible : last_int_infeas;
+      report(' ', upper_bound_, lower_bound, depth, int_infeas);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
     }
 
-    // In the current implementation, multiple threads can pop the nodes
-    // from the queue, so we need to initialize the lower and upper bound here
-    // to avoid other thread fathoming the node (i.e., deleting) before we can read
-    // the variable bounds from the tree.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    std::optional<mip_node_t<i_t, f_t>*> node_ptr  = node_queue_.pop_diving();
-    std::optional<mip_node_t<i_t, f_t>> start_node = std::nullopt;
-
-    if (node_ptr.has_value()) {
-      node_ptr.value()->get_variable_bounds(start_lower, start_upper, bounds_changed);
-      start_node = node_ptr.value()->detach_copy();
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
     }
-    node_queue_.unlock();
 
-    if (start_node.has_value()) {
-      reset_starting_bounds = true;
+    for (auto strategy : strategies) {
+      if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) {
+        continue;
+      }
+
+      // Get an idle worker.
+      branch_and_bound_worker_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
+      if (worker == nullptr) { break; }
+
+      if (strategy == BEST_FIRST) {
+        // If there any node left in the heap, we pop the top node and explore it.
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound) {
+          // This node was put on the heap earlier but its lower bound is now greater than the
+          // current upper bound
+          search_tree_.graphviz_node(
+            settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+          search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+          continue;
+        }
 
-      if (upper_bound_ < start_node->lower_bound) { continue; }
-      bool is_feasible =
-        node_presolver.bounds_strengthening(settings_, bounds_changed, start_lower, start_upper);
-      if (!is_feasible) { continue; }
-
-      dive_from(start_node.value(),
-                start_lower,
-                start_upper,
-                leaf_problem,
-                node_presolver,
-                basis_factors,
-                basic_list,
-                nonbasic_list,
-                diving_type);
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        worker->init_best_first(start_node.value(), original_lp_);
+        last_node_depth = start_node.value()->depth;
+        last_int_infeas = start_node.value()->integer_infeasible;
+        active_workers_per_strategy_[strategy]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        plunge_with(worker);
+
+      } else {
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound ||
+            start_node.value()->depth < diving_settings.min_node_depth) {
+          continue;
+        }
+
+        bool is_feasible =
+          worker->init_diving(start_node.value(), strategy, original_lp_, settings_);
+        if (!is_feasible) { continue; }
+
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        active_workers_per_strategy_[strategy]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        dive_with(worker);
+      }
     }
+
+    // If no new task was launched in this iteration, suspend temporarily the
+    // execution of the scheduler. As of 8/Jan/2026, GCC does not
+    // implement taskyield, but LLVM does.
+    if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
+{
+  branch_and_bound_worker_t<i_t, f_t> worker(0, original_lp_, Arow_, var_types_, settings_);
+
+  f_t lower_bound = get_lower_bound();
+  f_t abs_gap     = upper_bound_ - lower_bound;
+  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t depth      = node_queue_.bfs_top()->depth;
+      i_t int_infeas = node_queue_.bfs_top()->integer_infeasible;
+      report(' ', upper_bound_, lower_bound, depth, int_infeas);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
+    }
+
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    }
+
+    // If there any node left in the heap, we pop the top node and explore it.
+    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+    if (!start_node.has_value()) { continue; }
+    if (upper_bound_ < start_node.value()->lower_bound) {
+      // This node was put on the heap earlier but its lower bound is now greater than the
+      // current upper bound
+      search_tree_.graphviz_node(
+        settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+      search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+      continue;
+    }
+
+    worker.init_best_first(start_node.value(), original_lp_);
+    plunge_with(&worker);
   }
 }
 
@@ -1675,7 +1547,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                          solver_name.c_str());
     settings_.log.printf("Root relaxation objective %+.8e\n", user_objective);
   } else {
-    settings_.log.printf("Root relaxation returned status: %s\n",
+    settings_.log.printf("Root relaxation returned: %s\n",
                          lp_status_to_string(root_status).c_str());
   }
 
@@ -1692,14 +1564,11 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   log.log                             = false;
   log.log_prefix                      = settings_.log.log_prefix;
   solver_status_                      = mip_status_t::UNSET;
-  is_running                          = false;
+  is_running_                         = false;
   exploration_stats_.nodes_unexplored = 0;
   exploration_stats_.nodes_explored   = 0;
   original_lp_.A.to_compressed_row(Arow_);
 
-  std::vector<bnb_worker_type_t> diving_strategies;
-  initialize_diving_heuristics_settings(diving_strategies);
-
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
     crush_primal_solution(original_problem_, original_lp_, guess_, new_slacks_, crushed_guess);
@@ -1728,6 +1597,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   std::vector<i_t> nonbasic_list;
   basis_update_mpf_t<i_t, f_t> basis_update(original_lp_.num_rows, settings_.refactor_frequency);
   lp_status_t root_status;
+
   if (!enable_concurrent_lp_root_solve()) {
     // RINS/SUBMIP path
     settings_.log.printf("\nSolving LP root relaxation with dual simplex\n");
@@ -1775,6 +1645,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     set_final_solution(solution, -inf);
     return solver_status_;
   }
+
   if (root_status == lp_status_t::NUMERICAL_ISSUES) {
     solver_status_ = mip_status_t::NUMERICAL;
     set_final_solution(solution, -inf);
@@ -1785,7 +1656,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   set_uninitialized_steepest_edge_norms<i_t, f_t>(original_lp_, basic_list, edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
-  local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -1806,12 +1676,13 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   i_t num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional);
 
   cut_info_t<i_t, f_t> cut_info;
+
   if (num_fractional == 0) {
     set_solution_at_root(solution, cut_info);
     return mip_status_t::OPTIMAL;
   }
 
-  is_running           = true;
+  is_running_          = true;
   lower_bound_ceiling_ = inf;
 
   if (num_fractional != 0 && settings_.max_cut_passes > 0) {
@@ -2026,8 +1897,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         }
       }
 
-      local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
-
       f_t remove_cuts_start_time = tic();
       mutex_original_lp_.lock();
       remove_cuts(original_lp_,
@@ -2171,58 +2040,39 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       root_vstatus_,
                       original_lp_,
                       log);
+  node_queue_.push(search_tree_.root.get_down_child());
+  node_queue_.push(search_tree_.root.get_up_child());
 
-  settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n\n",
-                       settings_.num_threads,
-                       settings_.num_bfs_workers,
-                       settings_.num_threads - settings_.num_bfs_workers);
+  settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads);
 
   exploration_stats_.nodes_explored       = 0;
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
-  active_subtrees_                        = 0;
-  lower_bound_ceiling_                    = inf;
-  should_report_                          = true;
+  min_node_queue_size_                    = 2 * settings_.num_threads;
+
+  if (settings_.diving_settings.coefficient_diving != 0) {
+    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
+  }
 
   settings_.log.printf(
     " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node |   "
     "Gap    "
     "|  Time  |\n");
+
+  if (settings_.num_threads > 1) {
 #pragma omp parallel num_threads(settings_.num_threads)
-  {
-#pragma omp master
     {
-      auto down_child          = search_tree_.root.get_down_child();
-      auto up_child            = search_tree_.root.get_up_child();
-      i_t initial_size         = 2 * settings_.num_threads;
-      const i_t num_strategies = diving_strategies.size();
-
-#pragma omp taskgroup
-      {
-#pragma omp task
-        exploration_ramp_up(down_child, initial_size);
-
-#pragma omp task
-        exploration_ramp_up(up_child, initial_size);
-      }
-
-      for (i_t i = 0; i < settings_.num_bfs_workers; i++) {
-#pragma omp task
-        best_first_thread(i);
-      }
-
-      if (!diving_strategies.empty()) {
-        for (i_t k = 0; k < settings_.diving_settings.num_diving_workers; k++) {
-          const bnb_worker_type_t diving_type = diving_strategies[k % num_strategies];
-#pragma omp task
-          diving_thread(diving_type);
-        }
-      }
+#pragma omp master
+      run_scheduler();
     }
+
+  } else {
+    single_threaded_solve();
   }
 
-  is_running      = false;
+  is_running_ = false;
+
   f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
                                                             : search_tree_.root.lower_bound;
   set_final_solution(solution, lower_bound);
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 6db45e153..1d6947681 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <dual_simplex/branch_and_bound_worker.hpp>
 #include <dual_simplex/cuts.hpp>
 #include <dual_simplex/diving_heuristics.hpp>
 #include <dual_simplex/initial_basis.hpp>
@@ -37,38 +38,9 @@ enum class mip_status_t {
   UNSET      = 6,  // The status is not set
 };
 
-// Indicate the search and variable selection algorithms used by each thread
-// in B&B (See [1]).
-//
-// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
-// Berlin, 2007. doi: 10.14279/depositonce-1634.
-enum class bnb_worker_type_t {
-  BEST_FIRST         = 0,  // Best-First + Plunging.
-  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
-  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
-  GUIDED_DIVING = 3,  // Guided diving (9.2.3). If no incumbent is found yet, use pseudocost diving.
-  COEFFICIENT_DIVING = 4  // Coefficient diving (9.2.1)
-};
-
-template <typename i_t, typename f_t>
-class bounds_strengthening_t;
-
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
-template <typename i_t, typename f_t>
-struct bnb_stats_t {
-  f_t start_time                        = 0.0;
-  omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-  omp_atomic_t<i_t> nodes_explored      = 0;
-  omp_atomic_t<i_t> nodes_unexplored    = 0;
-  omp_atomic_t<f_t> total_lp_iters      = 0;
-
-  // This should only be used by the main thread
-  omp_atomic_t<f_t> last_log             = 0.0;
-  omp_atomic_t<i_t> nodes_since_last_log = 0;
-};
-
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
@@ -153,9 +125,6 @@ class branch_and_bound_t {
   std::vector<i_t> var_up_locks_;
   std::vector<i_t> var_down_locks_;
 
-  // Local lower bounds for each thread
-  std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
-
   // Mutex for the original LP
   // The heuristics threads look at the original LP. But the main thread modifies the
   // size of the original LP by adding slacks for cuts. Heuristic threads should lock
@@ -173,7 +142,7 @@ class branch_and_bound_t {
   mip_solution_t<i_t, f_t> incumbent_;
 
   // Structure with the general info of the solver.
-  bnb_stats_t<i_t, f_t> exploration_stats_;
+  branch_and_bound_stats_t<i_t, f_t> exploration_stats_;
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
@@ -200,14 +169,21 @@ class branch_and_bound_t {
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
 
-  // Count the number of subtrees that are currently being explored.
-  omp_atomic_t<i_t> active_subtrees_;
+  // Count the number of workers per type that either are being executed or
+  // are waiting to be executed.
+  std::array<omp_atomic_t<i_t>, num_search_strategies> active_workers_per_strategy_;
+
+  // Worker pool
+  branch_and_bound_worker_pool_t<i_t, f_t> worker_pool_;
 
   // Global status of the solver.
   omp_atomic_t<mip_status_t> solver_status_;
-  omp_atomic_t<bool> is_running{false};
+  omp_atomic_t<bool> is_running_{false};
 
-  omp_atomic_t<bool> should_report_;
+  // Minimum number of node in the queue. When the queue size is less than
+  // this variable, the nodes are added directly to the queue instead of
+  // the local stack. This also determines the end of the ramp-up phase.
+  i_t min_node_queue_size_;
 
   // In case, a best-first thread encounters a numerical issue when solving a node,
   // its blocks the progression of the lower bound.
@@ -230,82 +206,48 @@ class branch_and_bound_t {
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
                              i_t leaf_depth,
-                             bnb_worker_type_t thread_type);
+                             search_strategy_t thread_type);
 
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
-  // Initialize diving heuristics settings
-  void initialize_diving_heuristics_settings(std::vector<bnb_worker_type_t>& diving_strategies);
-
-  // Ramp-up phase of the solver, where we greedily expand the tree until
-  // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
-  void exploration_ramp_up(mip_node_t<i_t, f_t>* node, i_t initial_heap_size);
-
   // We use best-first to pick the `start_node` and then perform a depth-first search
   // from this node (i.e., a plunge). It can only backtrack to a sibling node.
   // Unexplored nodes in the subtree are inserted back into the global heap.
-  void plunge_from(i_t task_id,
-                   mip_node_t<i_t, f_t>* start_node,
-                   lp_problem_t<i_t, f_t>& leaf_problem,
-                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                   basis_update_mpf_t<i_t, f_t>& basis_update,
-                   std::vector<i_t>& basic_list,
-                   std::vector<i_t>& nonbasic_list);
-
-  // Each "main" thread pops a node from the global heap and then performs a plunge
-  // (i.e., a shallow dive) into the subtree determined by the node.
-  void best_first_thread(i_t task_id);
+  void plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker);
 
   // Perform a deep dive in the subtree determined by the `start_node` in order
   // to find integer feasible solutions.
-  void dive_from(mip_node_t<i_t, f_t>& start_node,
-                 const std::vector<f_t>& start_lower,
-                 const std::vector<f_t>& start_upper,
-                 lp_problem_t<i_t, f_t>& leaf_problem,
-                 bounds_strengthening_t<i_t, f_t>& node_presolver,
-                 basis_update_mpf_t<i_t, f_t>& basis_update,
-                 std::vector<i_t>& basic_list,
-                 std::vector<i_t>& nonbasic_list,
-                 bnb_worker_type_t diving_type);
-
-  // Each diving thread pops the first node from the dive queue and then performs
-  // a deep dive into the subtree determined by the node.
-  void diving_thread(bnb_worker_type_t diving_type);
+  void dive_with(branch_and_bound_worker_t<i_t, f_t>* worker);
+
+  // Run the scheduler whose will schedule and manage
+  // all the other workers.
+  void run_scheduler();
+
+  // Run the branch-and-bound algorithm in single threaded mode.
+  // This disable all diving heuristics.
+  void single_threaded_solve();
 
   // Solve the LP relaxation of a leaf node
   dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
-                               lp_problem_t<i_t, f_t>& leaf_problem,
-                               lp_solution_t<i_t, f_t>& leaf_solution,
-                               std::vector<f_t>& leaf_edge_norms,
-                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                               std::vector<i_t>& basic_list,
-                               std::vector<i_t>& nonbasic_list,
-                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                               bnb_worker_type_t thread_type,
-                               bool recompute_bounds_and_basis,
-                               const std::vector<f_t>& root_lower,
-                               const std::vector<f_t>& root_upper,
-                               bnb_stats_t<i_t, f_t>& stats,
+                               branch_and_bound_worker_t<i_t, f_t>* worker,
+                               branch_and_bound_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
   // Update the tree based on the LP relaxation. Returns the status
   // of the node and, if appropriated, the preferred rounding direction
   // when visiting the children.
-  std::pair<node_status_t, rounding_direction_t> update_tree(mip_node_t<i_t, f_t>* node_ptr,
-                                                             search_tree_t<i_t, f_t>& search_tree,
-                                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                                             lp_solution_t<i_t, f_t>& leaf_solution,
-                                                             std::vector<f_t>& leaf_edge_norms,
-                                                             bnb_worker_type_t thread_type,
-                                                             dual::status_t lp_status,
-                                                             logger_t& log);
+  std::pair<node_status_t, rounding_direction_t> update_tree(
+    mip_node_t<i_t, f_t>* node_ptr,
+    search_tree_t<i_t, f_t>& search_tree,
+    branch_and_bound_worker_t<i_t, f_t>* worker,
+    dual::status_t lp_status,
+    logger_t& log);
 
   // Selects the variable to branch on.
   branch_variable_t<i_t> variable_selection(mip_node_t<i_t, f_t>* node_ptr,
                                             const std::vector<i_t>& fractional,
-                                            const std::vector<f_t>& solution,
-                                            bnb_worker_type_t type);
+                                            branch_and_bound_worker_t<i_t, f_t>* worker);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/branch_and_bound_worker.hpp b/cpp/src/dual_simplex/branch_and_bound_worker.hpp
new file mode 100644
index 000000000..99a59a871
--- /dev/null
+++ b/cpp/src/dual_simplex/branch_and_bound_worker.hpp
@@ -0,0 +1,280 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/phase2.hpp>
+
+#include <utilities/pcgenerator.hpp>
+
+#include <array>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+constexpr int num_search_strategies = 5;
+
+// Indicate the search and variable selection algorithms used by each thread
+// in B&B (See [1]).
+//
+// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
+// Berlin, 2007. doi: 10.14279/depositonce-1634.
+enum search_strategy_t : int {
+  BEST_FIRST         = 0,  // Best-First + Plunging.
+  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
+  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
+  GUIDED_DIVING      = 3,  // Guided diving (9.2.3).
+  COEFFICIENT_DIVING = 4   // Coefficient diving (9.2.1)
+};
+
+template <typename i_t, typename f_t>
+struct branch_and_bound_stats_t {
+  f_t start_time                         = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time  = 0.0;
+  omp_atomic_t<int64_t> nodes_explored   = 0;
+  omp_atomic_t<int64_t> nodes_unexplored = 0;
+  omp_atomic_t<int64_t> total_lp_iters   = 0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+  omp_atomic_t<f_t> last_log             = 0.0;
+};
+
+template <typename i_t, typename f_t>
+class branch_and_bound_worker_t {
+ public:
+  const i_t worker_id;
+  omp_atomic_t<search_strategy_t> search_strategy;
+  omp_atomic_t<bool> is_active;
+  omp_atomic_t<f_t> lower_bound;
+
+  lp_problem_t<i_t, f_t> leaf_problem;
+  lp_solution_t<i_t, f_t> leaf_solution;
+  std::vector<f_t> leaf_edge_norms;
+
+  basis_update_mpf_t<i_t, f_t> basis_factors;
+  std::vector<i_t> basic_list;
+  std::vector<i_t> nonbasic_list;
+
+  bounds_strengthening_t<i_t, f_t> node_presolver;
+  std::vector<bool> bounds_changed;
+
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  mip_node_t<i_t, f_t>* start_node;
+
+  pcgenerator_t rng;
+
+  bool recompute_basis  = true;
+  bool recompute_bounds = true;
+
+  branch_and_bound_worker_t(i_t worker_id,
+                            const lp_problem_t<i_t, f_t>& original_lp,
+                            const csr_matrix_t<i_t, f_t>& Arow,
+                            const std::vector<variable_type_t>& var_type,
+                            const simplex_solver_settings_t<i_t, f_t>& settings)
+    : worker_id(worker_id),
+      search_strategy(BEST_FIRST),
+      is_active(false),
+      lower_bound(-std::numeric_limits<f_t>::infinity()),
+      leaf_problem(original_lp),
+      leaf_solution(original_lp.num_rows, original_lp.num_cols),
+      basis_factors(original_lp.num_rows, settings.refactor_frequency),
+      basic_list(original_lp.num_rows),
+      nonbasic_list(),
+      node_presolver(leaf_problem, Arow, {}, var_type),
+      bounds_changed(original_lp.num_cols, false),
+      rng(settings.random_seed + pcgenerator_t::default_seed + worker_id,
+          pcgenerator_t::default_stream ^ worker_id)
+  {
+  }
+
+  // Set the `start_node` for best-first search.
+  void init_best_first(mip_node_t<i_t, f_t>* node, const lp_problem_t<i_t, f_t>& original_lp)
+  {
+    start_node      = node;
+    start_lower     = original_lp.lower;
+    start_upper     = original_lp.upper;
+    search_strategy = BEST_FIRST;
+    lower_bound     = node->lower_bound;
+    is_active       = true;
+  }
+
+  // Initialize the worker for diving, setting the `start_node`, `start_lower` and
+  // `start_upper`. Returns `true` if the starting node is feasible via
+  // bounds propagation.
+  bool init_diving(mip_node_t<i_t, f_t>* node,
+                   search_strategy_t type,
+                   const lp_problem_t<i_t, f_t>& original_lp,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    internal_node = node->detach_copy();
+    start_node    = &internal_node;
+
+    start_lower     = original_lp.lower;
+    start_upper     = original_lp.upper;
+    search_strategy = type;
+    lower_bound     = node->lower_bound;
+    is_active       = true;
+
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
+    return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper);
+  }
+
+  // Set the variables bounds for the LP relaxation of the current node.
+  bool set_lp_variable_bounds(mip_node_t<i_t, f_t>* node_ptr,
+                              const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    // Reset the bound_changed markers
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+
+    // Set the correct bounds for the leaf problem
+    if (recompute_bounds) {
+      leaf_problem.lower = start_lower;
+      leaf_problem.upper = start_upper;
+      node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
+
+    } else {
+      node_ptr->update_branched_variable_bounds(
+        leaf_problem.lower, leaf_problem.upper, bounds_changed);
+    }
+
+    return node_presolver.bounds_strengthening(
+      settings, bounds_changed, leaf_problem.lower, leaf_problem.upper);
+  }
+
+ private:
+  // For diving, we need to store the full node instead of
+  // of just a pointer, since it is not stored in the tree anymore.
+  // To keep the same interface across all worker types,
+  // this will be used as a temporary storage and
+  // will be pointed by `start_node`.
+  // For exploration, this will not be used.
+  mip_node_t<i_t, f_t> internal_node;
+};
+
+template <typename i_t, typename f_t>
+class branch_and_bound_worker_pool_t {
+ public:
+  void init(i_t num_workers,
+            const lp_problem_t<i_t, f_t>& original_lp,
+            const csr_matrix_t<i_t, f_t>& Arow,
+            const std::vector<variable_type_t>& var_type,
+            const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    workers_.resize(num_workers);
+    num_idle_workers_ = num_workers;
+    for (i_t i = 0; i < num_workers; ++i) {
+      workers_[i] = std::make_unique<branch_and_bound_worker_t<i_t, f_t>>(
+        i, original_lp, Arow, var_type, settings);
+      idle_workers_.push_front(i);
+    }
+
+    is_initialized = true;
+  }
+
+  // Here, we are assuming that the scheduler is the only
+  // thread that can retrieve/pop an idle worker.
+  branch_and_bound_worker_t<i_t, f_t>* get_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (idle_workers_.empty()) {
+      return nullptr;
+    } else {
+      i_t idx = idle_workers_.front();
+      return workers_[idx].get();
+    }
+  }
+
+  // Here, we are assuming that the scheduler is the only
+  // thread that can retrieve/pop an idle worker.
+  void pop_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (!idle_workers_.empty()) {
+      idle_workers_.pop_front();
+      num_idle_workers_--;
+    }
+  }
+
+  void return_worker_to_pool(branch_and_bound_worker_t<i_t, f_t>* worker)
+  {
+    worker->is_active = false;
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    idle_workers_.push_back(worker->worker_id);
+    num_idle_workers_++;
+  }
+
+  f_t get_lower_bound()
+  {
+    f_t lower_bound = std::numeric_limits<f_t>::infinity();
+
+    if (is_initialized) {
+      for (i_t i = 0; i < workers_.size(); ++i) {
+        if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) {
+          lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound);
+        }
+      }
+    }
+
+    return lower_bound;
+  }
+
+  i_t num_idle_workers() { return num_idle_workers_; }
+
+ private:
+  // Worker pool
+  std::vector<std::unique_ptr<branch_and_bound_worker_t<i_t, f_t>>> workers_;
+  bool is_initialized = false;
+
+  omp_mutex_t mutex_;
+  std::deque<i_t> idle_workers_;
+  omp_atomic_t<i_t> num_idle_workers_;
+};
+
+template <typename f_t, typename i_t>
+std::vector<search_strategy_t> get_search_strategies(
+  diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  std::vector<search_strategy_t> types;
+  types.reserve(num_search_strategies);
+  types.push_back(BEST_FIRST);
+  if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); }
+  if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); }
+  if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); }
+  if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); }
+  return types;
+}
+
+template <typename i_t>
+std::array<i_t, num_search_strategies> get_max_workers(
+  i_t num_workers, const std::vector<search_strategy_t>& strategies)
+{
+  std::array<i_t, num_search_strategies> max_num_workers;
+  max_num_workers.fill(0);
+
+  i_t bfs_workers             = std::max(strategies.size() == 1 ? num_workers : num_workers / 4, 1);
+  max_num_workers[BEST_FIRST] = bfs_workers;
+
+  i_t diving_workers = (num_workers - bfs_workers);
+  i_t m              = strategies.size() - 1;
+
+  for (size_t i = 1, k = 0; i < strategies.size(); ++i) {
+    i_t start                      = (double)k * diving_workers / m;
+    i_t end                        = (double)(k + 1) * diving_workers / m;
+    max_num_workers[strategies[i]] = end - start;
+    ++k;
+  }
+
+  return max_num_workers;
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp
index a56b4cce3..b3e91bc09 100644
--- a/cpp/src/dual_simplex/diving_heuristics.cpp
+++ b/cpp/src/dual_simplex/diving_heuristics.cpp
@@ -71,9 +71,8 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
                                          const std::vector<f_t>& root_solution,
                                          logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
-  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  f_t max_score                  = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   constexpr f_t eps              = 1e-6;
 
@@ -127,6 +126,15 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
     }
   }
 
+  // If we cannot choose the variable, then arbitrarily pick the first
+  // fractional variable and round it down. This only happens when
+  // there is only one fractional variable and its the pseudocost is
+  // infinite for both direction.
+  if (round_dir == rounding_direction_t::NONE) {
+    branch_var = fractional[0];
+    round_dir  = rounding_direction_t::DOWN;
+  }
+
   assert(round_dir != rounding_direction_t::NONE);
   assert(branch_var >= 0);
 
@@ -146,9 +154,8 @@ branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
                                      const std::vector<f_t>& incumbent,
                                      logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
-  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  f_t max_score                  = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   constexpr f_t eps              = 1e-6;
 
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 5ee4f49d1..8b2211c1d 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -227,18 +227,26 @@ class mip_node_t {
   }
 
   // This method creates a copy of the current node
-  // with its parent set to `nullptr` and `depth = 0`.
+  // with its parent set to `nullptr`
   // This detaches the node from the tree.
   mip_node_t<i_t, f_t> detach_copy() const
   {
-    mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
+    mip_node_t<i_t, f_t> copy;
+    copy.lower_bound        = lower_bound;
+    copy.objective_estimate = objective_estimate;
+    copy.depth              = depth;
+    copy.node_id            = node_id;
+    copy.integer_infeasible = integer_infeasible;
+    copy.vstatus            = vstatus;
     copy.branch_var         = branch_var;
     copy.branch_dir         = branch_dir;
     copy.branch_var_lower   = branch_var_lower;
     copy.branch_var_upper   = branch_var_upper;
     copy.fractional_val     = fractional_val;
-    copy.objective_estimate = objective_estimate;
-    copy.node_id            = node_id;
+    copy.parent             = nullptr;
+    copy.children[0]        = nullptr;
+    copy.children[1]        = nullptr;
+    copy.status             = node_status_t::PENDING;
     return copy;
   }
 
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
index 28072795a..b5c9a9ea7 100644
--- a/cpp/src/dual_simplex/node_queue.hpp
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -114,26 +114,29 @@ class node_queue_t {
 
   std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
     auto entry = best_first_heap.pop();
+
     if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
+
     return std::nullopt;
   }
 
   std::optional<mip_node_t<i_t, f_t>*> pop_diving()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+
     while (!diving_heap.empty()) {
       auto entry = diving_heap.pop();
+
       if (entry.has_value()) {
         if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
       }
     }
+
     return std::nullopt;
   }
 
-  void lock() { mutex.lock(); }
-
-  void unlock() { mutex.unlock(); }
-
   i_t diving_queue_size()
   {
     std::lock_guard<omp_mutex_t> lock(mutex);
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index 978896887..d104036ea 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -619,7 +619,7 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
   }
 
   constexpr bool run_bounds_strengthening = false;
-  if (run_bounds_strengthening) {
+  if constexpr (run_bounds_strengthening) {
     csr_matrix_t<i_t, f_t> Arow(1, 1, 1);
     problem.A.to_compressed_row(Arow);
 
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 015034b77..682bdaa6f 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -88,7 +88,7 @@ void strong_branch_helper(i_t start,
       }
 
       if (branch == 0) {
-        pc.strong_branch_down[k] = obj - root_obj;
+        pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0);
         if (verbose) {
           settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n",
                               thread_id,
@@ -99,7 +99,7 @@ void strong_branch_helper(i_t start,
                               toc(start_time));
         }
       } else {
-        pc.strong_branch_up[k] = obj - root_obj;
+        pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0);
         if (verbose) {
           settings.log.printf(
             "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e "
@@ -135,6 +135,83 @@ void strong_branch_helper(i_t start,
   }
 }
 
+template <typename i_t, typename f_t>
+f_t trial_branching(const lp_problem_t<i_t, f_t>& original_lp,
+                    const simplex_solver_settings_t<i_t, f_t>& settings,
+                    const std::vector<variable_type_t>& var_types,
+                    const std::vector<variable_status_t>& vstatus,
+                    const std::vector<f_t>& edge_norms,
+                    const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                    const std::vector<i_t>& basic_list,
+                    const std::vector<i_t>& nonbasic_list,
+                    i_t branch_var,
+                    f_t branch_var_lower,
+                    f_t branch_var_upper,
+                    f_t upper_bound,
+                    i_t bnb_lp_iter_per_node,
+                    f_t start_time,
+                    i_t upper_max_lp_iter,
+                    i_t lower_max_lp_iter,
+                    omp_atomic_t<int64_t>& total_lp_iter)
+{
+  lp_problem_t child_problem      = original_lp;
+  child_problem.lower[branch_var] = branch_var_lower;
+  child_problem.upper[branch_var] = branch_var_upper;
+
+  const bool initialize_basis                        = false;
+  simplex_solver_settings_t<i_t, f_t> child_settings = settings;
+  child_settings.set_log(false);
+  i_t lp_iter_upper              = upper_max_lp_iter;
+  i_t lp_iter_lower              = lower_max_lp_iter;
+  child_settings.iteration_limit = std::clamp(bnb_lp_iter_per_node, lp_iter_lower, lp_iter_upper);
+  child_settings.cut_off         = upper_bound + settings.dual_tol;
+  child_settings.inside_mip      = 2;
+  child_settings.scale_columns   = false;
+
+  lp_solution_t<i_t, f_t> solution(original_lp.num_rows, original_lp.num_cols);
+  i_t iter                                         = 0;
+  std::vector<variable_status_t> child_vstatus     = vstatus;
+  std::vector<f_t> child_edge_norms                = edge_norms;
+  std::vector<i_t> child_basic_list                = basic_list;
+  std::vector<i_t> child_nonbasic_list             = nonbasic_list;
+  basis_update_mpf_t<i_t, f_t> child_basis_factors = basis_factors;
+
+  // Only refactor the basis if we encounter numerical issues.
+  child_basis_factors.set_refactor_frequency(upper_max_lp_iter);
+
+  dual::status_t status = dual_phase2_with_advanced_basis(2,
+                                                          0,
+                                                          initialize_basis,
+                                                          start_time,
+                                                          child_problem,
+                                                          child_settings,
+                                                          child_vstatus,
+                                                          child_basis_factors,
+                                                          child_basic_list,
+                                                          child_nonbasic_list,
+                                                          solution,
+                                                          iter,
+                                                          child_edge_norms);
+  total_lp_iter += iter;
+  settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n",
+                     branch_var,
+                     child_problem.lower[branch_var],
+                     child_problem.upper[branch_var],
+                     iter,
+                     dual::status_to_string(status).c_str(),
+                     compute_objective(child_problem, solution.x));
+
+  if (status == dual::status_t::DUAL_UNBOUNDED) {
+    // LP was infeasible
+    return std::numeric_limits<f_t>::infinity();
+  } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
+             status == dual::status_t::CUTOFF) {
+    return compute_objective(child_problem, solution.x);
+  } else {
+    return std::numeric_limits<f_t>::quiet_NaN();
+  }
+}
+
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -350,15 +427,31 @@ void strong_branching(const user_problem_t<i_t, f_t>& original_problem,
   pc.update_pseudo_costs_from_strong_branching(fractional, root_soln);
 }
 
+template <typename i_t, typename f_t>
+f_t pseudo_costs_t<i_t, f_t>::calculate_pseudocost_score(i_t j,
+                                                         const std::vector<f_t>& solution,
+                                                         f_t pseudo_cost_up_avg,
+                                                         f_t pseudo_cost_down_avg) const
+{
+  constexpr f_t eps = 1e-6;
+  i_t num_up        = pseudo_cost_num_up[j];
+  i_t num_down      = pseudo_cost_num_down[j];
+  f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+  f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+  f_t f_down        = solution[j] - std::floor(solution[j]);
+  f_t f_up          = std::ceil(solution[j]) - solution[j];
+  return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+}
+
 template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-  const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
+  const f_t change_in_obj = std::max(leaf_objective - node_ptr->lower_bound, 0.0);
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
                               : std::ceil(node_ptr->fractional_val) - node_ptr->fractional_val;
+
   if (node_ptr->branch_dir == rounding_direction_t::DOWN) {
     pseudo_cost_sum_down[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_down[node_ptr->branch_var]++;
@@ -412,13 +505,55 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
+  i_t branch_var = fractional[0];
+  f_t max_score  = -1;
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
 
-  const i_t num_fractional = fractional.size();
-  std::vector<f_t> pseudo_cost_up(num_fractional);
-  std::vector<f_t> pseudo_cost_down(num_fractional);
-  std::vector<f_t> score(num_fractional);
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
+  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
+             num_initialized_down,
+             num_initialized_up,
+             pseudo_cost_down_avg,
+             pseudo_cost_up_avg);
+
+  for (i_t j : fractional) {
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+  }
+
+  log.debug("Pseudocost branching on %d. Value %e. Score %e.\n",
+            branch_var,
+            solution[branch_var],
+            max_score);
+
+  return branch_var;
+}
+
+template <typename i_t, typename f_t>
+i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
+  mip_node_t<i_t, f_t>* node_ptr,
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<variable_type_t>& var_types,
+  branch_and_bound_worker_t<i_t, f_t>* worker,
+  const branch_and_bound_stats_t<i_t, f_t>& bnb_stats,
+  f_t upper_bound,
+  int max_num_tasks,
+  logger_t& log)
+{
+  constexpr f_t eps = 1e-6;
+  f_t start_time    = bnb_stats.start_time;
+  i_t branch_var    = fractional[0];
+  f_t max_score     = -1;
   i_t num_initialized_down;
   i_t num_initialized_up;
   f_t pseudo_cost_down_avg;
@@ -432,39 +567,162 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
              pseudo_cost_down_avg,
              pseudo_cost_up_avg);
 
-  for (i_t k = 0; k < num_fractional; k++) {
-    const i_t j = fractional[k];
-    if (pseudo_cost_num_down[j] != 0) {
-      pseudo_cost_down[k] = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
-    } else {
-      pseudo_cost_down[k] = pseudo_cost_down_avg;
+  const int64_t branch_and_bound_lp_iters = bnb_stats.total_lp_iters;
+  const int64_t branch_and_bound_explored = bnb_stats.nodes_explored;
+  const i_t branch_and_bound_lp_iter_per_node =
+    branch_and_bound_lp_iters / bnb_stats.nodes_explored;
+
+  i_t reliable_threshold = settings.reliability_branching;
+  if (reliable_threshold < 0) {
+    const i_t max_threshold            = reliability_branching_settings.max_reliable_threshold;
+    const i_t min_threshold            = reliability_branching_settings.min_reliable_threshold;
+    const f_t iter_factor              = reliability_branching_settings.bnb_lp_factor;
+    const i_t iter_offset              = reliability_branching_settings.bnb_lp_offset;
+    const int64_t alpha                = iter_factor * branch_and_bound_lp_iters;
+    const int64_t max_reliability_iter = alpha + reliability_branching_settings.bnb_lp_offset;
+
+    f_t iter_fraction =
+      (max_reliability_iter - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0);
+    iter_fraction = std::min(1.0, iter_fraction);
+    iter_fraction = std::max((alpha - strong_branching_lp_iter) / (strong_branching_lp_iter + 1.0),
+                             iter_fraction);
+    reliable_threshold = (1 - iter_fraction) * min_threshold + iter_fraction * max_threshold;
+    reliable_threshold = strong_branching_lp_iter < max_reliability_iter ? reliable_threshold : 0;
+  }
+
+  std::vector<i_t> unreliable_list;
+  omp_mutex_t score_mutex;
+
+  for (i_t j : fractional) {
+    if (pseudo_cost_num_down[j] < reliable_threshold ||
+        pseudo_cost_num_up[j] < reliable_threshold) {
+      unreliable_list.push_back(j);
+      continue;
     }
 
-    if (pseudo_cost_num_up[j] != 0) {
-      pseudo_cost_up[k] = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
-    } else {
-      pseudo_cost_up[k] = pseudo_cost_up_avg;
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
     }
-    constexpr f_t eps = 1e-6;
-    const f_t f_down  = solution[j] - std::floor(solution[j]);
-    const f_t f_up    = std::ceil(solution[j]) - solution[j];
-    score[k] =
-      std::max(f_down * pseudo_cost_down[k], eps) * std::max(f_up * pseudo_cost_up[k], eps);
   }
 
-  i_t branch_var = fractional[0];
-  f_t max_score  = -1;
-  i_t select     = -1;
-  for (i_t k = 0; k < num_fractional; k++) {
-    if (score[k] > max_score) {
-      max_score  = score[k];
-      branch_var = fractional[k];
-      select     = k;
+  if (unreliable_list.empty()) {
+    log.printf(
+      "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
+
+    return branch_var;
+  }
+
+  const int num_tasks          = std::max(max_num_tasks, 1);
+  const int task_priority      = reliability_branching_settings.task_priority;
+  const i_t max_num_candidates = reliability_branching_settings.max_num_candidates;
+  const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
+
+  assert(task_priority > 0);
+  assert(max_num_candidates > 0);
+  assert(num_candidates > 0);
+  assert(num_tasks > 0);
+
+  log.printf(
+    "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n",
+    strong_branching_lp_iter.load(),
+    branch_and_bound_lp_iters,
+    unreliable_list.size(),
+    num_tasks,
+    reliable_threshold);
+
+  // Shuffle the unreliable list so every variable has the same chance to be selected.
+  if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); }
+
+  if (toc(start_time) > settings.time_limit) {
+    log.printf("Time limit reached");
+    return branch_var;
+  }
+
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \
+  shared(score_mutex)
+  for (i_t i = 0; i < num_candidates; ++i) {
+    const i_t j = unreliable_list[i];
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    pseudo_cost_mutex_down[j].lock();
+    if (pseudo_cost_num_down[j] < reliable_threshold) {
+      // Do trial branching on the down branch
+      f_t obj = trial_branching(worker->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker->leaf_edge_norms,
+                                worker->basis_factors,
+                                worker->basic_list,
+                                worker->nonbasic_list,
+                                j,
+                                worker->leaf_problem.lower[j],
+                                std::floor(solution[j]),
+                                upper_bound,
+                                branch_and_bound_lp_iter_per_node,
+                                start_time,
+                                reliability_branching_settings.upper_max_lp_iter,
+                                reliability_branching_settings.lower_max_lp_iter,
+                                strong_branching_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+        f_t change_in_x   = solution[j] - std::floor(solution[j]);
+        pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_down[j]++;
+      }
     }
+    pseudo_cost_mutex_down[j].unlock();
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    pseudo_cost_mutex_up[j].lock();
+    if (pseudo_cost_num_up[j] < reliable_threshold) {
+      f_t obj = trial_branching(worker->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker->leaf_edge_norms,
+                                worker->basis_factors,
+                                worker->basic_list,
+                                worker->nonbasic_list,
+                                j,
+                                std::ceil(solution[j]),
+                                worker->leaf_problem.upper[j],
+                                upper_bound,
+                                branch_and_bound_lp_iter_per_node,
+                                start_time,
+                                reliability_branching_settings.upper_max_lp_iter,
+                                reliability_branching_settings.lower_max_lp_iter,
+                                strong_branching_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps);
+        f_t change_in_x   = std::ceil(solution[j]) - solution[j];
+        pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_up[j]++;
+      }
+    }
+    pseudo_cost_mutex_up[j].unlock();
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+
+    f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg);
+
+    score_mutex.lock();
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+    score_mutex.unlock();
   }
 
   log.printf(
-    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
+    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
 
   return branch_var;
 }
@@ -475,8 +733,6 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
                                            f_t lower_bound,
                                            logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-
   const i_t num_fractional = fractional.size();
   f_t estimate             = lower_bound;
 
@@ -487,26 +743,15 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
 
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
-  for (i_t k = 0; k < num_fractional; k++) {
-    const i_t j          = fractional[k];
-    f_t pseudo_cost_down = 0;
-    f_t pseudo_cost_up   = 0;
-
-    if (pseudo_cost_num_down[j] != 0) {
-      pseudo_cost_down = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
-    } else {
-      pseudo_cost_down = pseudo_cost_down_avg;
-    }
-
-    if (pseudo_cost_num_up[j] != 0) {
-      pseudo_cost_up = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
-    } else {
-      pseudo_cost_up = pseudo_cost_up_avg;
-    }
+  for (i_t j : fractional) {
     constexpr f_t eps = 1e-6;
-    const f_t f_down  = solution[j] - std::floor(solution[j]);
-    const f_t f_up    = std::ceil(solution[j]) - solution[j];
-    estimate += std::min(pseudo_cost_down * f_down, pseudo_cost_up * f_up);
+    i_t num_up        = pseudo_cost_num_up[j];
+    i_t num_down      = pseudo_cost_num_down[j];
+    f_t pc_up         = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg;
+    f_t pc_down       = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg;
+    f_t f_down        = solution[j] - std::floor(solution[j]);
+    f_t f_up          = std::ceil(solution[j]) - solution[j];
+    estimate += std::min(pc_down * f_down, pc_up * f_up);
   }
 
   log.printf("pseudocost estimate = %e\n", estimate);
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 4bdde710f..1d3b0deef 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -7,16 +7,52 @@
 
 #pragma once
 
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/branch_and_bound_worker.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/types.hpp>
 #include <utilities/omp_helpers.hpp>
+#include <utilities/pcgenerator.hpp>
 
 #include <omp.h>
 
 namespace cuopt::linear_programming::dual_simplex {
 
+template <typename i_t, typename f_t>
+struct reliability_branching_settings_t {
+  // Lower bound for the maximum number of LP iterations for a single trial branching
+  i_t lower_max_lp_iter = 10;
+
+  // Upper bound for the maximum number of LP iterations for a single trial branching
+  i_t upper_max_lp_iter = 500;
+
+  // Priority of the tasks created when running the trial branching in parallel.
+  // Set to 1 to have the same priority as the other tasks.
+  i_t task_priority = 5;
+
+  // The maximum number of candidates initialized by strong branching in a single
+  // node
+  i_t max_num_candidates = 100;
+
+  // Define the maximum number of iteration spent in strong branching.
+  // Let `bnb_lp_iter` = total number of iterations in B&B, then
+  // `max iter in strong branching = bnb_lp_factor * bnb_lp_iter + bnb_lp_offset`.
+  // This is used for determining the `reliable_threshold`.
+  f_t bnb_lp_factor = 0.5;
+  i_t bnb_lp_offset = 100000;
+
+  // Maximum and minimum points in curve to determine the value
+  // of the `reliable_threshold` based on the current number of LP
+  // iterations in strong branching and B&B. Since it is a
+  // a curve, the actual value of `reliable_threshold` may be
+  // higher than `max_reliable_threshold`.
+  // Only used when `reliable_threshold` is negative
+  i_t max_reliable_threshold = 5;
+  i_t min_reliable_threshold = 1;
+};
+
 template <typename i_t, typename f_t>
 class pseudo_costs_t {
  public:
@@ -24,7 +60,9 @@ class pseudo_costs_t {
     : pseudo_cost_sum_down(num_variables),
       pseudo_cost_sum_up(num_variables),
       pseudo_cost_num_down(num_variables),
-      pseudo_cost_num_up(num_variables)
+      pseudo_cost_num_up(num_variables),
+      pseudo_cost_mutex_up(num_variables),
+      pseudo_cost_mutex_down(num_variables)
   {
   }
 
@@ -36,6 +74,8 @@ class pseudo_costs_t {
     pseudo_cost_sum_up.assign(num_variables, 0);
     pseudo_cost_num_down.assign(num_variables, 0);
     pseudo_cost_num_up.assign(num_variables, 0);
+    pseudo_cost_mutex_up.resize(num_variables);
+    pseudo_cost_mutex_down.resize(num_variables);
   }
 
   void initialized(i_t& num_initialized_down,
@@ -52,17 +92,37 @@ class pseudo_costs_t {
                          const std::vector<f_t>& solution,
                          logger_t& log);
 
+  i_t reliable_variable_selection(mip_node_t<i_t, f_t>* node_ptr,
+                                  const std::vector<i_t>& fractional,
+                                  const std::vector<f_t>& solution,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                                  const std::vector<variable_type_t>& var_types,
+                                  branch_and_bound_worker_t<i_t, f_t>* worker,
+                                  const branch_and_bound_stats_t<i_t, f_t>& bnb_stats,
+                                  f_t upper_bound,
+                                  int max_num_tasks,
+                                  logger_t& log);
+
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);
-  std::vector<f_t> pseudo_cost_sum_up;
-  std::vector<f_t> pseudo_cost_sum_down;
-  std::vector<i_t> pseudo_cost_num_up;
-  std::vector<i_t> pseudo_cost_num_down;
+
+  f_t calculate_pseudocost_score(i_t j,
+                                 const std::vector<f_t>& solution,
+                                 f_t pseudo_cost_up_avg,
+                                 f_t pseudo_cost_down_avg) const;
+
+  reliability_branching_settings_t<i_t, f_t> reliability_branching_settings;
+
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_up;
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_down;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_up;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_down;
   std::vector<f_t> strong_branch_down;
   std::vector<f_t> strong_branch_up;
-
-  omp_mutex_t mutex;
+  std::vector<omp_mutex_t> pseudo_cost_mutex_up;
+  std::vector<omp_mutex_t> pseudo_cost_mutex_down;
   omp_atomic_t<i_t> num_strong_branches_completed = 0;
+  omp_atomic_t<int64_t> strong_branching_lp_iter  = 0;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index f9911ee53..34c384949 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -22,18 +22,25 @@ namespace cuopt::linear_programming::dual_simplex {
 
 template <typename i_t, typename f_t>
 struct diving_heuristics_settings_t {
-  i_t num_diving_workers = -1;
-
   // -1 automatic, 0 disabled, 1 enabled
   i_t line_search_diving = -1;
   i_t pseudocost_diving  = -1;
   i_t guided_diving      = -1;
   i_t coefficient_diving = -1;
 
-  i_t min_node_depth         = 10;
-  i_t node_limit             = 500;
+  // The minimum depth to start diving from.
+  i_t min_node_depth = 10;
+
+  // The maximum number of nodes when performing a dive.
+  i_t node_limit = 500;
+
+  // The maximum number of dual simplex iteration allowed
+  // in a single dive. This set in terms of the total number of
+  // iterations in the best-first threads.
   f_t iteration_limit_factor = 0.05;
-  i_t backtrack_limit        = 5;
+
+  // The maximum backtracking allowed.
+  i_t backtrack_limit = 5;
 };
 
 template <typename i_t, typename f_t>
@@ -95,16 +102,14 @@ struct simplex_solver_settings_t {
       reduced_cost_strengthening(-1),
       cut_change_threshold(1e-3),
       cut_min_orthogonality(0.5),
-      num_bfs_workers(std::max(num_threads / 4, 1)),
       random_seed(0),
+      reliability_branching(-1),
       inside_mip(0),
       sub_mip(0),
-      reliability_branching(-1),
       solution_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       concurrent_halt(nullptr)
   {
-    diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1);
   }
 
   void set_log(bool logging) const { log.log = logging; }
@@ -175,15 +180,20 @@ struct simplex_solver_settings_t {
                                    // strengthening
   f_t cut_change_threshold;        // threshold for cut change
   f_t cut_min_orthogonality;       // minimum orthogonality for cuts
-  i_t num_bfs_workers;             // number of threads dedicated to the best-first search
   i_t mip_batch_pdlp_strong_branching{0};  // 0 if not using batch PDLP for strong branching, 1 if
                                            // using batch PDLP for strong branching
 
   diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
 
+  // Settings for the reliability branching.
+  // - -1: automatic
+  // - 0: disable (use pseudocost branching instead)
+  // - k > 0, a variable is considered reliable if it has been branched on k times.
+  i_t reliability_branching;
+
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   i_t sub_mip;     // 0 if in regular MIP solve, 1 if in sub-MIP solve
-  i_t reliability_branching;  // -1 automatic, 0 to disable, >0 to enable reliability branching
+
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
   std::function<void()> heuristic_preemption_callback;
diff --git a/cpp/src/dual_simplex/solution.hpp b/cpp/src/dual_simplex/solution.hpp
index a678e2fd7..5739cedae 100644
--- a/cpp/src/dual_simplex/solution.hpp
+++ b/cpp/src/dual_simplex/solution.hpp
@@ -72,8 +72,8 @@ class mip_solution_t {
   std::vector<f_t> x;
   f_t objective;
   f_t lower_bound;
-  i_t nodes_explored;
-  i_t simplex_iterations;
+  int64_t nodes_explored;
+  int64_t simplex_iterations;
   bool has_incumbent;
 };
 
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 9dc6ac9c5..0858bb75a 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -98,6 +98,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0},
+    {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits<i_t>::max(), -1}
   };
 
     // Bool parameters
diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu
index cb086e145..7b05f1964 100644
--- a/cpp/src/mip/diversity/lns/rins.cu
+++ b/cpp/src/mip/diversity/lns/rins.cu
@@ -259,20 +259,13 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
   branch_and_bound_settings.relative_mip_gap_tol =
     std::min(current_mip_gap, (f_t)settings.target_mip_gap);
-  branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-  branch_and_bound_settings.num_threads     = 2;
-  branch_and_bound_settings.num_bfs_workers = 1;
-  branch_and_bound_settings.max_cut_passes  = 0;
-  branch_and_bound_settings.sub_mip         = 1;
-
-  // In the future, let RINS use all the diving heuristics. For now,
-  // restricting to guided diving.
-  branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-  branch_and_bound_settings.diving_settings.line_search_diving = 0;
-  branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-  branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
-  branch_and_bound_settings.log.log                            = false;
-  branch_and_bound_settings.log.log_prefix                     = "[RINS] ";
+  branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+  branch_and_bound_settings.num_threads = 1;
+  branch_and_bound_settings.reliability_branching = 0;
+  branch_and_bound_settings.max_cut_passes        = 0;
+  branch_and_bound_settings.sub_mip               = 1;
+  branch_and_bound_settings.log.log               = false;
+  branch_and_bound_settings.log.log_prefix        = "[RINS] ";
   branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector<f_t>& solution,
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index e636e7471..6e1c4d4ae 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -104,19 +104,12 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.print_presolve_stats = false;
       branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-      branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-      branch_and_bound_settings.num_threads     = 2;
-      branch_and_bound_settings.num_bfs_workers = 1;
-      branch_and_bound_settings.max_cut_passes  = 0;
-      branch_and_bound_settings.sub_mip         = 1;
-
-      // In the future, let SubMIP use all the diving heuristics. For now,
-      // restricting to guided diving.
-      branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-      branch_and_bound_settings.diving_settings.line_search_diving = 0;
-      branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-      branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
-      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
+      branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+      branch_and_bound_settings.num_threads = 1;
+      branch_and_bound_settings.reliability_branching = 0;
+      branch_and_bound_settings.max_cut_passes        = 0;
+      branch_and_bound_settings.sub_mip               = 1;
+      branch_and_bound_settings.solution_callback     = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
       };
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 2829babc9..8583a76f8 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -180,9 +180,10 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     branch_and_bound_settings.print_presolve_stats = false;
     branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
     branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-    branch_and_bound_settings.integer_tol    = context.settings.tolerances.integrality_tolerance;
-    branch_and_bound_settings.max_cut_passes = context.settings.max_cut_passes;
-    branch_and_bound_settings.mir_cuts       = context.settings.mir_cuts;
+    branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+    branch_and_bound_settings.reliability_branching = solver_settings_.reliability_branching;
+    branch_and_bound_settings.max_cut_passes        = context.settings.max_cut_passes;
+    branch_and_bound_settings.mir_cuts              = context.settings.mir_cuts;
     branch_and_bound_settings.mixed_integer_gomory_cuts =
       context.settings.mixed_integer_gomory_cuts;
     branch_and_bound_settings.knapsack_cuts = context.settings.knapsack_cuts;
@@ -192,21 +193,15 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       context.settings.reduced_cost_strengthening;
     branch_and_bound_settings.cut_change_threshold  = context.settings.cut_change_threshold;
     branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality;
+    branch_and_bound_settings.mip_batch_pdlp_strong_branching =
+      context.settings.mip_batch_pdlp_strong_branching;
 
     if (context.settings.num_cpu_threads < 0) {
-      branch_and_bound_settings.num_threads = omp_get_max_threads() - 1;
+      branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1);
     } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
 
-    i_t num_threads                           = branch_and_bound_settings.num_threads;
-    i_t num_bfs_workers                       = std::max(1, num_threads / 4);
-    i_t num_diving_workers                    = std::max(1, num_threads - num_bfs_workers);
-    branch_and_bound_settings.num_bfs_workers = num_bfs_workers;
-    branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers;
-    branch_and_bound_settings.mip_batch_pdlp_strong_branching =
-      context.settings.mip_batch_pdlp_strong_branching;
-
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
       std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index e1b68bf88..f6e66472d 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -10,7 +10,8 @@
 #ifdef _OPENMP
 
 #include <omp.h>
-#include <type_traits>
+#include <memory>
+#include <utility>
 
 namespace cuopt {
 
@@ -18,14 +19,39 @@ namespace cuopt {
 // https://www.openmp.org/spec-html/5.1/openmpse39.html#x224-2570003.9
 class omp_mutex_t {
  public:
-  omp_mutex_t() { omp_init_lock(&mutex); }
-  virtual ~omp_mutex_t() { omp_destroy_lock(&mutex); }
-  void lock() { omp_set_lock(&mutex); }
-  void unlock() { omp_unset_lock(&mutex); }
-  bool try_lock() { return omp_test_lock(&mutex); }
+  omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); }
+
+  omp_mutex_t(const omp_mutex_t&) = delete;
+
+  omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); }
+
+  omp_mutex_t& operator=(const omp_mutex_t&) = delete;
+
+  omp_mutex_t& operator=(omp_mutex_t&& other)
+  {
+    if (&other != this) {
+      if (mutex) { omp_destroy_lock(mutex.get()); }
+      mutex = std::move(other.mutex);
+    }
+    return *this;
+  }
+
+  virtual ~omp_mutex_t()
+  {
+    if (mutex) {
+      omp_destroy_lock(mutex.get());
+      mutex.reset();
+    }
+  }
+
+  void lock() { omp_set_lock(mutex.get()); }
+
+  void unlock() { omp_unset_lock(mutex.get()); }
+
+  bool try_lock() { return omp_test_lock(mutex.get()); }
 
  private:
-  omp_lock_t mutex;
+  std::unique_ptr<omp_lock_t> mutex;
 };
 
 // Wrapper for omp atomic operations. See
@@ -42,7 +68,7 @@ class omp_atomic_t {
     return new_val;
   }
 
-  operator T() { return load(); }
+  operator T() const { return load(); }
   T operator+=(T inc) { return fetch_add(inc) + inc; }
   T operator-=(T inc) { return fetch_sub(inc) - inc; }
 
diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp
new file mode 100644
index 000000000..29a865f02
--- /dev/null
+++ b/cpp/src/utilities/pcgenerator.hpp
@@ -0,0 +1,159 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <stdint.h>
+#include <utility>
+#include <vector>
+
+// Copied from raft/PCGenerator (rng_device.cuh). We cannot use raft implementation
+// on the CPU (.cpp file) since the raft header includes CUDA code.
+// The original code is from https://www.pcg-random.org/.
+namespace cuopt {
+class pcgenerator_t {
+ public:
+  static constexpr uint64_t default_seed   = 0x853c49e6748fea9bULL;
+  static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL;
+
+  /**
+   * @brief ctor. Initializes the PCG
+   * @param rng_state is the generator state used for initializing the generator
+   * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences
+   * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a
+   * unique set of random numbers. This can be achieved by initializing the generator with same
+   * rng_state for all the threads and diststreamt values for subsequence.
+   */
+  pcgenerator_t(const uint64_t seed        = default_seed,
+                const uint64_t subsequence = default_stream,
+                uint64_t offset            = 0)
+  {
+    set_seed(seed, subsequence, offset);
+  }
+
+  // Set the seed, subsequence and offset of the PCG
+  void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0)
+  {
+    state  = uint64_t(0);
+    stream = (subsequence << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    state += seed;
+    next(discard);
+    skipahead(offset);
+  }
+
+  // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
+  // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
+  void skipahead(uint64_t offset)
+  {
+    uint64_t G = 1;
+    uint64_t h = 6364136223846793005ULL;
+    uint64_t C = 0;
+    uint64_t f = stream;
+    while (offset) {
+      if (offset & 1) {
+        G = G * h;
+        C = C * h + f;
+      }
+      f = f * (h + 1);
+      h = h * h;
+      offset >>= 1;
+    }
+    state = state * G + C;
+  }
+
+  /**
+   * @defgroup NextRand Generate the next random number
+   * @brief This code is derived from PCG basic code
+   * @{
+   */
+  uint32_t next_u32()
+  {
+    uint32_t ret;
+    uint64_t oldstate   = state;
+    state               = oldstate * 6364136223846793005ULL + stream;
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot        = oldstate >> 59u;
+    ret                 = (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+    return ret;
+  }
+
+  uint64_t next_u64()
+  {
+    uint64_t ret;
+    uint32_t a, b;
+    a   = next_u32();
+    b   = next_u32();
+    ret = uint64_t(a) | (uint64_t(b) << 32);
+    return ret;
+  }
+
+  int32_t next_i32()
+  {
+    int32_t ret;
+    uint32_t val;
+    val = next_u32();
+    ret = int32_t(val & 0x7fffffff);
+    return ret;
+  }
+
+  int64_t next_i64()
+  {
+    int64_t ret;
+    uint64_t val;
+    val = next_u64();
+    ret = int64_t(val & 0x7fffffffffffffff);
+    return ret;
+  }
+
+  float next_float() { return static_cast<float>((next_u32() >> 8) * 0x1.0p-24); }
+
+  double next_double() { return static_cast<double>((next_u64() >> 11) * 0x1.0p-53); }
+
+  template <typename T>
+  T next()
+  {
+    T val;
+    next(val);
+    return val;
+  }
+
+  void next(uint32_t& ret) { ret = next_u32(); }
+  void next(uint64_t& ret) { ret = next_u64(); }
+  void next(int32_t& ret) { ret = next_i32(); }
+  void next(int64_t& ret) { ret = next_i64(); }
+  void next(float& ret) { ret = next_float(); }
+  void next(double& ret) { ret = next_double(); }
+
+  /// Draws a sample from a uniform distribution. The samples are uniformly distributed over
+  /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward
+  /// some numbers in the range (scaling by floating-point).
+  template <typename T>
+  T uniform(T low, T high)
+  {
+    double val = next_double();
+    T range    = high - low;
+    return low + (val * range);
+  }
+
+  // Shuffles the contents of a sequence using the Fisher–Yates algorithm.
+  template <typename T>
+  void shuffle(std::vector<T>& seq)
+  {
+    if (seq.empty()) { return; }
+    for (size_t i = 0; i < seq.size() - 1; ++i) {
+      size_t j = uniform(i, seq.size());
+      if (j != i) std::swap(seq[i], seq[j]);
+    }
+  }
+
+ private:
+  uint64_t state;
+  uint64_t stream;
+};
+}  // namespace cuopt