diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index 213e38e5e..71034ff12 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -147,6 +147,7 @@ int run_single_file(std::string file_path,
                     int num_cpu_threads,
                     bool write_log_file,
                     bool log_to_console,
+                    bool reliability_branching,
                     double time_limit)
 {
   const raft::handle_t handle_{};
@@ -197,6 +198,8 @@ int run_single_file(std::string file_path,
     }
   }
 
+  CUOPT_LOG_INFO("Reliability branching: %d\n", reliability_branching);
+
   settings.time_limit                    = time_limit;
   settings.heuristics_only               = heuristics_only;
   settings.num_cpu_threads               = num_cpu_threads;
@@ -204,6 +207,7 @@ int run_single_file(std::string file_path,
   settings.tolerances.relative_tolerance = 1e-12;
   settings.tolerances.absolute_tolerance = 1e-6;
   settings.presolve                      = true;
+  settings.reliability_branching         = reliability_branching;
   cuopt::linear_programming::benchmark_info_t benchmark_info;
   settings.benchmark_info_ptr = &benchmark_info;
   auto start_run_solver       = std::chrono::high_resolution_clock::now();
@@ -256,6 +260,7 @@ void run_single_file_mp(std::string file_path,
                         int num_cpu_threads,
                         bool write_log_file,
                         bool log_to_console,
+                        bool reliability_branching,
                         double time_limit)
 {
   std::cout << "running file " << file_path << " on gpu : " << device << std::endl;
@@ -271,6 +276,7 @@ void run_single_file_mp(std::string file_path,
                                   num_cpu_threads,
                                   write_log_file,
                                   log_to_console,
+                                  reliability_branching,
                                   time_limit);
   // this is a bad design to communicate the result but better than adding complexity of IPC or
   // pipes
@@ -354,6 +360,10 @@ int main(int argc, char* argv[])
     .help("track allocations (t/f)")
     .default_value(std::string("f"));
 
+  program.add_argument("--reliability-branching")
+    .help("enable reliability branching (t/f)")
+    .default_value(std::string("t"));
+
   // Parse arguments
   try {
     program.parse_args(argc, argv);
@@ -376,12 +386,13 @@ int main(int argc, char* argv[])
   std::string result_file;
   int batch_num = -1;
 
-  bool heuristics_only   = program.get<std::string>("--heuristics-only")[0] == 't';
-  int num_cpu_threads    = program.get<int>("--num-cpu-threads");
-  bool write_log_file    = program.get<std::string>("--write-log-file")[0] == 't';
-  bool log_to_console    = program.get<std::string>("--log-to-console")[0] == 't';
-  double memory_limit    = program.get<double>("--memory-limit");
-  bool track_allocations = program.get<std::string>("--track-allocations")[0] == 't';
+  bool heuristics_only       = program.get<std::string>("--heuristics-only")[0] == 't';
+  int num_cpu_threads        = program.get<int>("--num-cpu-threads");
+  bool write_log_file        = program.get<std::string>("--write-log-file")[0] == 't';
+  bool log_to_console        = program.get<std::string>("--log-to-console")[0] == 't';
+  double memory_limit        = program.get<double>("--memory-limit");
+  bool track_allocations     = program.get<std::string>("--track-allocations")[0] == 't';
+  bool reliability_branching = program.get<std::string>("--reliability-branching")[0] == 't';
 
   if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; }
 
@@ -469,6 +480,7 @@ int main(int argc, char* argv[])
                                num_cpu_threads,
                                write_log_file,
                                log_to_console,
+                               reliability_branching,
                                time_limit);
           } else if (sys_pid < 0) {
             std::cerr << "Fork failed!" << std::endl;
@@ -509,6 +521,7 @@ int main(int argc, char* argv[])
                     num_cpu_threads,
                     write_log_file,
                     log_to_console,
+                    reliability_branching,
                     time_limit);
   }
 
diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
index 4f6320752..f5cb1c80e 100644
--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -83,6 +83,9 @@ class mip_solver_settings_t {
   i_t num_cpu_threads  = -1;  // -1 means use default number of threads in branch and bound
   i_t num_gpus         = 1;
   bool log_to_console  = true;
+
+  bool reliability_branching = true;
+
   std::string log_file;
   std::string sol_file;
   std::string user_problem_file;
diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt
index af1415fa9..3e72e33cb 100644
--- a/cpp/src/dual_simplex/CMakeLists.txt
+++ b/cpp/src/dual_simplex/CMakeLists.txt
@@ -13,7 +13,6 @@ set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/crossover.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/folding.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/initial_basis.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/phase1.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/phase2.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve.cpp
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index afd4f4c9a..038db59b9 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -223,6 +223,9 @@ class basis_update_mpf_t {
     reset_stats();
   }
 
+  basis_update_mpf_t(const basis_update_mpf_t& other)            = default;
+  basis_update_mpf_t& operator=(const basis_update_mpf_t& other) = default;
+
   void print_stats() const
   {
     i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_;
diff --git a/cpp/src/dual_simplex/bnb_worker.hpp b/cpp/src/dual_simplex/bnb_worker.hpp
new file mode 100644
index 000000000..e770edd84
--- /dev/null
+++ b/cpp/src/dual_simplex/bnb_worker.hpp
@@ -0,0 +1,274 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/phase2.hpp>
+
+#include <utilities/pcg.hpp>
+
+#include <array>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+constexpr int bnb_num_worker_types = 5;
+
+// Indicate the search and variable selection algorithms used by each thread
+// in B&B (See [1]).
+//
+// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
+// Berlin, 2007. doi: 10.14279/depositonce-1634.
+enum bnb_worker_type_t : int {
+  BEST_FIRST         = 0,  // Best-First + Plunging.
+  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
+  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
+  GUIDED_DIVING      = 3,  // Guided diving (9.2.3).
+  COEFFICIENT_DIVING = 4   // Coefficient diving (9.2.1)
+};
+
+template <typename i_t, typename f_t>
+struct bnb_stats_t {
+  f_t start_time                         = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time  = 0.0;
+  omp_atomic_t<int64_t> nodes_explored   = 0;
+  omp_atomic_t<int64_t> nodes_unexplored = 0;
+  omp_atomic_t<int64_t> total_lp_iters   = 0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+  omp_atomic_t<f_t> last_log             = 0.0;
+};
+
+template <typename i_t, typename f_t>
+class bnb_worker_data_t {
+ public:
+  const i_t worker_id;
+  omp_atomic_t<bnb_worker_type_t> worker_type;
+  omp_atomic_t<bool> is_active;
+  omp_atomic_t<f_t> lower_bound;
+
+  lp_problem_t<i_t, f_t> leaf_problem;
+  lp_solution_t<i_t, f_t> leaf_solution;
+  std::vector<f_t> leaf_edge_norms;
+
+  basis_update_mpf_t<i_t, f_t> basis_factors;
+  std::vector<i_t> basic_list;
+  std::vector<i_t> nonbasic_list;
+
+  bounds_strengthening_t<i_t, f_t> node_presolver;
+  std::vector<bool> bounds_changed;
+
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  mip_node_t<i_t, f_t>* start_node;
+
+  PCG rng;
+
+  bool recompute_basis  = true;
+  bool recompute_bounds = true;
+
+  bnb_worker_data_t(i_t worker_id,
+                    const lp_problem_t<i_t, f_t>& original_lp,
+                    const csr_matrix_t<i_t, f_t>& Arow,
+                    const std::vector<variable_type_t>& var_type,
+                    const simplex_solver_settings_t<i_t, f_t>& settings)
+    : worker_id(worker_id),
+      worker_type(BEST_FIRST),
+      is_active(false),
+      lower_bound(-std::numeric_limits<f_t>::infinity()),
+      leaf_problem(original_lp),
+      leaf_solution(original_lp.num_rows, original_lp.num_cols),
+      basis_factors(original_lp.num_rows, settings.refactor_frequency),
+      basic_list(original_lp.num_rows),
+      nonbasic_list(),
+      node_presolver(leaf_problem, Arow, {}, var_type),
+      bounds_changed(original_lp.num_cols, false),
+      rng(PCG::default_seed ^ worker_id, PCG::default_stream + worker_id)
+  {
+  }
+
+  // Set the `start_node` for best-first search.
+  void init_best_first(mip_node_t<i_t, f_t>* node, const lp_problem_t<i_t, f_t>& original_lp)
+  {
+    start_node  = node;
+    start_lower = original_lp.lower;
+    start_upper = original_lp.upper;
+    worker_type = BEST_FIRST;
+    lower_bound = node->lower_bound;
+    is_active   = true;
+  }
+
+  // Initialize the worker for diving, setting the `start_node`, `start_lower` and
+  // `start_upper`. Returns `true` if the starting node is feasible via
+  // bounds propagation.
+  bool init_diving(mip_node_t<i_t, f_t>* node,
+                   bnb_worker_type_t type,
+                   const lp_problem_t<i_t, f_t>& original_lp,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    internal_node = node->detach_copy();
+    start_node    = &internal_node;
+
+    start_lower = original_lp.lower;
+    start_upper = original_lp.upper;
+    worker_type = type;
+    lower_bound = node->lower_bound;
+    is_active   = true;
+
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
+
+    return node_presolver.bounds_strengthening(start_lower, start_upper, bounds_changed, settings);
+  }
+
+  // Set the variables bounds for the LP relaxation of the current node.
+  bool set_lp_variable_bounds_for(mip_node_t<i_t, f_t>* node_ptr,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    // Reset the bound_changed markers
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+
+    // Set the correct bounds for the leaf problem
+    if (recompute_bounds) {
+      leaf_problem.lower = start_lower;
+      leaf_problem.upper = start_upper;
+      node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
+
+    } else {
+      node_ptr->update_branched_variable_bounds(
+        leaf_problem.lower, leaf_problem.upper, bounds_changed);
+    }
+
+    return node_presolver.bounds_strengthening(
+      leaf_problem.lower, leaf_problem.upper, bounds_changed, settings);
+  }
+
+ private:
+  // For diving, we need to store the full node instead of
+  // of just a pointer, since it is not store in the tree anymore.
+  // To keep the same interface across all worker types,
+  // this will be used as a temporary storage and
+  // will be pointed by `start_node`.
+  // For exploration, this will not be used.
+  mip_node_t<i_t, f_t> internal_node;
+};
+
+template <typename i_t, typename f_t>
+class bnb_worker_pool_t {
+ public:
+  void init(i_t num_workers,
+            const lp_problem_t<i_t, f_t>& original_lp,
+            const csr_matrix_t<i_t, f_t>& Arow,
+            const std::vector<variable_type_t>& var_type,
+            const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    workers_.resize(num_workers);
+    num_idle_workers_ = num_workers;
+    for (i_t i = 0; i < num_workers; ++i) {
+      workers_[i] =
+        std::make_unique<bnb_worker_data_t<i_t, f_t>>(i, original_lp, Arow, var_type, settings);
+      idle_workers_.push_front(i);
+    }
+  }
+
+  // Here, we are assuming that the master is the only
+  // thread that can retrieve/pop an idle worker.
+  bnb_worker_data_t<i_t, f_t>* get_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (idle_workers_.empty()) {
+      return nullptr;
+    } else {
+      i_t idx = idle_workers_.front();
+      return workers_[idx].get();
+    }
+  }
+
+  // Here, we are assuming that the master is the only
+  // thread that can retrieve/pop an idle worker.
+  void pop_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (!idle_workers_.empty()) {
+      idle_workers_.pop_front();
+      num_idle_workers_--;
+    }
+  }
+
+  void return_worker_to_pool(bnb_worker_data_t<i_t, f_t>* worker)
+  {
+    worker->is_active = false;
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    idle_workers_.push_back(worker->worker_id);
+    num_idle_workers_++;
+  }
+
+  f_t get_lower_bounds()
+  {
+    f_t lower_bound = std::numeric_limits<f_t>::infinity();
+
+    for (i_t i = 0; i < workers_.size(); ++i) {
+      if (workers_[i]->worker_type == BEST_FIRST && workers_[i]->is_active) {
+        lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound);
+      }
+    }
+
+    return lower_bound;
+  }
+
+  i_t num_idle_workers() { return num_idle_workers_; }
+
+ private:
+  // Worker pool
+  std::vector<std::unique_ptr<bnb_worker_data_t<i_t, f_t>>> workers_;
+
+  omp_mutex_t mutex_;
+  std::deque<i_t> idle_workers_;
+  omp_atomic_t<i_t> num_idle_workers_;
+};
+
+template <typename f_t, typename i_t>
+std::vector<bnb_worker_type_t> bnb_get_worker_types(diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  std::vector<bnb_worker_type_t> types;
+  types.reserve(bnb_num_worker_types);
+  types.push_back(BEST_FIRST);
+  if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); }
+  if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); }
+  if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); }
+  if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); }
+  return types;
+}
+
+template <typename i_t>
+std::array<i_t, bnb_num_worker_types> bnb_get_max_workers(
+  i_t num_workers, std::vector<bnb_worker_type_t> worker_types)
+{
+  std::array<i_t, bnb_num_worker_types> max_num_workers;
+  max_num_workers.fill(0);
+
+  i_t bfs_workers = std::max(worker_types.size() == 1 ? num_workers : num_workers / 4, 1);
+  max_num_workers[BEST_FIRST] = bfs_workers;
+
+  i_t diving_workers = (num_workers - bfs_workers);
+  i_t m              = worker_types.size() - 1;
+
+  for (size_t i = 1, k = 0; i < worker_types.size(); ++i) {
+    i_t start                        = (double)k * diving_workers / m;
+    i_t end                          = (double)(k + 1) * diving_workers / m;
+    max_num_workers[worker_types[i]] = end - start;
+    ++k;
+  }
+
+  return max_num_workers;
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index 4114e7e09..712f25837 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -59,8 +59,7 @@ bounds_strengthening_t<i_t, f_t>::bounds_strengthening_t(
   const csr_matrix_t<i_t, f_t>& Arow,
   const std::vector<char>& row_sense,
   const std::vector<variable_type_t>& var_types)
-  : bounds_changed(problem.num_cols, false),
-    A(problem.A),
+  : A(problem.A),
     Arow(Arow),
     var_types(var_types),
     delta_min_activity(problem.num_rows),
@@ -93,6 +92,7 @@ template <typename i_t, typename f_t>
 bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
   std::vector<f_t>& lower_bounds,
   std::vector<f_t>& upper_bounds,
+  const std::vector<bool>& bounds_changed,
   const simplex_solver_settings_t<i_t, f_t>& settings)
 {
   const i_t m = A.m;
diff --git a/cpp/src/dual_simplex/bounds_strengthening.hpp b/cpp/src/dual_simplex/bounds_strengthening.hpp
index e7e218b82..6a43ef44c 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.hpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -22,10 +22,9 @@ class bounds_strengthening_t {
 
   bool bounds_strengthening(std::vector<f_t>& lower_bounds,
                             std::vector<f_t>& upper_bounds,
+                            const std::vector<bool>& bounds_changed,
                             const simplex_solver_settings_t<i_t, f_t>& settings);
 
-  std::vector<bool> bounds_changed;
-
  private:
   const csc_matrix_t<i_t, f_t>& A;
   const csr_matrix_t<i_t, f_t>& Arow;
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 7fbe90284..2ab39d1d9 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -250,18 +250,14 @@ f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
   f_t lower_bound      = lower_bound_ceiling_.load();
   f_t heap_lower_bound = node_queue_.get_lower_bound();
   lower_bound          = std::min(heap_lower_bound, lower_bound);
-
-  for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
-    lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
-  }
-
+  lower_bound          = std::min(worker_pool_.get_lower_bounds(), lower_bound);
   return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 {
-  if (is_running) {
+  if (is_running_) {
     f_t user_obj         = compute_user_objective(original_lp_, obj);
     f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
     std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
@@ -286,7 +282,7 @@ void branch_and_bound_t<i_t, f_t>::report(char symbol, f_t obj, f_t lower_bound,
   i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
   f_t user_obj         = compute_user_objective(original_lp_, obj);
   f_t user_lower       = compute_user_objective(original_lp_, lower_bound);
-  f_t iter_node        = exploration_stats_.total_lp_iters / nodes_explored;
+  f_t iter_node        = (f_t)exploration_stats_.total_lp_iters / nodes_explored;
   std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
   settings_.log.printf("%c %10d   %10lu    %+13.6e    %+10.6e  %6d    %7.1e     %s %9.2f\n",
                        symbol,
@@ -569,24 +565,37 @@ template <typename i_t, typename f_t>
 branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   mip_node_t<i_t, f_t>* node_ptr,
   const std::vector<i_t>& fractional,
-  const std::vector<f_t>& solution,
-  bnb_worker_type_t type)
+  bnb_worker_data_t<i_t, f_t>* worker_data)
 {
   logger_t log;
   log.log                        = false;
   i_t branch_var                 = -1;
   rounding_direction_t round_dir = rounding_direction_t::NONE;
   std::vector<f_t> current_incumbent;
+  std::vector<f_t>& solution = worker_data->leaf_solution.x;
 
-  // If there is no incumbent, use pseudocost diving instead of guided diving
-  if (upper_bound_ == inf && type == bnb_worker_type_t::GUIDED_DIVING) {
-    type = bnb_worker_type_t::PSEUDOCOST_DIVING;
-  }
-
-  switch (type) {
+  switch (worker_data->worker_type) {
     case bnb_worker_type_t::BEST_FIRST:
-      branch_var = pc_.variable_selection(fractional, solution, log);
-      round_dir  = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
+
+      if (settings_.reliability_branching_settings.enable) {
+        simplex_solver_settings_t<i_t, f_t> rb_settings      = settings_;
+        rb_settings.reliability_branching_settings.num_tasks = worker_pool_.num_idle_workers();
+
+        branch_var = pc_.reliable_variable_selection(node_ptr,
+                                                     fractional,
+                                                     solution,
+                                                     rb_settings,
+                                                     var_types_,
+                                                     worker_data,
+                                                     exploration_stats_,
+                                                     upper_bound_,
+                                                     log);
+      } else {
+        branch_var = pc_.variable_selection(fractional, solution, log);
+      }
+
+      round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
+
       return {branch_var, round_dir};
 
     case bnb_worker_type_t::COEFFICIENT_DIVING:
@@ -606,29 +615,19 @@ branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
       return guided_diving(pc_, fractional, solution, current_incumbent, log);
 
     default:
-      log.debug("Unknown variable selection method: %d\n", type);
+      log.debug("Unknown variable selection method: %d\n", worker_data->worker_type);
       return {-1, rounding_direction_t::NONE};
   }
 }
 
 template <typename i_t, typename f_t>
-dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
-  mip_node_t<i_t, f_t>* node_ptr,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  basis_update_mpf_t<i_t, f_t>& basis_factors,
-  std::vector<i_t>& basic_list,
-  std::vector<i_t>& nonbasic_list,
-  bounds_strengthening_t<i_t, f_t>& node_presolver,
-  bnb_worker_type_t thread_type,
-  bool recompute_bounds_and_basis,
-  const std::vector<f_t>& root_lower,
-  const std::vector<f_t>& root_upper,
-  bnb_stats_t<i_t, f_t>& stats,
-  logger_t& log)
+dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
+                                                           bnb_worker_data_t<i_t, f_t>* worker_data,
+                                                           bnb_stats_t<i_t, f_t>& stats,
+                                                           logger_t& log)
 {
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  assert(leaf_vstatus.size() == leaf_problem.num_cols);
+  assert(leaf_vstatus.size() == worker_data->leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
@@ -637,10 +636,10 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
-  if (thread_type != bnb_worker_type_t::BEST_FIRST) {
-    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+  if (worker_data->worker_type != bnb_worker_type_t::BEST_FIRST) {
+    int64_t bnb_lp_iters        = exploration_stats_.total_lp_iters;
     f_t factor                  = settings_.diving_settings.iteration_limit_factor;
-    i_t max_iter                = factor * bnb_lp_iters;
+    int64_t max_iter            = factor * bnb_lp_iters;
     lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
     if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; }
   }
@@ -667,56 +666,40 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
     node_ptr->vstatus[node_ptr->branch_var]);
 #endif
 
-  // Reset the bound_changed markers
-  std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-
-  // Set the correct bounds for the leaf problem
-  if (recompute_bounds_and_basis) {
-    leaf_problem.lower = root_lower;
-    leaf_problem.upper = root_upper;
-    node_ptr->get_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed);
-
-  } else {
-    node_ptr->update_branched_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed);
-  }
-
-  bool feasible =
-    node_presolver.bounds_strengthening(leaf_problem.lower, leaf_problem.upper, lp_settings);
-
-  dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
+  bool feasible                = worker_data->set_lp_variable_bounds_for(node_ptr, settings_);
+  dual::status_t lp_status     = dual::status_t::DUAL_UNBOUNDED;
+  worker_data->leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
 
   if (feasible) {
-    i_t node_iter                    = 0;
-    f_t lp_start_time                = tic();
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
+    i_t node_iter     = 0;
+    f_t lp_start_time = tic();
 
     lp_status = dual_phase2_with_advanced_basis(2,
                                                 0,
-                                                recompute_bounds_and_basis,
+                                                worker_data->recompute_basis,
                                                 lp_start_time,
-                                                leaf_problem,
+                                                worker_data->leaf_problem,
                                                 lp_settings,
                                                 leaf_vstatus,
-                                                basis_factors,
-                                                basic_list,
-                                                nonbasic_list,
-                                                leaf_solution,
+                                                worker_data->basis_factors,
+                                                worker_data->basic_list,
+                                                worker_data->nonbasic_list,
+                                                worker_data->leaf_solution,
                                                 node_iter,
-                                                leaf_edge_norms);
+                                                worker_data->leaf_edge_norms);
 
     if (lp_status == dual::status_t::NUMERICAL) {
       log.printf("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id);
-      lp_status_t second_status = solve_linear_program_with_advanced_basis(leaf_problem,
-                                                                           lp_start_time,
-                                                                           lp_settings,
-                                                                           leaf_solution,
-                                                                           basis_factors,
-                                                                           basic_list,
-                                                                           nonbasic_list,
-                                                                           leaf_vstatus,
-                                                                           leaf_edge_norms);
+      lp_status_t second_status =
+        solve_linear_program_with_advanced_basis(worker_data->leaf_problem,
+                                                 lp_start_time,
+                                                 lp_settings,
+                                                 worker_data->leaf_solution,
+                                                 worker_data->basis_factors,
+                                                 worker_data->basic_list,
+                                                 worker_data->nonbasic_list,
+                                                 leaf_vstatus,
+                                                 worker_data->leaf_edge_norms);
 
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
@@ -731,19 +714,18 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   return lp_status;
 }
-
 template <typename i_t, typename f_t>
 std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
   mip_node_t<i_t, f_t>* node_ptr,
   search_tree_t<i_t, f_t>& search_tree,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  lp_solution_t<i_t, f_t>& leaf_solution,
-  bnb_worker_type_t thread_type,
+  bnb_worker_data_t<i_t, f_t>* worker_data,
   dual::status_t lp_status,
   logger_t& log)
 {
   const f_t abs_fathom_tol                     = settings_.absolute_mip_gap_tol / 10;
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+  lp_problem_t<i_t, f_t>& leaf_problem         = worker_data->leaf_problem;
+  lp_solution_t<i_t, f_t>& leaf_solution       = worker_data->leaf_solution;
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
@@ -766,12 +748,12 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
     i_t leaf_num_fractional =
       fractional_variables(settings_, leaf_solution.x, var_types_, leaf_fractional);
 
-    f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
-    node_ptr->lower_bound = leaf_objective;
+    f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x);
     search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
     pc_.update_pseudo_costs(node_ptr, leaf_objective);
+    node_ptr->lower_bound = leaf_objective;
 
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) {
       if (settings_.node_processed_callback != nullptr) {
         std::vector<f_t> original_x;
         uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
@@ -781,15 +763,15 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 
     if (leaf_num_fractional == 0) {
       // Found a integer feasible solution
-      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
+      add_feasible_solution(
+        leaf_objective, leaf_solution.x, node_ptr->depth, worker_data->worker_type);
       search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
       search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
       return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
 
     } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      auto [branch_var, round_dir] =
-        variable_selection(node_ptr, leaf_fractional, leaf_solution.x, thread_type);
+      auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker_data);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
       assert(branch_var >= 0);
@@ -798,7 +780,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       // Note that the exploration thread is the only one that can insert new nodes into the heap,
       // and thus, we only need to calculate the objective estimate here (it is used for
       // sorting the nodes for diving).
-      if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+      if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) {
         logger_t pc_log;
         pc_log.log = false;
         node_ptr->objective_estimate =
@@ -816,7 +798,7 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
       return {node_status_t::FATHOMED, rounding_direction_t::NONE};
     }
   } else {
-    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+    if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
       log.printf(
         "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
@@ -834,133 +816,20 @@ std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::upd
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                                                       i_t initial_heap_size)
+void branch_and_bound_t<i_t, f_t>::plunge_with(bnb_worker_data_t<i_t, f_t>* worker_data,
+                                               mip_solve_mode_t mode)
 {
-  if (solver_status_ != mip_status_t::UNSET) { return; }
-
-  // Note that we do not know which thread will execute the
-  // `exploration_ramp_up` task, so we allow to any thread
-  // to repair the heuristic solution.
-  repair_heuristic_solutions();
-
-  f_t lower_bound = node->lower_bound;
-  f_t upper_bound = upper_bound_;
-  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  f_t abs_gap     = upper_bound - lower_bound;
-
-  if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-    search_tree_.graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
-    search_tree_.update(node, node_status_t::FATHOMED);
-    --exploration_stats_.nodes_unexplored;
-    return;
-  }
-
-  f_t now = toc(exploration_stats_.start_time);
-  f_t time_since_last_log =
-    exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-  if (((exploration_stats_.nodes_since_last_log >= 10 ||
-        abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-       (time_since_last_log >= 1)) ||
-      (time_since_last_log > 30) || now > settings_.time_limit) {
-    bool should_report = should_report_.exchange(false);
-
-    if (should_report) {
-      report(' ', upper_bound, root_objective_, node->depth);
-      exploration_stats_.nodes_since_last_log = 0;
-      exploration_stats_.last_log             = tic();
-      should_report_                          = true;
-    }
-  }
-
-  if (now > settings_.time_limit) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-  dual::status_t lp_status = solve_node_lp(node,
-                                           leaf_problem,
-                                           leaf_solution,
-                                           basis_factors,
-                                           basic_list,
-                                           nonbasic_list,
-                                           node_presolver,
-                                           bnb_worker_type_t::BEST_FIRST,
-                                           true,
-                                           original_lp_.lower,
-                                           original_lp_.upper,
-                                           exploration_stats_,
-                                           settings_.log);
-  if (lp_status == dual::status_t::TIME_LIMIT) {
-    solver_status_ = mip_status_t::TIME_LIMIT;
-    return;
-  }
-
-  ++exploration_stats_.nodes_since_last_log;
-  ++exploration_stats_.nodes_explored;
-  --exploration_stats_.nodes_unexplored;
-
-  auto [node_status, round_dir] = update_tree(node,
-                                              search_tree_,
-                                              leaf_problem,
-                                              leaf_solution,
-                                              bnb_worker_type_t::BEST_FIRST,
-                                              lp_status,
-                                              settings_.log);
-
-  if (node_status == node_status_t::HAS_CHILDREN) {
-    exploration_stats_.nodes_unexplored += 2;
-
-    // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
-    if (exploration_stats_.nodes_unexplored < initial_heap_size) {
-#pragma omp task
-      exploration_ramp_up(node->get_down_child(), initial_heap_size);
-
-#pragma omp task
-      exploration_ramp_up(node->get_up_child(), initial_heap_size);
-
-    } else {
-      // We've generated enough nodes, push further nodes onto the heap
-      node_queue_.push(node->get_down_child());
-      node_queue_.push(node->get_up_child());
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
-                                               mip_node_t<i_t, f_t>* start_node,
-                                               lp_problem_t<i_t, f_t>& leaf_problem,
-                                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                               std::vector<i_t>& basic_list,
-                                               std::vector<i_t>& nonbasic_list)
-{
-  bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(start_node);
-
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
-    if (task_id == 0) { repair_heuristic_solutions(); }
+  stack.push_front(worker_data->start_node);
+  worker_data->recompute_basis  = true;
+  worker_data->recompute_bounds = true;
 
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
     f_t lower_bound = node_ptr->lower_bound;
     f_t upper_bound = upper_bound_;
-    f_t abs_gap     = upper_bound - lower_bound;
     f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
     // This is based on three assumptions:
@@ -969,56 +838,29 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     // - The current node and its siblings uses the lower bound of the parent before solving the LP
     // relaxation
     // - The lower bound of the parent is lower or equal to its children
-    assert(task_id < local_lower_bounds_.size());
-    local_lower_bounds_[task_id] = lower_bound;
+    worker_data->lower_bound = lower_bound;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
       search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree_.update(node_ptr, node_status_t::FATHOMED);
-      recompute_bounds_and_basis = true;
+      worker_data->recompute_basis  = true;
+      worker_data->recompute_bounds = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
 
-    f_t now = toc(exploration_stats_.start_time);
-
-    if (task_id == 0) {
-      f_t time_since_last_log =
-        exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-      if (((exploration_stats_.nodes_since_last_log >= 1000 ||
-            abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-           time_since_last_log >= 1) ||
-          (time_since_last_log > 30) || now > settings_.time_limit) {
-        report(' ', upper_bound, get_lower_bound(), node_ptr->depth);
-        exploration_stats_.last_log             = tic();
-        exploration_stats_.nodes_since_last_log = 0;
-      }
-    }
-
-    if (now > settings_.time_limit) {
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
       solver_status_ = mip_status_t::TIME_LIMIT;
       break;
     }
+
     if (exploration_stats_.nodes_explored >= settings_.node_limit) {
       solver_status_ = mip_status_t::NODE_LIMIT;
       break;
     }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    dual::status_t lp_status = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             bnb_worker_type_t::BEST_FIRST,
-                                             recompute_bounds_and_basis,
-                                             original_lp_.lower,
-                                             original_lp_.upper,
-                                             exploration_stats_,
-                                             settings_.log);
+    dual::status_t lp_status =
+      solve_node_lp(node_ptr, worker_data, exploration_stats_, settings_.log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1031,15 +873,11 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
     ++exploration_stats_.nodes_explored;
     --exploration_stats_.nodes_unexplored;
 
-    auto [node_status, round_dir] = update_tree(node_ptr,
-                                                search_tree_,
-                                                leaf_problem,
-                                                leaf_solution,
-                                                bnb_worker_type_t::BEST_FIRST,
-                                                lp_status,
-                                                settings_.log);
+    auto [node_status, round_dir] =
+      update_tree(node_ptr, search_tree_, worker_data, lp_status, settings_.log);
 
-    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
+    worker_data->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       // The stack should only contain the children of the current parent.
@@ -1055,102 +893,45 @@ void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
       exploration_stats_.nodes_unexplored += 2;
 
       if (round_dir == rounding_direction_t::UP) {
-        stack.push_front(node_ptr->get_down_child());
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_down_child());
+        } else {
+          stack.push_front(node_ptr->get_down_child());
+        }
+
         stack.push_front(node_ptr->get_up_child());
       } else {
-        stack.push_front(node_ptr->get_up_child());
-        stack.push_front(node_ptr->get_down_child());
-      }
-    }
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id)
-{
-  f_t lower_bound = -inf;
-  f_t abs_gap     = inf;
-  f_t rel_gap     = inf;
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_up_child());
+        } else {
+          stack.push_front(node_ptr->get_up_child());
+        }
 
-  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
-         rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    // In the current implementation, we are use the active number of subtree to decide
-    // when to stop the execution. We need to increment the counter at the same
-    // time as we pop a node from the queue to avoid some threads exiting
-    // the main loop thinking that the solver has already finished.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    // If there any node left in the heap, we pop the top node and explore it.
-    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
-    if (start_node.has_value()) { active_subtrees_++; };
-    node_queue_.unlock();
-
-    if (start_node.has_value()) {
-      if (upper_bound_ < start_node.value()->lower_bound) {
-        // This node was put on the heap earlier but its lower bound is now greater than the
-        // current upper bound
-        search_tree_.graphviz_node(
-          settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-        search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-        active_subtrees_--;
-        continue;
+        stack.push_front(node_ptr->get_down_child());
       }
-
-      // Best-first search with plunging
-      plunge_from(task_id,
-                  start_node.value(),
-                  leaf_problem,
-                  node_presolver,
-                  basis_factors,
-                  basic_list,
-                  nonbasic_list);
-
-      active_subtrees_--;
     }
-
-    lower_bound = get_lower_bound();
-    abs_gap     = upper_bound_ - lower_bound;
-    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   }
 
-  is_running = false;
-
-  // Check if it is the last thread that exited the loop and no
-  // timeout or numerical error has happen.
-  if (solver_status_ == mip_status_t::UNSET) {
-    if (active_subtrees_ > 0) { local_lower_bounds_[task_id] = inf; }
+  if (mode == mip_solve_mode_t::BNB_PARALLEL) {
+    worker_pool_.return_worker_to_pool(worker_data);
+    active_workers_per_type_[BEST_FIRST]--;
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
-                                             const std::vector<f_t>& start_lower,
-                                             const std::vector<f_t>& start_upper,
-                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                             bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                             basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                             std::vector<i_t>& basic_list,
-                                             std::vector<i_t>& nonbasic_list,
-                                             bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::dive_with(bnb_worker_data_t<i_t, f_t>* worker_data)
 {
   logger_t log;
   log.log = false;
 
+  bnb_worker_type_t diving_type    = worker_data->worker_type;
   const i_t diving_node_limit      = settings_.diving_settings.node_limit;
   const i_t diving_backtrack_limit = settings_.diving_settings.backtrack_limit;
-  bool recompute_bounds_and_basis  = true;
-  search_tree_t<i_t, f_t> dive_tree(std::move(start_node));
+
+  worker_data->recompute_basis  = true;
+  worker_data->recompute_bounds = true;
+
+  search_tree_t<i_t, f_t> dive_tree(std::move(*worker_data->start_node));
   std::deque<mip_node_t<i_t, f_t>*> stack;
   stack.push_front(&dive_tree.root);
 
@@ -1160,36 +941,25 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
   dive_stats.nodes_explored      = 0;
   dive_stats.nodes_unexplored    = 1;
 
-  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running_) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
-    f_t lower_bound = node_ptr->lower_bound;
-    f_t upper_bound = upper_bound_;
-    f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    f_t lower_bound          = node_ptr->lower_bound;
+    f_t upper_bound          = upper_bound_;
+    f_t rel_gap              = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    worker_data->lower_bound = lower_bound;
 
     if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-      recompute_bounds_and_basis = true;
+      worker_data->recompute_basis  = true;
+      worker_data->recompute_bounds = true;
       continue;
     }
 
     if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
     if (dive_stats.nodes_explored > diving_node_limit) { break; }
 
-    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
-    dual::status_t lp_status = solve_node_lp(node_ptr,
-                                             leaf_problem,
-                                             leaf_solution,
-                                             basis_factors,
-                                             basic_list,
-                                             nonbasic_list,
-                                             node_presolver,
-                                             diving_type,
-                                             recompute_bounds_and_basis,
-                                             start_lower,
-                                             start_upper,
-                                             dive_stats,
-                                             log);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker_data, dive_stats, log);
 
     if (lp_status == dual::status_t::TIME_LIMIT) {
       solver_status_ = mip_status_t::TIME_LIMIT;
@@ -1200,9 +970,10 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
 
     ++dive_stats.nodes_explored;
 
-    auto [node_status, round_dir] =
-      update_tree(node_ptr, dive_tree, leaf_problem, leaf_solution, diving_type, lp_status, log);
-    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
+    auto [node_status, round_dir] = update_tree(node_ptr, dive_tree, worker_data, lp_status, log);
+
+    worker_data->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
     if (node_status == node_status_t::HAS_CHILDREN) {
       if (round_dir == rounding_direction_t::UP) {
@@ -1220,68 +991,220 @@ void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
       stack.pop_back();
     }
   }
+
+  worker_pool_.return_worker_to_pool(worker_data);
+  active_workers_per_type_[diving_type]--;
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::run_scheduler()
+{
+  diving_heuristics_settings_t<i_t, f_t> diving_settings = settings_.diving_settings;
+  const i_t num_workers                                  = 2 * settings_.num_threads;
+
+  if (!std::isfinite(upper_bound_)) { diving_settings.guided_diving = false; }
+  std::vector<bnb_worker_type_t> worker_types = bnb_get_worker_types(diving_settings);
+  std::array<i_t, bnb_num_worker_types> max_num_workers_per_type =
+    bnb_get_max_workers(num_workers, worker_types);
+
+#ifdef CUOPT_LOG_DEBUG
+  for (auto type : worker_types) {
+    settings_.log.debug("%c%d: max num of workers = %d",
+                        feasible_solution_symbol(type),
+                        type,
+                        max_num_workers_per_type[type]);
+  }
+#endif
+
+  worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_);
+  active_workers_per_type_.fill(0);
+
+  f_t lower_bound     = get_lower_bound();
+  f_t abs_gap         = upper_bound_ - lower_bound;
+  f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+  i_t last_node_depth = 0;
+
+  is_running_ = true;
+
+  settings_.log.printf(
+    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
+    "|  Time  |\n");
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol &&
+         (active_workers_per_type_[0] > 0 || node_queue_.best_first_queue_size() > 0)) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    // If the guided diving was disabled previously due to the lack of an incumbent solution,
+    // re-enable as soon as a new incumbent is found.
+    if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) {
+      if (std::isfinite(upper_bound_)) {
+        diving_settings.guided_diving = settings_.diving_settings.guided_diving;
+        worker_types                  = bnb_get_worker_types(diving_settings);
+        max_num_workers_per_type      = bnb_get_max_workers(num_workers, worker_types);
+
+#ifdef CUOPT_LOG_DEBUG
+        for (auto type : worker_types) {
+          settings_.log.debug("%c%d: max num of workers = %d",
+                              feasible_solution_symbol(type),
+                              type,
+                              max_num_workers_per_type[type]);
+        }
+#endif
+      }
+    }
+
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t depth =
+        node_queue_.best_first_queue_size() > 0 ? node_queue_.bfs_top()->depth : last_node_depth;
+      report(' ', upper_bound_, lower_bound, depth);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
+    }
+
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    }
+
+    for (auto type : worker_types) {
+      if (active_workers_per_type_[type] >= max_num_workers_per_type[type]) { continue; }
+
+      // Get an idle worker.
+      bnb_worker_data_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
+      if (worker == nullptr) { break; }
+
+      if (type == BEST_FIRST) {
+        // If there any node left in the heap, we pop the top node and explore it.
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound) {
+          // This node was put on the heap earlier but its lower bound is now greater than the
+          // current upper bound
+          search_tree_.graphviz_node(
+            settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+          search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+          continue;
+        }
+
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        worker->init_best_first(start_node.value(), original_lp_);
+        last_node_depth = start_node.value()->depth;
+        active_workers_per_type_[type]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        plunge_with(worker, mip_solve_mode_t::BNB_PARALLEL);
+
+      } else {
+        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
+
+        if (!start_node.has_value()) { continue; }
+        if (upper_bound_ < start_node.value()->lower_bound ||
+            start_node.value()->depth < diving_settings.min_node_depth) {
+          continue;
+        }
+
+        bool is_feasible = worker->init_diving(start_node.value(), type, original_lp_, settings_);
+        if (!is_feasible) { continue; }
+
+        // Remove the worker from the idle list.
+        worker_pool_.pop_idle_worker();
+        active_workers_per_type_[type]++;
+        launched_any_task = true;
+
+#pragma omp task affinity(worker)
+        dive_with(worker);
+      }
+    }
+
+    // If no new task was launched in this iteration, suspend temporarily the
+    // execution of the master. As of 8/Jan/2026, GCC does not
+    // implement taskyield, but LLVM does.
+    if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+  }
+
+#pragma omp taskwait
+  is_running_ = false;
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::diving_thread(bnb_worker_type_t diving_type)
+void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
 {
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  std::vector<f_t> start_lower;
-  std::vector<f_t> start_upper;
-  bool reset_starting_bounds = true;
-
-  while (solver_status_ == mip_status_t::UNSET && is_running &&
-         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
-    if (reset_starting_bounds) {
-      start_lower = original_lp_.lower;
-      start_upper = original_lp_.upper;
-      std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-      reset_starting_bounds = false;
+  bnb_worker_data_t<i_t, f_t> worker(0, original_lp_, Arow_, var_types_, settings_);
+
+  f_t lower_bound     = get_lower_bound();
+  f_t abs_gap         = upper_bound_ - lower_bound;
+  f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+  i_t last_node_depth = 0;
+
+  is_running_ = true;
+
+  settings_.log.printf(
+    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
+    "|  Time  |\n");
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) {
+    bool launched_any_task = false;
+    lower_bound            = get_lower_bound();
+    abs_gap                = upper_bound_ - lower_bound;
+    rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    repair_heuristic_solutions();
+
+    f_t now = toc(exploration_stats_.start_time);
+    f_t time_since_last_log =
+      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+         time_since_last_log >= 1) ||
+        (time_since_last_log > 30) || now > settings_.time_limit) {
+      i_t depth =
+        node_queue_.best_first_queue_size() > 0 ? node_queue_.bfs_top()->depth : last_node_depth;
+      report(' ', upper_bound_, lower_bound, depth);
+      exploration_stats_.last_log             = tic();
+      exploration_stats_.nodes_since_last_log = 0;
     }
 
-    // In the current implementation, multiple threads can pop the nodes
-    // from the queue, so we need to initialize the lower and upper bound here
-    // to avoid other thread fathoming the node (i.e., deleting) before we can read
-    // the variable bounds from the tree.
-    // This will be not needed in the master-worker model.
-    node_queue_.lock();
-    std::optional<mip_node_t<i_t, f_t>*> node_ptr  = node_queue_.pop_diving();
-    std::optional<mip_node_t<i_t, f_t>> start_node = std::nullopt;
-
-    if (node_ptr.has_value()) {
-      node_ptr.value()->get_variable_bounds(
-        start_lower, start_upper, node_presolver.bounds_changed);
-      start_node = node_ptr.value()->detach_copy();
+    if (now > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
     }
-    node_queue_.unlock();
-
-    if (start_node.has_value()) {
-      reset_starting_bounds = true;
-
-      if (upper_bound_ < start_node->lower_bound) { continue; }
-      bool is_feasible = node_presolver.bounds_strengthening(start_lower, start_upper, settings_);
-      if (!is_feasible) { continue; }
-
-      dive_from(start_node.value(),
-                start_lower,
-                start_upper,
-                leaf_problem,
-                node_presolver,
-                basis_factors,
-                basic_list,
-                nonbasic_list,
-                diving_type);
+
+    // If there any node left in the heap, we pop the top node and explore it.
+    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+    if (!start_node.has_value()) { continue; }
+    if (upper_bound_ < start_node.value()->lower_bound) {
+      // This node was put on the heap earlier but its lower bound is now greater than the
+      // current upper bound
+      search_tree_.graphviz_node(
+        settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+      search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+      continue;
     }
+
+    worker.init_best_first(start_node.value(), original_lp_);
+    plunge_with(&worker, mip_solve_mode_t::BNB_SINGLE_THREADED);
   }
+
+  is_running_ = false;
 }
 
 template <typename i_t, typename f_t>
@@ -1295,15 +1218,19 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
   // Root node path
   lp_status_t root_status;
-  std::future<lp_status_t> root_status_future;
-  root_status_future = std::async(std::launch::async,
-                                  &solve_linear_program_advanced<i_t, f_t>,
-                                  std::ref(original_lp_),
-                                  exploration_stats_.start_time,
-                                  std::ref(lp_settings),
-                                  std::ref(root_relax_soln_),
-                                  std::ref(root_vstatus_),
-                                  std::ref(edge_norms_));
+
+  // Note that we need to explicitly declared `root_status` as a shared variable here since
+  // it is local to the thread that are executing the enclosing task.
+#pragma omp task shared(root_status)
+  {
+    root_status = solve_linear_program_advanced<i_t, f_t>(original_lp_,
+                                                          exploration_stats_.start_time,
+                                                          lp_settings,
+                                                          root_relax_soln_,
+                                                          root_vstatus_,
+                                                          edge_norms_);
+  }
+
   // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex
   // to finish
   while (!root_crossover_solution_set_.load(std::memory_order_acquire) &&
@@ -1346,7 +1273,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
     // Check if crossover was stopped by dual simplex
     if (crossover_status == crossover_status_t::OPTIMAL) {
       set_root_concurrent_halt(1);  // Stop dual simplex
-      root_status = root_status_future.get();
+#pragma omp taskwait
 
       // Override the root relaxation solution with the crossover solution
       root_relax_soln_ = root_crossover_soln_;
@@ -1357,13 +1284,13 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       solver_name      = "Barrier/PDLP and Crossover";
 
     } else {
-      root_status    = root_status_future.get();
+#pragma omp taskwait
       user_objective = root_relax_soln_.user_objective;
       iter           = root_relax_soln_.iterations;
       solver_name    = "Dual Simplex";
     }
   } else {
-    root_status    = root_status_future.get();
+#pragma omp taskwait
     user_objective = root_relax_soln_.user_objective;
     iter           = root_relax_soln_.iterations;
     solver_name    = "Dual Simplex";
@@ -1377,7 +1304,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                          solver_name.c_str());
     settings_.log.printf("Root relaxation objective %+.8e\n", user_objective);
   } else {
-    settings_.log.printf("Root relaxation returned status: %s\n",
+    settings_.log.printf("Root relaxation returned: %s\n",
                          lp_status_to_string(root_status).c_str());
   }
 
@@ -1388,41 +1315,18 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 }
 
 template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solution)
+mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solution,
+                                                 mip_solve_mode_t solve_mode)
 {
   logger_t log;
   log.log                             = false;
   log.log_prefix                      = settings_.log.log_prefix;
   solver_status_                      = mip_status_t::UNSET;
-  is_running                          = false;
+  is_running_                         = false;
   exploration_stats_.nodes_unexplored = 0;
   exploration_stats_.nodes_explored   = 0;
   original_lp_.A.to_compressed_row(Arow_);
 
-  std::vector<bnb_worker_type_t> diving_strategies;
-  diving_strategies.reserve(4);
-
-  if (settings_.diving_settings.pseudocost_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::PSEUDOCOST_DIVING);
-  }
-
-  if (settings_.diving_settings.line_search_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::LINE_SEARCH_DIVING);
-  }
-
-  if (settings_.diving_settings.guided_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::GUIDED_DIVING);
-  }
-
-  if (settings_.diving_settings.coefficient_diving != 0) {
-    diving_strategies.push_back(bnb_worker_type_t::COEFFICIENT_DIVING);
-    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
-  }
-
-  if (diving_strategies.empty()) {
-    settings_.log.printf("Warning: All diving heuristics are disabled!\n");
-  }
-
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
     crush_primal_solution(original_problem_, original_lp_, guess_, new_slacks_, crushed_guess);
@@ -1446,6 +1350,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.inside_mip                = 1;
   lp_settings.concurrent_halt           = get_root_concurrent_halt();
+
   // RINS/SUBMIP path
   if (!enable_concurrent_lp_root_solve()) {
     settings_.log.printf("\nSolving LP root relaxation with dual simplex\n");
@@ -1492,7 +1397,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   set_uninitialized_steepest_edge_norms<i_t, f_t>(edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
-  local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -1567,59 +1471,28 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       root_vstatus_,
                       original_lp_,
                       log);
+  node_queue_.push(search_tree_.root.get_down_child());
+  node_queue_.push(search_tree_.root.get_up_child());
 
-  settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n\n",
-                       settings_.num_threads,
-                       settings_.num_bfs_workers,
-                       settings_.num_threads - settings_.num_bfs_workers);
+  settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads);
 
   exploration_stats_.nodes_explored       = 0;
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
-  active_subtrees_                        = 0;
-  is_running                              = true;
   lower_bound_ceiling_                    = inf;
-  should_report_                          = true;
-
-  settings_.log.printf(
-    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
-    "|  Time  |\n");
-
-#pragma omp parallel num_threads(settings_.num_threads)
-  {
-#pragma omp master
-    {
-      auto down_child          = search_tree_.root.get_down_child();
-      auto up_child            = search_tree_.root.get_up_child();
-      i_t initial_size         = 2 * settings_.num_threads;
-      const i_t num_strategies = diving_strategies.size();
-
-#pragma omp taskgroup
-      {
-#pragma omp task
-        exploration_ramp_up(down_child, initial_size);
-
-#pragma omp task
-        exploration_ramp_up(up_child, initial_size);
-      }
+  min_node_queue_size_                    = 2 * settings_.num_threads;
 
-      for (i_t i = 0; i < settings_.num_bfs_workers; i++) {
-#pragma omp task
-        best_first_thread(i);
-      }
+  if (settings_.diving_settings.coefficient_diving != 0) {
+    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
+  }
 
-      if (!diving_strategies.empty()) {
-        for (i_t k = 0; k < settings_.diving_settings.num_diving_workers; k++) {
-          const bnb_worker_type_t diving_type = diving_strategies[k % num_strategies];
-#pragma omp task
-          diving_thread(diving_type);
-        }
-      }
-    }
+  if (solve_mode == mip_solve_mode_t::BNB_PARALLEL) {
+    run_scheduler();
+  } else {
+    single_threaded_solve();
   }
 
-  is_running      = false;
   f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
                                                             : search_tree_.root.lower_bound;
   set_final_solution(solution, lower_bound);
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 327f99bc4..3efaaa6d6 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <dual_simplex/bnb_worker.hpp>
 #include <dual_simplex/diving_heuristics.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/mip_node.hpp>
@@ -35,38 +36,14 @@ enum class mip_status_t {
   UNSET      = 6,  // The status is not set
 };
 
-// Indicate the search and variable selection algorithms used by each thread
-// in B&B (See [1]).
-//
-// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
-// Berlin, 2007. doi: 10.14279/depositonce-1634.
-enum class bnb_worker_type_t {
-  BEST_FIRST         = 0,  // Best-First + Plunging.
-  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
-  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
-  GUIDED_DIVING = 3,  // Guided diving (9.2.3). If no incumbent is found yet, use pseudocost diving.
-  COEFFICIENT_DIVING = 4  // Coefficient diving (9.2.1)
+enum class mip_solve_mode_t {
+  BNB_PARALLEL        = 0,  // Parallel B&B (default)
+  BNB_SINGLE_THREADED = 1,  // Single threaded B&B for SubMIP and RINS
 };
 
-template <typename i_t, typename f_t>
-class bounds_strengthening_t;
-
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
-template <typename i_t, typename f_t>
-struct bnb_stats_t {
-  f_t start_time                        = 0.0;
-  omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-  omp_atomic_t<i_t> nodes_explored      = 0;
-  omp_atomic_t<i_t> nodes_unexplored    = 0;
-  omp_atomic_t<f_t> total_lp_iters      = 0;
-
-  // This should only be used by the main thread
-  omp_atomic_t<f_t> last_log             = 0.0;
-  omp_atomic_t<i_t> nodes_since_last_log = 0;
-};
-
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
@@ -114,7 +91,7 @@ class branch_and_bound_t {
   lp_status_t solve_root_relaxation(simplex_solver_settings_t<i_t, f_t> const& lp_settings);
 
   // The main entry routine. Returns the solver status and populates solution with the incumbent.
-  mip_status_t solve(mip_solution_t<i_t, f_t>& solution);
+  mip_status_t solve(mip_solution_t<i_t, f_t>& solution, mip_solve_mode_t solve_mode);
 
  private:
   const user_problem_t<i_t, f_t>& original_problem_;
@@ -135,9 +112,6 @@ class branch_and_bound_t {
   std::vector<i_t> var_up_locks_;
   std::vector<i_t> var_down_locks_;
 
-  // Local lower bounds for each thread
-  std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
-
   // Mutex for upper bound
   omp_mutex_t mutex_upper_;
 
@@ -175,14 +149,21 @@ class branch_and_bound_t {
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
 
-  // Count the number of subtrees that are currently being explored.
-  omp_atomic_t<i_t> active_subtrees_;
+  // Count the number of workers per type that either are being executed or
+  // are waiting to be executed.
+  std::array<omp_atomic_t<i_t>, bnb_num_worker_types> active_workers_per_type_;
+
+  // Worker pool
+  bnb_worker_pool_t<i_t, f_t> worker_pool_;
 
   // Global status of the solver.
   omp_atomic_t<mip_status_t> solver_status_;
-  omp_atomic_t<bool> is_running{false};
+  omp_atomic_t<bool> is_running_{false};
 
-  omp_atomic_t<bool> should_report_;
+  // Minimum number of node in the queue. When the queue size is less than
+  // this variable, the nodes are added directly to the queue instead of
+  // the local stack. This also determines the end of the ramp-up phase.
+  i_t min_node_queue_size_;
 
   // In case, a best-first thread encounters a numerical issue when solving a node,
   // its blocks the progression of the lower bound.
@@ -204,72 +185,43 @@ class branch_and_bound_t {
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
-  // Ramp-up phase of the solver, where we greedily expand the tree until
-  // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
-  void exploration_ramp_up(mip_node_t<i_t, f_t>* node, i_t initial_heap_size);
-
   // We use best-first to pick the `start_node` and then perform a depth-first search
   // from this node (i.e., a plunge). It can only backtrack to a sibling node.
   // Unexplored nodes in the subtree are inserted back into the global heap.
-  void plunge_from(i_t task_id,
-                   mip_node_t<i_t, f_t>* start_node,
-                   lp_problem_t<i_t, f_t>& leaf_problem,
-                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                   basis_update_mpf_t<i_t, f_t>& basis_update,
-                   std::vector<i_t>& basic_list,
-                   std::vector<i_t>& nonbasic_list);
-
-  // Each "main" thread pops a node from the global heap and then performs a plunge
-  // (i.e., a shallow dive) into the subtree determined by the node.
-  void best_first_thread(i_t task_id);
+  void plunge_with(bnb_worker_data_t<i_t, f_t>* worker_data, mip_solve_mode_t mode);
 
   // Perform a deep dive in the subtree determined by the `start_node` in order
   // to find integer feasible solutions.
-  void dive_from(mip_node_t<i_t, f_t>& start_node,
-                 const std::vector<f_t>& start_lower,
-                 const std::vector<f_t>& start_upper,
-                 lp_problem_t<i_t, f_t>& leaf_problem,
-                 bounds_strengthening_t<i_t, f_t>& node_presolver,
-                 basis_update_mpf_t<i_t, f_t>& basis_update,
-                 std::vector<i_t>& basic_list,
-                 std::vector<i_t>& nonbasic_list,
-                 bnb_worker_type_t diving_type);
-
-  // Each diving thread pops the first node from the dive queue and then performs
-  // a deep dive into the subtree determined by the node.
-  void diving_thread(bnb_worker_type_t diving_type);
+  void dive_with(bnb_worker_data_t<i_t, f_t>* worker_data);
+
+  // Run the scheduler (aka the master) whose will schedule and manage
+  // all the other workers.
+  void run_scheduler();
+
+  // Run the branch-and-bound algorithm in single threaded mode.
+  // This disable all diving heuristics.
+  void single_threaded_solve();
 
   // Solve the LP relaxation of a leaf node
   dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
-                               lp_problem_t<i_t, f_t>& leaf_problem,
-                               lp_solution_t<i_t, f_t>& leaf_solution,
-                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                               std::vector<i_t>& basic_list,
-                               std::vector<i_t>& nonbasic_list,
-                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                               bnb_worker_type_t thread_type,
-                               bool recompute_bounds_and_basis,
-                               const std::vector<f_t>& root_lower,
-                               const std::vector<f_t>& root_upper,
+                               bnb_worker_data_t<i_t, f_t>* worker_data,
                                bnb_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
   // Update the tree based on the LP relaxation. Returns the status
   // of the node and, if appropriated, the preferred rounding direction
   // when visiting the children.
-  std::pair<node_status_t, rounding_direction_t> update_tree(mip_node_t<i_t, f_t>* node_ptr,
-                                                             search_tree_t<i_t, f_t>& search_tree,
-                                                             lp_problem_t<i_t, f_t>& leaf_problem,
-                                                             lp_solution_t<i_t, f_t>& leaf_solution,
-                                                             bnb_worker_type_t thread_type,
-                                                             dual::status_t lp_status,
-                                                             logger_t& log);
+  std::pair<node_status_t, rounding_direction_t> update_tree(
+    mip_node_t<i_t, f_t>* node_ptr,
+    search_tree_t<i_t, f_t>& search_tree,
+    bnb_worker_data_t<i_t, f_t>* worker_data,
+    dual::status_t lp_status,
+    logger_t& log);
 
   // Selects the variable to branch on.
   branch_variable_t<i_t> variable_selection(mip_node_t<i_t, f_t>* node_ptr,
                                             const std::vector<i_t>& fractional,
-                                            const std::vector<f_t>& solution,
-                                            bnb_worker_type_t type);
+                                            bnb_worker_data_t<i_t, f_t>* worker_data);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp
index a56b4cce3..44101c0c6 100644
--- a/cpp/src/dual_simplex/diving_heuristics.cpp
+++ b/cpp/src/dual_simplex/diving_heuristics.cpp
@@ -71,7 +71,6 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
                                          const std::vector<f_t>& root_solution,
                                          logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
   f_t max_score                  = std::numeric_limits<f_t>::lowest();
   rounding_direction_t round_dir = rounding_direction_t::NONE;
@@ -146,7 +145,6 @@ branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
                                      const std::vector<f_t>& incumbent,
                                      logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(pc.mutex);
   i_t branch_var                 = -1;
   f_t max_score                  = std::numeric_limits<f_t>::lowest();
   rounding_direction_t round_dir = rounding_direction_t::NONE;
diff --git a/cpp/src/dual_simplex/mip_node.cpp b/cpp/src/dual_simplex/mip_node.cpp
deleted file mode 100644
index 547950fad..000000000
--- a/cpp/src/dual_simplex/mip_node.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/* clang-format off */
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-/* clang-format on */
-
-#include <dual_simplex/mip_node.hpp>
-
-namespace cuopt::linear_programming::dual_simplex {
-
-bool inactive_status(node_status_t status)
-{
-  return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE ||
-          status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL);
-}
-
-}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index de147132a..610fa66e5 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -29,7 +29,11 @@ enum class node_status_t : int {
 
 enum class rounding_direction_t : int8_t { NONE = -1, DOWN = 0, UP = 1 };
 
-bool inactive_status(node_status_t status);
+inline bool inactive_status(node_status_t status)
+{
+  return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE ||
+          status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL);
+}
 
 template <typename i_t, typename f_t>
 class mip_node_t {
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
index 28072795a..b5c9a9ea7 100644
--- a/cpp/src/dual_simplex/node_queue.hpp
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -114,26 +114,29 @@ class node_queue_t {
 
   std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
     auto entry = best_first_heap.pop();
+
     if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
+
     return std::nullopt;
   }
 
   std::optional<mip_node_t<i_t, f_t>*> pop_diving()
   {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+
     while (!diving_heap.empty()) {
       auto entry = diving_heap.pop();
+
       if (entry.has_value()) {
         if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
       }
     }
+
     return std::nullopt;
   }
 
-  void lock() { mutex.lock(); }
-
-  void unlock() { mutex.unlock(); }
-
   i_t diving_queue_size()
   {
     std::lock_guard<omp_mutex_t> lock(mutex);
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index bbfe18d61..179979a32 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -621,7 +621,7 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
   }
 
   constexpr bool run_bounds_strengthening = false;
-  if (run_bounds_strengthening) {
+  if constexpr (run_bounds_strengthening) {
     csr_matrix_t<i_t, f_t> Arow(1, 1, 1);
     problem.A.to_compressed_row(Arow);
 
@@ -629,8 +629,8 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
 
     // Empty var_types means that all variables are continuous
     bounds_strengthening_t<i_t, f_t> strengthening(problem, Arow, row_sense, {});
-    std::fill(strengthening.bounds_changed.begin(), strengthening.bounds_changed.end(), true);
-    strengthening.bounds_strengthening(problem.lower, problem.upper, settings);
+    std::vector<bool> bounds_changed(problem.num_cols, true);
+    strengthening.bounds_strengthening(problem.lower, problem.upper, bounds_changed, settings);
   }
 
   settings.log.debug(
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 0a8f660e9..31bed6756 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -133,6 +133,77 @@ void strong_branch_helper(i_t start,
   }
 }
 
+template <typename i_t, typename f_t>
+f_t trial_branching(const lp_problem_t<i_t, f_t>& original_lp,
+                    const simplex_solver_settings_t<i_t, f_t>& settings,
+                    const std::vector<variable_type_t>& var_types,
+                    const std::vector<variable_status_t>& vstatus,
+                    const std::vector<f_t>& edge_norms,
+                    const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                    const std::vector<i_t>& basic_list,
+                    const std::vector<i_t>& nonbasic_list,
+                    i_t branch_var,
+                    f_t branch_var_lower,
+                    f_t branch_var_upper,
+                    f_t upper_bound,
+                    i_t bnb_lp_iter_per_node,
+                    f_t start_time,
+                    omp_atomic_t<int64_t>& total_lp_iter)
+{
+  lp_problem_t child_problem      = original_lp;
+  child_problem.lower[branch_var] = branch_var_lower;
+  child_problem.upper[branch_var] = branch_var_upper;
+
+  simplex_solver_settings_t<i_t, f_t> child_settings = settings;
+  child_settings.set_log(false);
+  i_t lp_iter_upper              = settings.reliability_branching_settings.upper_max_lp_iter;
+  i_t lp_iter_lower              = settings.reliability_branching_settings.lower_max_lp_iter;
+  child_settings.iteration_limit = std::clamp(bnb_lp_iter_per_node, lp_iter_lower, lp_iter_upper);
+  child_settings.cut_off         = upper_bound + settings.dual_tol;
+  child_settings.inside_mip      = 2;
+  child_settings.scale_columns   = false;
+
+  lp_solution_t<i_t, f_t> solution(original_lp.num_rows, original_lp.num_cols);
+  i_t iter                                         = 0;
+  std::vector<variable_status_t> child_vstatus     = vstatus;
+  std::vector<f_t> child_edge_norms                = edge_norms;
+  std::vector<i_t> child_basic_list                = basic_list;
+  std::vector<i_t> child_nonbasic_list             = nonbasic_list;
+  basis_update_mpf_t<i_t, f_t> child_basis_factors = basis_factors;
+
+  dual::status_t status = dual_phase2_with_advanced_basis(2,
+                                                          0,
+                                                          false,
+                                                          start_time,
+                                                          child_problem,
+                                                          child_settings,
+                                                          child_vstatus,
+                                                          child_basis_factors,
+                                                          child_basic_list,
+                                                          child_nonbasic_list,
+                                                          solution,
+                                                          iter,
+                                                          child_edge_norms);
+  total_lp_iter += iter;
+  settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n",
+                     branch_var,
+                     child_problem.lower[branch_var],
+                     child_problem.upper[branch_var],
+                     iter,
+                     dual::status_to_string(status).c_str(),
+                     compute_objective(child_problem, solution.x));
+
+  if (status == dual::status_t::DUAL_UNBOUNDED) {
+    // LP was infeasible
+    return std::numeric_limits<f_t>::infinity();
+  } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
+             status == dual::status_t::CUTOFF) {
+    return compute_objective(child_problem, solution.x);
+  } else {
+    return std::numeric_limits<f_t>::quiet_NaN();
+  }
+}
+
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -156,40 +227,37 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       settings.num_threads,
                       fractional.size());
 
-#pragma omp parallel num_threads(settings.num_threads)
-  {
-    i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
-
-    // Here we are creating more tasks than the number of threads
-    // such that they can be scheduled dynamically to the threads.
-#pragma omp for schedule(dynamic, 1)
-    for (i_t k = 0; k < n; k++) {
-      i_t start = std::floor(k * fractional.size() / n);
-      i_t end   = std::floor((k + 1) * fractional.size() / n);
-
-      constexpr bool verbose = false;
-      if (verbose) {
-        settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
-                            omp_get_thread_num(),
-                            k,
-                            start,
-                            end,
-                            end - start);
-      }
-
-      strong_branch_helper(start,
-                           end,
-                           start_time,
-                           original_lp,
-                           settings,
-                           var_types,
-                           fractional,
-                           root_obj,
-                           root_soln,
-                           root_vstatus,
-                           edge_norms,
-                           pc);
+  // Here we are creating more tasks than the number of threads
+  // to overcome any load unbalanced between the iterations
+  i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
+
+#pragma omp taskloop grainsize(1)
+  for (i_t k = 0; k < n; k++) {
+    i_t start = std::floor(k * fractional.size() / n);
+    i_t end   = std::floor((k + 1) * fractional.size() / n);
+
+    constexpr bool verbose = false;
+    if (verbose) {
+      settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
+                          omp_get_thread_num(),
+                          k,
+                          start,
+                          end,
+                          end - start);
     }
+
+    strong_branch_helper(start,
+                         end,
+                         start_time,
+                         original_lp,
+                         settings,
+                         var_types,
+                         fractional,
+                         root_obj,
+                         root_soln,
+                         root_vstatus,
+                         edge_norms,
+                         pc);
   }
 
   pc.update_pseudo_costs_from_strong_branching(fractional, root_soln);
@@ -199,7 +267,6 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
   const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
@@ -217,7 +284,7 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
                                            i_t& num_initialized_up,
                                            f_t& pseudo_cost_down_avg,
-                                           f_t& pseudo_cost_up_avg) const
+                                           f_t& pseudo_cost_up_avg)
 {
   num_initialized_down = 0;
   num_initialized_up   = 0;
@@ -257,8 +324,6 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-
   const i_t num_fractional = fractional.size();
   std::vector<f_t> pseudo_cost_up(num_fractional);
   std::vector<f_t> pseudo_cost_down(num_fractional);
@@ -279,6 +344,7 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
 
   for (i_t k = 0; k < num_fractional; k++) {
     const i_t j = fractional[k];
+
     if (pseudo_cost_num_down[j] != 0) {
       pseudo_cost_down[k] = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
     } else {
@@ -290,6 +356,7 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     } else {
       pseudo_cost_up[k] = pseudo_cost_up_avg;
     }
+
     constexpr f_t eps = 1e-6;
     const f_t f_down  = solution[j] - std::floor(solution[j]);
     const f_t f_up    = std::ceil(solution[j]) - solution[j];
@@ -300,6 +367,7 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
   i_t branch_var = fractional[0];
   f_t max_score  = -1;
   i_t select     = -1;
+
   for (i_t k = 0; k < num_fractional; k++) {
     if (score[k] > max_score) {
       max_score  = score[k];
@@ -308,8 +376,193 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     }
   }
 
+  log.debug("Pseudocost branching on %d. Value %e. Score %e.\n",
+            branch_var,
+            solution[branch_var],
+            score[select]);
+
+  return branch_var;
+}
+
+template <typename i_t, typename f_t>
+i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
+  mip_node_t<i_t, f_t>* node_ptr,
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<variable_type_t>& var_types,
+  bnb_worker_data_t<i_t, f_t>* worker_data,
+  const bnb_stats_t<i_t, f_t>& bnb_stats,
+  f_t upper_bound,
+  logger_t& log)
+{
+  f_t start_time = bnb_stats.start_time;
+  i_t branch_var = fractional[0];
+  f_t max_score  = -1;
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
+             num_initialized_down,
+             num_initialized_up,
+             pseudo_cost_down_avg,
+             pseudo_cost_up_avg);
+
+  const int64_t bnb_total_lp_iter  = bnb_stats.total_lp_iters;
+  const int64_t bnb_nodes_explored = bnb_stats.nodes_explored;
+  const i_t bnb_lp_iter_per_node   = bnb_total_lp_iter / bnb_stats.nodes_explored;
+
+  i_t reliable_threshold = settings.reliability_branching_settings.reliable_threshold;
+  if (reliable_threshold < 0) {
+    const i_t max_threshold = settings.reliability_branching_settings.max_reliable_threshold;
+    const i_t min_threshold = settings.reliability_branching_settings.min_reliable_threshold;
+    const f_t iter_factor   = settings.reliability_branching_settings.bnb_lp_factor;
+    const i_t iter_offset   = settings.reliability_branching_settings.bnb_lp_offset;
+    const int64_t alpha     = iter_factor * bnb_total_lp_iter;
+    const int64_t max_iter  = alpha + settings.reliability_branching_settings.bnb_lp_offset;
+
+    f_t gamma          = (max_iter - sb_total_lp_iter) / (sb_total_lp_iter + 1.0);
+    gamma              = std::min(1.0, gamma);
+    gamma              = std::max((alpha - sb_total_lp_iter) / (sb_total_lp_iter + 1.0), gamma);
+    reliable_threshold = (1 - gamma) * min_threshold + gamma * max_threshold;
+    reliable_threshold = sb_total_lp_iter < max_iter ? reliable_threshold : 0;
+  }
+
+  std::vector<i_t> unreliable_list;
+  omp_mutex_t score_mutex;
+
+  for (auto j : fractional) {
+    if (pseudo_cost_num_down[j] < reliable_threshold ||
+        pseudo_cost_num_up[j] < reliable_threshold) {
+      unreliable_list.push_back(j);
+      continue;
+    }
+
+    f_t pc_up   = pseudo_cost_num_up[j] > 0 ? pseudo_cost_sum_up[j] / pseudo_cost_num_up[j]
+                                            : pseudo_cost_up_avg;
+    f_t pc_down = pseudo_cost_sum_down[j] > 0 ? pseudo_cost_sum_down[j] / pseudo_cost_num_down[j]
+                                              : pseudo_cost_down_avg;
+
+    constexpr f_t eps = 1e-6;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    f_t score         = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+  }
+
+  if (unreliable_list.empty()) {
+    log.printf(
+      "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
+
+    return branch_var;
+  }
+
+  const int num_tasks          = std::max(settings.reliability_branching_settings.num_tasks, 1);
+  const int task_priority      = settings.reliability_branching_settings.task_priority;
+  const i_t max_num_candidates = settings.reliability_branching_settings.max_num_candidates;
+  const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
+
+  assert(task_priority > 0);
+  assert(max_num_candidates > 0);
+  assert(num_candidates > 0);
+
+  log.printf(
+    "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n",
+    sb_total_lp_iter.load(),
+    bnb_total_lp_iter,
+    unreliable_list.size(),
+    num_tasks,
+    reliable_threshold);
+
+  // Shuffle the unreliable list so every variable has the same chance to be selected.
+  if (unreliable_list.size() > max_num_candidates) { worker_data->rng.shuffle(unreliable_list); }
+
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks)
+  for (i_t i = 0; i < num_candidates; ++i) {
+    const i_t j = unreliable_list[i];
+    std::lock_guard<omp_mutex_t> lock(pseudo_cost_mutex[j]);
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+    if (pseudo_cost_num_down[j] < reliable_threshold) {
+      // Do trial branching on the down branch
+      f_t obj = trial_branching(worker_data->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker_data->leaf_edge_norms,
+                                worker_data->basis_factors,
+                                worker_data->basic_list,
+                                worker_data->nonbasic_list,
+                                j,
+                                worker_data->leaf_problem.lower[j],
+                                std::floor(solution[j]),
+                                upper_bound,
+                                bnb_lp_iter_per_node,
+                                start_time,
+                                sb_total_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = obj - node_ptr->lower_bound;
+        f_t change_in_x   = solution[j] - std::floor(solution[j]);
+        pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_down[j]++;
+      }
+    }
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+    if (pseudo_cost_num_up[j] < reliable_threshold) {
+      f_t obj = trial_branching(worker_data->leaf_problem,
+                                settings,
+                                var_types,
+                                node_ptr->vstatus,
+                                worker_data->leaf_edge_norms,
+                                worker_data->basis_factors,
+                                worker_data->basic_list,
+                                worker_data->nonbasic_list,
+                                j,
+                                std::ceil(solution[j]),
+                                worker_data->leaf_problem.upper[j],
+                                upper_bound,
+                                bnb_lp_iter_per_node,
+                                start_time,
+                                sb_total_lp_iter);
+
+      if (!std::isnan(obj)) {
+        f_t change_in_obj = obj - node_ptr->lower_bound;
+        f_t change_in_x   = std::ceil(solution[j]) - solution[j];
+        pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
+        pseudo_cost_num_up[j]++;
+      }
+    }
+
+    if (toc(start_time) > settings.time_limit) { continue; }
+    f_t pc_up   = pseudo_cost_num_up[j] > 0 ? pseudo_cost_sum_up[j] / pseudo_cost_num_up[j]
+                                            : pseudo_cost_up_avg;
+    f_t pc_down = pseudo_cost_sum_down[j] > 0 ? pseudo_cost_sum_down[j] / pseudo_cost_num_down[j]
+                                              : pseudo_cost_down_avg;
+
+    constexpr f_t eps = 1e-6;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    f_t score         = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+
+    std::lock_guard<omp_mutex_t> score_lock(score_mutex);
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+    }
+  }
+
   log.printf(
-    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
+    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
 
   return branch_var;
 }
@@ -320,8 +573,6 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
                                            f_t lower_bound,
                                            logger_t& log)
 {
-  std::lock_guard<omp_mutex_t> lock(mutex);
-
   const i_t num_fractional = fractional.size();
   f_t estimate             = lower_bound;
 
@@ -333,7 +584,8 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
   for (i_t k = 0; k < num_fractional; k++) {
-    const i_t j          = fractional[k];
+    const i_t j = fractional[k];
+
     f_t pseudo_cost_down = 0;
     f_t pseudo_cost_up   = 0;
 
@@ -348,10 +600,12 @@ f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
     } else {
       pseudo_cost_up = pseudo_cost_up_avg;
     }
+
     constexpr f_t eps = 1e-6;
     const f_t f_down  = solution[j] - std::floor(solution[j]);
     const f_t f_up    = std::ceil(solution[j]) - solution[j];
-    estimate += std::min(pseudo_cost_down * f_down, pseudo_cost_up * f_up);
+    estimate +=
+      std::min(std::max(pseudo_cost_down * f_down, eps), std::max(pseudo_cost_up * f_up, eps));
   }
 
   log.printf("pseudocost estimate = %e\n", estimate);
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 4bab438fa..8d566483a 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -7,11 +7,14 @@
 
 #pragma once
 
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bnb_worker.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/types.hpp>
 #include <utilities/omp_helpers.hpp>
+#include <utilities/pcg.hpp>
 
 #include <omp.h>
 
@@ -24,7 +27,8 @@ class pseudo_costs_t {
     : pseudo_cost_sum_down(num_variables),
       pseudo_cost_sum_up(num_variables),
       pseudo_cost_num_down(num_variables),
-      pseudo_cost_num_up(num_variables)
+      pseudo_cost_num_up(num_variables),
+      pseudo_cost_mutex(num_variables)
   {
   }
 
@@ -36,12 +40,13 @@ class pseudo_costs_t {
     pseudo_cost_sum_up.assign(num_variables, 0);
     pseudo_cost_num_down.assign(num_variables, 0);
     pseudo_cost_num_up.assign(num_variables, 0);
+    pseudo_cost_mutex.resize(num_variables);
   }
 
   void initialized(i_t& num_initialized_down,
                    i_t& num_initialized_up,
                    f_t& pseudo_cost_down_avg,
-                   f_t& pseudo_cost_up_avg) const;
+                   f_t& pseudo_cost_up_avg);
 
   f_t obj_estimate(const std::vector<i_t>& fractional,
                    const std::vector<f_t>& solution,
@@ -52,17 +57,27 @@ class pseudo_costs_t {
                          const std::vector<f_t>& solution,
                          logger_t& log);
 
+  i_t reliable_variable_selection(mip_node_t<i_t, f_t>* node_ptr,
+                                  const std::vector<i_t>& fractional,
+                                  const std::vector<f_t>& solution,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                                  const std::vector<variable_type_t>& var_types,
+                                  bnb_worker_data_t<i_t, f_t>* worker_data,
+                                  const bnb_stats_t<i_t, f_t>& bnb_stats,
+                                  f_t upper_bound,
+                                  logger_t& log);
+
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);
-  std::vector<f_t> pseudo_cost_sum_up;
-  std::vector<f_t> pseudo_cost_sum_down;
-  std::vector<i_t> pseudo_cost_num_up;
-  std::vector<i_t> pseudo_cost_num_down;
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_up;
+  std::vector<omp_atomic_t<f_t>> pseudo_cost_sum_down;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_up;
+  std::vector<omp_atomic_t<i_t>> pseudo_cost_num_down;
   std::vector<f_t> strong_branch_down;
   std::vector<f_t> strong_branch_up;
-
-  omp_mutex_t mutex;
+  std::vector<omp_mutex_t> pseudo_cost_mutex;
   omp_atomic_t<i_t> num_strong_branches_completed = 0;
+  omp_atomic_t<int64_t> sb_total_lp_iter          = 0;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index d86f84c39..2fdaf0356 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -22,18 +22,69 @@ namespace cuopt::linear_programming::dual_simplex {
 
 template <typename i_t, typename f_t>
 struct diving_heuristics_settings_t {
-  i_t num_diving_workers = -1;
-
   // -1 automatic, 0 disabled, 1 enabled
   i_t line_search_diving = -1;
   i_t pseudocost_diving  = -1;
   i_t guided_diving      = -1;
   i_t coefficient_diving = -1;
 
-  i_t min_node_depth         = 10;
-  i_t node_limit             = 500;
+  // The minimum depth to start diving from.
+  i_t min_node_depth = 10;
+
+  // The maximum number of nodes when performing a dive.
+  i_t node_limit = 500;
+
+  // The maximum number of dual simplex iteration allowed
+  // in a single dive. This set in terms of the total number of
+  // iterations in the best-first threads.
   f_t iteration_limit_factor = 0.05;
-  i_t backtrack_limit        = 5;
+
+  // The maximum backtracking allowed.
+  i_t backtrack_limit = 5;
+};
+
+template <typename i_t, typename f_t>
+struct reliability_branching_settings_t {
+  // Enable or disable reliability branching
+  bool enable = false;
+
+  // Lower bound for the maximum number of LP iterations for a single trial branching
+  i_t lower_max_lp_iter = 10;
+
+  // Upper bound for the maximum number of LP iterations for a single trial branching
+  i_t upper_max_lp_iter = 500;
+
+  // Priority of the tasks created when running the trial branching in parallel.
+  // Set to 1 to have the same priority as the other tasks.
+  i_t task_priority = 5;
+
+  // The number of tasks spawned for performing strong branching.
+  i_t num_tasks = -1;
+
+  // The maximum number of candidates initialized by strong branching in a single
+  // node
+  i_t max_num_candidates = 100;
+
+  // Define the maximum number of iteration spent in strong branching.
+  // Let `bnb_lp_iter` = total number of iterations in B&B, then
+  // `max iter in strong branching = bnb_lp_factor * bnb_lp_iter + bnb_lp_offset`.
+  // This is used for determining the `reliable_threshold`.
+  f_t bnb_lp_factor = 0.5;
+  i_t bnb_lp_offset = 100000;
+
+  // Threshold for determining for the number of pseudocost updates. Used for
+  // determining if the pseudocost is reliable or not.
+  // - <0: automatic
+  // - 0: disable (use pseudocost branching instead)
+  // - >0: will use the value for the threshold.
+  i_t reliable_threshold = -1;
+
+  // Maximum and minimum points of the curve to determine the value
+  // of the `reliable_threshold` based on the current number of LP
+  // iterations in strong branching and B&B.
+  // Only used when `reliable_threshold` is negative
+  i_t max_reliable_threshold = 5;
+  i_t min_reliable_threshold = 1;
 };
 
 template <typename i_t, typename f_t>
@@ -87,14 +138,12 @@ struct simplex_solver_settings_t {
       iteration_log_frequency(1000),
       first_iteration_log(2),
       num_threads(omp_get_max_threads() - 1),
-      num_bfs_workers(std::max(num_threads / 4, 1)),
       random_seed(0),
       inside_mip(0),
       solution_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       concurrent_halt(nullptr)
   {
-    diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1);
   }
 
   void set_log(bool logging) const { log.log = logging; }
@@ -154,9 +203,10 @@ struct simplex_solver_settings_t {
   i_t first_iteration_log;         // number of iterations to log at beginning of solve
   i_t num_threads;                 // number of threads to use
   i_t random_seed;                 // random seed
-  i_t num_bfs_workers;             // number of threads dedicated to the best-first search
 
   diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
+  reliability_branching_settings_t<i_t, f_t>
+    reliability_branching_settings;  // Settings for reliability branching
 
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
diff --git a/cpp/src/dual_simplex/solution.hpp b/cpp/src/dual_simplex/solution.hpp
index d1d745cbd..4e8bcaf75 100644
--- a/cpp/src/dual_simplex/solution.hpp
+++ b/cpp/src/dual_simplex/solution.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -72,8 +72,8 @@ class mip_solution_t {
   std::vector<f_t> x;
   f_t objective;
   f_t lower_bound;
-  i_t nodes_explored;
-  i_t simplex_iterations;
+  int64_t nodes_explored;
+  int64_t simplex_iterations;
   bool has_incumbent;
 };
 
diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp
index 1f31a757d..cec1eb92d 100644
--- a/cpp/src/dual_simplex/solve.cpp
+++ b/cpp/src/dual_simplex/solve.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -599,7 +599,7 @@ i_t solve(const user_problem_t<i_t, f_t>& problem,
   if (is_mip(problem) && !settings.relaxation) {
     branch_and_bound_t branch_and_bound(problem, settings);
     mip_solution_t<i_t, f_t> mip_solution(problem.num_cols);
-    mip_status_t mip_status = branch_and_bound.solve(mip_solution);
+    mip_status_t mip_status = branch_and_bound.solve(mip_solution, mip_solve_mode_t::BNB_PARALLEL);
     if (mip_status == mip_status_t::OPTIMAL) {
       status = 0;
     } else {
@@ -638,7 +638,7 @@ i_t solve_mip_with_guess(const user_problem_t<i_t, f_t>& problem,
   if (is_mip(problem)) {
     branch_and_bound_t branch_and_bound(problem, settings);
     branch_and_bound.set_initial_guess(guess);
-    mip_status_t mip_status = branch_and_bound.solve(solution);
+    mip_status_t mip_status = branch_and_bound.solve(solution, mip_solve_mode_t::BNB_PARALLEL);
     if (mip_status == mip_status_t::OPTIMAL) {
       status = 0;
     } else {
diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu
index 7456b59ed..041447204 100644
--- a/cpp/src/mip/diversity/lns/rins.cu
+++ b/cpp/src/mip/diversity/lns/rins.cu
@@ -33,19 +33,6 @@ rins_t<i_t, f_t>::rins_t(mip_solver_context_t<i_t, f_t>& context_,
   time_limit = settings.default_time_limit;
 }
 
-template <typename i_t, typename f_t>
-rins_thread_t<i_t, f_t>::~rins_thread_t()
-{
-  this->request_termination();
-}
-
-template <typename i_t, typename f_t>
-void rins_thread_t<i_t, f_t>::run_worker()
-{
-  raft::common::nvtx::range fun_scope("Running RINS");
-  rins_ptr->run_rins();
-}
-
 template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::new_best_incumbent_callback(const std::vector<f_t>& solution)
 {
@@ -56,19 +43,16 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objective)
 {
   if (!enabled) return;
-
   node_count++;
 
   if (node_count - node_count_at_last_improvement < settings.nodes_after_later_improvement) return;
-
   if (node_count - node_count_at_last_rins > settings.node_freq) {
-    // opportunistic early test w/ atomic to avoid having to take the lock
-    if (!rins_thread->cpu_thread_done) return;
-    std::lock_guard<std::mutex> lock(rins_mutex);
-    if (rins_thread->cpu_thread_done && dm.population.current_size() > 0 &&
-        dm.population.is_feasible()) {
-      lp_optimal_solution = solution;
-      rins_thread->start_cpu_solver();
+    // If multiple threads call this function, only the first one will launch
+    // a new task for RINS. The other are ignored.
+    if (!launch_new_task.exchange(false)) return;
+    if (dm.population.current_size() > 0 && dm.population.is_feasible()) {
+#pragma omp task
+      run_rins(solution);
     }
   }
 }
@@ -76,8 +60,6 @@ void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objec
 template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::enable()
 {
-  rins_thread              = std::make_unique<rins_thread_t<i_t, f_t>>();
-  rins_thread->rins_ptr    = this;
   seed                     = cuopt::seed_generator::get_seed();
   problem_copy             = std::make_unique<problem_t<i_t, f_t>>(*problem_ptr);
   problem_copy->handle_ptr = &rins_handle;
@@ -85,19 +67,16 @@ void rins_t<i_t, f_t>::enable()
 }
 
 template <typename i_t, typename f_t>
-void rins_t<i_t, f_t>::stop_rins()
+void rins_t<i_t, f_t>::run_rins(std::vector<f_t> lp_optimal_solution)
 {
-  enabled = false;
-  if (rins_thread) rins_thread->request_termination();
-  rins_thread.reset();
-}
+  raft::common::nvtx::range fun_scope("Running RINS");
 
-template <typename i_t, typename f_t>
-void rins_t<i_t, f_t>::run_rins()
-{
   if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
 
-  if (!dm.population.is_feasible()) return;
+  if (!dm.population.is_feasible()) {
+    launch_new_task = true;
+    return;
+  }
 
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
@@ -127,10 +106,13 @@ void rins_t<i_t, f_t>::run_rins()
   cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch");
 
   cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible");
-  if (!best_sol.get_feasible()) { return; }
+  if (!best_sol.get_feasible()) {
+    launch_new_task = true;
+    return;
+  }
 
   i_t sol_size_before_rins = best_sol.assignment.size();
-  auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
+  auto lp_opt_device       = cuopt::device_copy(lp_optimal_solution, rins_handle.get_stream());
   cuopt_assert(lp_opt_device.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables, "Assignment size mismatch");
 
@@ -151,6 +133,7 @@ void rins_t<i_t, f_t>::run_rins()
   // abort if the fractional ratio is too low
   if (fractional_ratio < settings.min_fractional_ratio) {
     CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting");
+    launch_new_task = true;
     return;
   }
 
@@ -175,6 +158,7 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (n_to_fix == 0) {
     CUOPT_LOG_DEBUG("RINS no variables to fix");
+    launch_new_task = true;
     return;
   }
 
@@ -220,21 +204,25 @@ void rins_t<i_t, f_t>::run_rins()
   mip_solver_context_t<i_t, f_t> fj_context(
     &rins_handle, &fixed_problem, context.settings, context.scaling);
   fj_t<i_t, f_t> fj(fj_context);
+  fj_t<i_t, f_t>* fj_ptr = &fj;
   solution_t<i_t, f_t> fj_solution(fixed_problem);
   fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   std::vector<f_t> default_weights(fixed_problem.n_constraints, 1.);
-  cpu_fj_thread_t<i_t, f_t> cpu_fj_thread;
-  cpu_fj_thread.fj_cpu             = fj.create_cpu_climber(fj_solution,
-                                               default_weights,
-                                               default_weights,
-                                               0.,
-                                               context.preempt_heuristic_solver_,
-                                               fj_settings_t{},
-                                               true);
-  cpu_fj_thread.fj_ptr             = &fj;
-  cpu_fj_thread.fj_cpu->log_prefix = "[RINS] ";
-  cpu_fj_thread.time_limit         = time_limit;
-  cpu_fj_thread.start_cpu_solver();
+
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu =
+    fj.create_cpu_climber(fj_solution,
+                          default_weights,
+                          default_weights,
+                          0.,
+                          context.preempt_heuristic_solver_,
+                          fj_settings_t{},
+                          true);
+  auto fj_cpu_ptr = fj_cpu.get();
+
+  fj_cpu->log_prefix = "[RINS] ";
+
+#pragma omp task
+  fj_ptr->cpu_solve(*fj_cpu_ptr, time_limit);
 
   f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound()
                                                  : -std::numeric_limits<f_t>::infinity();
@@ -256,26 +244,21 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
   branch_and_bound_settings.relative_mip_gap_tol =
     std::min(current_mip_gap, (f_t)settings.target_mip_gap);
-  branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-  branch_and_bound_settings.num_threads     = 2;
-  branch_and_bound_settings.num_bfs_workers = 1;
-
-  // In the future, let RINS use all the diving heuristics. For now,
-  // restricting to guided diving.
-  branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-  branch_and_bound_settings.diving_settings.line_search_diving = 0;
-  branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-  branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
-  branch_and_bound_settings.log.log                            = false;
-  branch_and_bound_settings.log.log_prefix                     = "[RINS] ";
+  branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+  branch_and_bound_settings.num_threads = 1;
+  branch_and_bound_settings.reliability_branching_settings.enable = false;
+  branch_and_bound_settings.log.log                               = false;
+  branch_and_bound_settings.log.log_prefix                        = "[RINS] ";
   branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector<f_t>& solution,
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
   };
+
   dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(branch_and_bound_problem,
                                                               branch_and_bound_settings);
   branch_and_bound.set_initial_guess(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
-  branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
+  branch_and_bound_status = branch_and_bound.solve(
+    branch_and_bound_solution, dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED);
 
   if (!std::isnan(branch_and_bound_solution.objective)) {
     CUOPT_LOG_DEBUG("RINS submip solution found. Objective %.16e. Status %d",
@@ -305,15 +288,13 @@ void rins_t<i_t, f_t>::run_rins()
     time_limit = std::min(time_limit + 2, settings.max_time_limit);
   }
 
-  cpu_fj_thread.stop_cpu_solver();
-  bool fj_solution_found = cpu_fj_thread.wait_for_cpu_solver();
-  CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", cpu_fj_thread.fj_cpu->iterations);
-  if (fj_solution_found) {
-    CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e",
-                    cpu_fj_thread.fj_cpu->h_best_objective);
-    rins_solution_queue.push_back(cpu_fj_thread.fj_cpu->h_best_assignment);
+#pragma omp taskwait
+
+  CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations);
+  if (fj_cpu->feasible_found) {
+    CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", fj_cpu->h_best_objective);
+    rins_solution_queue.push_back(fj_cpu->h_best_assignment);
   }
-  // Thread will be automatically terminated and joined by destructor
 
   bool improvement_found = false;
   for (auto& fixed_sol : rins_solution_queue) {
@@ -348,15 +329,14 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (improvement_found) total_success++;
   CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success);
+  launch_new_task = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT
-template class rins_thread_t<int, float>;
 template class rins_t<int, float>;
 #endif
 
 #if MIP_INSTANTIATE_DOUBLE
-template class rins_thread_t<int, double>;
 template class rins_t<int, double>;
 #endif
 
diff --git a/cpp/src/mip/diversity/lns/rins.cuh b/cpp/src/mip/diversity/lns/rins.cuh
index 3d16875ce..65808325b 100644
--- a/cpp/src/mip/diversity/lns/rins.cuh
+++ b/cpp/src/mip/diversity/lns/rins.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
  * reserved. SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,16 +20,10 @@
 #include <mip/diversity/population.cuh>
 #include <mip/solution/solution.cuh>
 #include <mip/solver.cuh>
-#include <mip/utilities/cpu_worker_thread.cuh>
 
+#include <utilities/omp_helpers.hpp>
 #include <utilities/timer.hpp>
 
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include <random>
-#include <string>
-#include <thread>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
@@ -55,18 +49,6 @@ struct rins_settings_t {
 template <typename i_t, typename f_t>
 class rins_t;
 
-template <typename i_t, typename f_t>
-struct rins_thread_t : public cpu_worker_thread_base_t<rins_thread_t<i_t, f_t>> {
-  ~rins_thread_t();
-
-  void run_worker();
-  void on_terminate() {}
-  void on_start() {}
-  bool get_result() { return true; }
-
-  rins_t<i_t, f_t>* rins_ptr{nullptr};
-};
-
 template <typename i_t, typename f_t>
 class rins_t {
  public:
@@ -77,9 +59,10 @@ class rins_t {
   void node_callback(const std::vector<f_t>& solution, f_t objective);
   void new_best_incumbent_callback(const std::vector<f_t>& solution);
   void enable();
-  void stop_rins();
 
-  void run_rins();
+  // We need to copy the vector since many tasks may be created
+  // for running this routine.
+  void run_rins(std::vector<f_t> lp_optimal_solution);
 
   mip_solver_context_t<i_t, f_t>& context;
   problem_t<i_t, f_t>* problem_ptr;
@@ -91,23 +74,19 @@ class rins_t {
   std::unique_ptr<problem_t<i_t, f_t>> problem_copy;
   raft::handle_t rins_handle;
 
-  std::vector<f_t> lp_optimal_solution;
-
   f_t fixrate{0.5};
   i_t total_calls{0};
   i_t total_success{0};
   f_t time_limit{10.};
   i_t seed;
 
-  std::atomic<bool> enabled{false};
-  std::atomic<f_t> lower_bound{0.};
-
-  std::atomic<i_t> node_count{0};
-  std::atomic<i_t> node_count_at_last_rins{0};
-  std::atomic<i_t> node_count_at_last_improvement{0};
-  std::mutex rins_mutex;
+  omp_atomic_t<bool> enabled{false};
+  omp_atomic_t<f_t> lower_bound{0.};
 
-  std::unique_ptr<rins_thread_t<i_t, f_t>> rins_thread;
+  omp_atomic_t<i_t> node_count{0};
+  omp_atomic_t<i_t> node_count_at_last_rins{0};
+  omp_atomic_t<i_t> node_count_at_last_improvement{0};
+  omp_atomic_t<bool> launch_new_task{true};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index 82670437a..340e0b9ff 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -101,16 +101,9 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.print_presolve_stats = false;
       branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-      branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
-      branch_and_bound_settings.num_threads     = 2;
-      branch_and_bound_settings.num_bfs_workers = 1;
-
-      // In the future, let SubMIP use all the diving heuristics. For now,
-      // restricting to guided diving.
-      branch_and_bound_settings.diving_settings.num_diving_workers = 1;
-      branch_and_bound_settings.diving_settings.line_search_diving = 0;
-      branch_and_bound_settings.diving_settings.coefficient_diving = 0;
-      branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
+      branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+      branch_and_bound_settings.num_threads = 1;
+      branch_and_bound_settings.reliability_branching_settings.enable = false;
       branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
@@ -120,7 +113,8 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.log.log = false;
       dual_simplex::branch_and_bound_t<i_t, f_t> branch_and_bound(branch_and_bound_problem,
                                                                   branch_and_bound_settings);
-      branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution);
+      branch_and_bound_status = branch_and_bound.solve(
+        branch_and_bound_solution, dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED);
       if (solution_vector.size() > 0) {
         cuopt_assert(fixed_assignment.size() == branch_and_bound_solution.x.size(),
                      "Assignment size mismatch");
diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
index 36d50af32..5e3bfd123 100644
--- a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
+++ b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -210,7 +210,7 @@ class fj_t {
     std::atomic<bool>& preemption_flag,
     fj_settings_t settings = fj_settings_t{},
     bool randomize_params  = false);
-  bool cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
+  void cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
                  f_t time_limit = +std::numeric_limits<f_t>::infinity());
   i_t alloc_max_climbers(i_t desired_climbers);
   void resize_vectors(const raft::handle_t* handle_ptr);
diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cu b/cpp/src/mip/feasibility_jump/fj_cpu.cu
index 8d534dfff..14f0271af 100644
--- a/cpp/src/mip/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip/feasibility_jump/fj_cpu.cu
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -934,7 +934,7 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
 }
 
 template <typename i_t, typename f_t>
-bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
+void fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
 {
   raft::common::nvtx::range scope("fj_cpu");
 
@@ -1050,50 +1050,14 @@ bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_l
   CUOPT_LOG_TRACE("\n=== Final Timing Statistics ===\n");
   print_timing_stats(fj_cpu);
 #endif
-
-  return fj_cpu.feasible_found;
-}
-
-template <typename i_t, typename f_t>
-cpu_fj_thread_t<i_t, f_t>::~cpu_fj_thread_t()
-{
-  this->request_termination();
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::run_worker()
-{
-  bool solution_found   = fj_ptr->cpu_solve(*fj_cpu, time_limit);
-  cpu_fj_solution_found = solution_found;
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::on_terminate()
-{
-  if (fj_cpu) fj_cpu->halted = true;
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::on_start()
-{
-  cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null");
-  fj_cpu->halted = false;
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::stop_cpu_solver()
-{
-  fj_cpu->halted = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT
 template class fj_t<int, float>;
-template class cpu_fj_thread_t<int, float>;
 #endif
 
 #if MIP_INSTANTIATE_DOUBLE
 template class fj_t<int, double>;
-template class cpu_fj_thread_t<int, double>;
 #endif
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cuh b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
index 4b9cfc0cc..1d2d63ea7 100644
--- a/cpp/src/mip/feasibility_jump/fj_cpu.cuh
+++ b/cpp/src/mip/feasibility_jump/fj_cpu.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -8,16 +8,12 @@
 #pragma once
 
 #include <atomic>
-#include <condition_variable>
 #include <functional>
-#include <limits>
-#include <mutex>
-#include <thread>
 #include <unordered_set>
 #include <vector>
 
 #include <mip/feasibility_jump/feasibility_jump.cuh>
-#include <mip/utilities/cpu_worker_thread.cuh>
+#include <utilities/omp_helpers.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -90,7 +86,7 @@ struct fj_cpu_climber_t {
 
   // vector<bool> is actually likely beneficial here since we're memory bound
   std::vector<bool> flip_move_computed;
-  ;
+
   // CSR nnz offset -> (delta, score)
   std::vector<std::pair<f_t, fj_staged_score_t>> cached_mtm_moves;
 
@@ -119,21 +115,4 @@ struct fj_cpu_climber_t {
   std::atomic<bool>& preemption_flag;
 };
 
-template <typename i_t, typename f_t>
-struct cpu_fj_thread_t : public cpu_worker_thread_base_t<cpu_fj_thread_t<i_t, f_t>> {
-  ~cpu_fj_thread_t();
-
-  void run_worker();
-  void on_terminate();
-  void on_start();
-  bool get_result() { return cpu_fj_solution_found; }
-
-  void stop_cpu_solver();
-
-  std::atomic<bool> cpu_fj_solution_found{false};
-  f_t time_limit{+std::numeric_limits<f_t>::infinity()};
-  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu;
-  fj_t<i_t, f_t>* fj_ptr{nullptr};
-};
-
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/local_search/local_search.cu b/cpp/src/mip/local_search/local_search.cu
index ecd277065..805057485 100644
--- a/cpp/src/mip/local_search/local_search.cu
+++ b/cpp/src/mip/local_search/local_search.cu
@@ -20,8 +20,6 @@
 
 #include <cuda_profiler_api.h>
 
-#include <future>
-
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -44,13 +42,9 @@ local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context
     rng(cuopt::seed_generator::get_seed()),
     problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr)
 {
-  for (auto& cpu_fj : ls_cpu_fj) {
-    cpu_fj.fj_ptr = &fj;
-  }
-  for (auto& cpu_fj : scratch_cpu_fj) {
-    cpu_fj.fj_ptr = &fj;
-  }
-  scratch_cpu_fj_on_lp_opt.fj_ptr = &fj;
+  fj_ptr = &fj;
+  ls_cpu_fj.resize(num_threads_for_cpu_fj);
+  scratch_cpu_fj.resize(num_threads_for_scratch_cpu_fj);
 }
 
 static double local_search_best_obj       = std::numeric_limits<double>::max();
@@ -72,16 +66,16 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
   i_t counter = 0;
   for (auto& cpu_fj : scratch_cpu_fj) {
     if (counter > 0) solution.assign_random_within_bounds(0.4);
-    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
-                                                      default_weights,
-                                                      default_weights,
-                                                      0.,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      /*randomize=*/counter > 0);
-
-    cpu_fj.fj_cpu->log_prefix           = "******* scratch " + std::to_string(counter) + ": ";
-    cpu_fj.fj_cpu->improvement_callback = [&population](f_t obj, const std::vector<f_t>& h_vec) {
+    cpu_fj = fj_ptr->create_cpu_climber(solution,
+                                        default_weights,
+                                        default_weights,
+                                        0.,
+                                        context.preempt_heuristic_solver_,
+                                        fj_settings_t{},
+                                        /*randomize=*/counter > 0);
+
+    cpu_fj->log_prefix           = "******* scratch " + std::to_string(counter) + ": ";
+    cpu_fj->improvement_callback = [&population](f_t obj, const std::vector<f_t>& h_vec) {
       population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
       if (obj < local_search_best_obj) {
         CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g",
@@ -95,8 +89,9 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
     counter++;
   };
 
-  for (auto& cpu_fj : scratch_cpu_fj) {
-    cpu_fj.start_cpu_solver();
+  for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
+#pragma omp task
+    fj_ptr->cpu_solve(*scratch_cpu_fj[i]);
   }
 }
 
@@ -112,34 +107,37 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
   solution_lp.copy_new_assignment(
     host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream()));
   solution_lp.round_random_nearest(500);
-  scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber(
+  scratch_cpu_fj_on_lp_opt = fj.create_cpu_climber(
     solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_);
-  scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: ";
-  scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback =
-    [this, &population](f_t obj, const std::vector<f_t>& h_vec) {
-      population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
-      if (obj < local_search_best_obj) {
-        CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
-                        context.problem_ptr->get_user_obj_from_solver_obj(obj),
-                        context.problem_ptr->get_user_obj_from_solver_obj(
-                          population.is_feasible() ? population.best_feasible().get_objective()
-                                                   : std::numeric_limits<f_t>::max()));
-        local_search_best_obj = obj;
-      }
-    };
+  scratch_cpu_fj_on_lp_opt->log_prefix           = "******* scratch on LP optimal: ";
+  scratch_cpu_fj_on_lp_opt->improvement_callback = [this, &population](
+                                                     f_t obj, const std::vector<f_t>& h_vec) {
+    population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
+    if (obj < local_search_best_obj) {
+      CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
+                      context.problem_ptr->get_user_obj_from_solver_obj(obj),
+                      context.problem_ptr->get_user_obj_from_solver_obj(
+                        population.is_feasible() ? population.best_feasible().get_objective()
+                                                 : std::numeric_limits<f_t>::max()));
+      local_search_best_obj = obj;
+    }
+  };
 
   // default weights
   cudaDeviceSynchronize();
-  scratch_cpu_fj_on_lp_opt.start_cpu_solver();
+
+#pragma omp task
+  fj_ptr->cpu_solve(*scratch_cpu_fj_on_lp_opt);
 }
 
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
 {
   for (auto& cpu_fj : scratch_cpu_fj) {
-    cpu_fj.request_termination();
+    cpu_fj->halted = true;
   }
-  scratch_cpu_fj_on_lp_opt.request_termination();
+
+  if (scratch_cpu_fj_on_lp_opt) scratch_cpu_fj_on_lp_opt->halted = true;
 }
 
 template <typename i_t, typename f_t>
@@ -155,20 +153,22 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   auto h_weights          = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream());
   auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream());
   for (auto& cpu_fj : ls_cpu_fj) {
-    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
-                                                      h_weights,
-                                                      h_weights,
-                                                      h_objective_weight,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      true);
+    cpu_fj = fj_ptr->create_cpu_climber(solution,
+                                        h_weights,
+                                        h_weights,
+                                        h_objective_weight,
+                                        context.preempt_heuristic_solver_,
+                                        fj_settings_t{},
+                                        true);
   }
 
   auto solution_copy = solution;
 
   // Start CPU solver in background thread
-  for (auto& cpu_fj : ls_cpu_fj) {
-    cpu_fj.start_cpu_solver();
+  for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+    auto ptr = &ls_cpu_fj[i];
+#pragma omp task depend(inout : ptr)
+    fj_ptr->cpu_solve(*ls_cpu_fj[i]);
   }
 
   // Run GPU solver and measure execution time
@@ -176,9 +176,8 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   in_fj.settings.time_limit = timer.remaining_time();
   in_fj.solve(solution);
 
-  // Stop CPU solver
-  for (auto& cpu_fj : ls_cpu_fj) {
-    cpu_fj.stop_cpu_solver();
+  for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+    ls_cpu_fj[i]->halted = true;
   }
 
   auto gpu_fj_end        = std::chrono::high_resolution_clock::now();
@@ -188,13 +187,15 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
 
   f_t best_cpu_obj = std::numeric_limits<f_t>::max();
   // // Wait for CPU solver to finish
-  for (auto& cpu_fj : ls_cpu_fj) {
-    bool cpu_sol_found = cpu_fj.wait_for_cpu_solver();
-    if (cpu_sol_found) {
-      f_t cpu_obj = cpu_fj.fj_cpu->h_best_objective;
+  for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+    auto ptr = &ls_cpu_fj[i];
+#pragma omp taskwait depend(inout : ptr)
+
+    if (ls_cpu_fj[i]->feasible_found) {
+      f_t cpu_obj = ls_cpu_fj[i]->h_best_objective;
       if (cpu_obj < best_cpu_obj) {
         best_cpu_obj = cpu_obj;
-        solution_cpu.copy_new_assignment(cpu_fj.fj_cpu->h_best_assignment);
+        solution_cpu.copy_new_assignment(ls_cpu_fj[i]->h_best_assignment);
         solution_cpu.compute_feasibility();
       }
     }
diff --git a/cpp/src/mip/local_search/local_search.cuh b/cpp/src/mip/local_search/local_search.cuh
index 6fdf4ac72..5264c9372 100644
--- a/cpp/src/mip/local_search/local_search.cuh
+++ b/cpp/src/mip/local_search/local_search.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -13,13 +13,9 @@
 #include <mip/local_search/line_segment_search/line_segment_search.cuh>
 #include <mip/solution/solution.cuh>
 #include <mip/solver.cuh>
-#include <utilities/timer.hpp>
 
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
+#include <utilities/omp_helpers.hpp>
+#include <utilities/timer.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -117,9 +113,13 @@ class local_search_t {
   feasibility_pump_t<i_t, f_t> fp;
   std::mt19937 rng;
 
-  std::array<cpu_fj_thread_t<i_t, f_t>, 8> ls_cpu_fj;
-  std::array<cpu_fj_thread_t<i_t, f_t>, 1> scratch_cpu_fj;
-  cpu_fj_thread_t<i_t, f_t> scratch_cpu_fj_on_lp_opt;
+  const i_t num_threads_for_cpu_fj         = 8;
+  const i_t num_threads_for_scratch_cpu_fj = 1;
+
+  fj_t<i_t, f_t>* fj_ptr;
+  std::vector<std::unique_ptr<fj_cpu_climber_t<i_t, f_t>>> ls_cpu_fj;
+  std::vector<std::unique_ptr<fj_cpu_climber_t<i_t, f_t>>> scratch_cpu_fj;
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> scratch_cpu_fj_on_lp_opt;
   problem_t<i_t, f_t> problem_with_objective_cut;
   bool cutting_plane_added_for_active_run{false};
 };
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 08e1806b9..adb191f1f 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -87,6 +87,8 @@ struct branch_and_bound_solution_helper_t {
 template <typename i_t, typename f_t>
 solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 {
+  solution_t<i_t, f_t> sol(*context.problem_ptr);
+
   if (context.settings.get_mip_callbacks().size() > 0) {
     for (auto callback : context.settings.get_mip_callbacks()) {
       callback->template setup<f_t>(context.problem_ptr->original_problem_ptr->get_n_variables());
@@ -101,7 +103,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   if (context.problem_ptr->empty) {
     CUOPT_LOG_INFO("Problem fully reduced in presolve");
-    solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
     context.problem_ptr->post_process_solution(sol);
     return sol;
@@ -112,14 +113,12 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
   bool presolve_success = dm.run_presolve(timer_.remaining_time());
   if (!presolve_success) {
     CUOPT_LOG_INFO("Problem proven infeasible in presolve");
-    solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
     context.problem_ptr->post_process_solution(sol);
     return sol;
   }
   if (context.problem_ptr->empty) {
     CUOPT_LOG_INFO("Problem full reduced in presolve");
-    solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
     context.problem_ptr->post_process_solution(sol);
     return sol;
@@ -147,97 +146,107 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     return sol;
   }
 
-  namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-  std::future<dual_simplex::mip_status_t> branch_and_bound_status_future;
+  namespace dual_simplex               = cuopt::linear_programming::dual_simplex;
+  dual_simplex::mip_status_t bb_status = dual_simplex::mip_status_t::UNSET;
   dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(context.problem_ptr->handle_ptr);
   dual_simplex::simplex_solver_settings_t<i_t, f_t> branch_and_bound_settings;
   std::unique_ptr<dual_simplex::branch_and_bound_t<i_t, f_t>> branch_and_bound;
   branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings);
   dual_simplex::mip_solution_t<i_t, f_t> branch_and_bound_solution(1);
 
-  if (!context.settings.heuristics_only) {
-    // Convert the presolved problem to dual_simplex::user_problem_t
-    op_problem_.get_host_user_problem(branch_and_bound_problem);
-    // Resize the solution now that we know the number of columns/variables
-    branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
-
-    // Fill in the settings for branch and bound
-    branch_and_bound_settings.time_limit           = timer_.remaining_time();
-    branch_and_bound_settings.print_presolve_stats = false;
-    branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
-    branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
-    branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
-
-    if (context.settings.num_cpu_threads < 0) {
-      branch_and_bound_settings.num_threads = omp_get_max_threads() - 1;
-    } else {
-      branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
-    }
+  i_t num_threads = 0;
+  if (context.settings.num_cpu_threads < 0) {
+    num_threads = omp_get_max_threads();
+  } else {
+    num_threads = std::max(1, context.settings.num_cpu_threads);
+  }
 
-    i_t num_threads                           = branch_and_bound_settings.num_threads;
-    i_t num_bfs_workers                       = std::max(1, num_threads / 4);
-    i_t num_diving_workers                    = std::max(1, num_threads - num_bfs_workers);
-    branch_and_bound_settings.num_bfs_workers = num_bfs_workers;
-    branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers;
-
-    // Set the branch and bound -> primal heuristics callback
-    branch_and_bound_settings.solution_callback =
-      std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
-                &solution_helper,
-                std::placeholders::_1,
-                std::placeholders::_2);
-    branch_and_bound_settings.heuristic_preemption_callback = std::bind(
-      &branch_and_bound_solution_helper_t<i_t, f_t>::preempt_heuristic_solver, &solution_helper);
-
-    branch_and_bound_settings.set_simplex_solution_callback =
-      std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::set_simplex_solution,
-                &solution_helper,
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3);
-
-    branch_and_bound_settings.node_processed_callback =
-      std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::node_processed_callback,
-                &solution_helper,
-                std::placeholders::_1,
-                std::placeholders::_2);
-
-    // Create the branch and bound object
-    branch_and_bound = std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(
-      branch_and_bound_problem, branch_and_bound_settings);
-    context.branch_and_bound_ptr = branch_and_bound.get();
-    branch_and_bound->set_concurrent_lp_root_solve(true);
-
-    // Set the primal heuristics -> branch and bound callback
-    context.problem_ptr->branch_and_bound_callback =
-      std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_new_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1);
-    context.problem_ptr->set_root_relaxation_solution_callback =
-      std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
-                branch_and_bound.get(),
-                std::placeholders::_1,
-                std::placeholders::_2,
-                std::placeholders::_3,
-                std::placeholders::_4,
-                std::placeholders::_5,
-                std::placeholders::_6);
-
-    // Fork a thread for branch and bound
-    // std::async and std::future allow us to get the return value of bb::solve()
-    // without having to manually manage the thread
-    // std::future.get() performs a join() operation to wait until the return status is available
-    branch_and_bound_status_future = std::async(std::launch::async,
-                                                &dual_simplex::branch_and_bound_t<i_t, f_t>::solve,
-                                                branch_and_bound.get(),
-                                                std::ref(branch_and_bound_solution));
+#pragma omp parallel num_threads(num_threads)
+  {
+#pragma omp master
+    {
+      if (!context.settings.heuristics_only) {
+        // Convert the presolved problem to dual_simplex::user_problem_t
+        op_problem_.get_host_user_problem(branch_and_bound_problem);
+        // Resize the solution now that we know the number of columns/variables
+        branch_and_bound_solution.resize(branch_and_bound_problem.num_cols);
+
+        // Fill in the settings for branch and bound
+        branch_and_bound_settings.time_limit           = timer_.remaining_time();
+        branch_and_bound_settings.num_threads          = std::max(num_threads - 3, 1);
+        branch_and_bound_settings.print_presolve_stats = false;
+        branch_and_bound_settings.absolute_mip_gap_tol =
+          context.settings.tolerances.absolute_mip_gap;
+        branch_and_bound_settings.relative_mip_gap_tol =
+          context.settings.tolerances.relative_mip_gap;
+        branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance;
+        branch_and_bound_settings.reliability_branching_settings.enable =
+          solver_settings_.reliability_branching;
+
+        dual_simplex::mip_solve_mode_t solve_mode =
+          branch_and_bound_settings.num_threads == 1
+            ? dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED
+            : dual_simplex::mip_solve_mode_t::BNB_PARALLEL;
+
+        // Set the branch and bound -> primal heuristics callback
+        branch_and_bound_settings.solution_callback =
+          std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
+                    &solution_helper,
+                    std::placeholders::_1,
+                    std::placeholders::_2);
+        branch_and_bound_settings.heuristic_preemption_callback =
+          std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::preempt_heuristic_solver,
+                    &solution_helper);
+
+        branch_and_bound_settings.set_simplex_solution_callback =
+          std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::set_simplex_solution,
+                    &solution_helper,
+                    std::placeholders::_1,
+                    std::placeholders::_2,
+                    std::placeholders::_3);
+
+        branch_and_bound_settings.node_processed_callback =
+          std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::node_processed_callback,
+                    &solution_helper,
+                    std::placeholders::_1,
+                    std::placeholders::_2);
+
+        // Create the branch and bound object
+        branch_and_bound = std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(
+          branch_and_bound_problem, branch_and_bound_settings);
+        context.branch_and_bound_ptr = branch_and_bound.get();
+        branch_and_bound->set_concurrent_lp_root_solve(true);
+
+        // Set the primal heuristics -> branch and bound callback
+        context.problem_ptr->branch_and_bound_callback =
+          std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_new_solution,
+                    branch_and_bound.get(),
+                    std::placeholders::_1);
+        context.problem_ptr->set_root_relaxation_solution_callback =
+          std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_root_relaxation_solution,
+                    branch_and_bound.get(),
+                    std::placeholders::_1,
+                    std::placeholders::_2,
+                    std::placeholders::_3,
+                    std::placeholders::_4,
+                    std::placeholders::_5,
+                    std::placeholders::_6);
+
+#pragma omp task
+        {
+          bb_status = branch_and_bound->solve(branch_and_bound_solution, solve_mode);
+        }
+      }
+
+#pragma omp task
+      {
+        // Start the primal heuristics
+        sol = dm.run_solver();
+      }
+    }
   }
 
-  // Start the primal heuristics
-  auto sol = dm.run_solver();
   if (!context.settings.heuristics_only) {
-    // Wait for the branch and bound to finish
-    auto bb_status = branch_and_bound_status_future.get();
     if (branch_and_bound_solution.lower_bound > -std::numeric_limits<f_t>::infinity()) {
       context.stats.solution_bound =
         context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound);
@@ -257,7 +266,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     return sol;
   }
   context.problem_ptr->post_process_solution(sol);
-  dm.rins.stop_rins();
   return sol;
 }
 
diff --git a/cpp/src/mip/utilities/cpu_worker_thread.cuh b/cpp/src/mip/utilities/cpu_worker_thread.cuh
deleted file mode 100644
index 60bd5685b..000000000
--- a/cpp/src/mip/utilities/cpu_worker_thread.cuh
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
- * reserved. SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-
-namespace cuopt::linear_programming::detail {
-
-template <typename Derived>
-class cpu_worker_thread_base_t {
- public:
-  cpu_worker_thread_base_t();
-  ~cpu_worker_thread_base_t();
-
-  void start_cpu_solver();
-  bool wait_for_cpu_solver();
-
-  // Derived classes MUST call this in their destructor before the base destructor runs.
-  // This ensures on_terminate() is called while the derived object is still fully alive.
-  void request_termination();
-
-  // Internal method for thread management - safe to call during destruction
-  void join_worker();
-  void cpu_worker_thread();
-
-  std::thread cpu_worker;
-  std::mutex cpu_mutex;
-  std::condition_variable cpu_cv;
-  std::atomic<bool> should_stop{false};
-  std::atomic<bool> cpu_thread_should_start{false};
-  std::atomic<bool> cpu_thread_done{true};
-  std::atomic<bool> cpu_thread_terminate{false};
-};
-
-template <typename Derived>
-cpu_worker_thread_base_t<Derived>::cpu_worker_thread_base_t()
-{
-  cpu_worker = std::thread(&cpu_worker_thread_base_t<Derived>::cpu_worker_thread, this);
-}
-
-template <typename Derived>
-cpu_worker_thread_base_t<Derived>::~cpu_worker_thread_base_t()
-{
-  // Note: We don't call on_terminate() here since the derived object is already destroyed.
-  join_worker();
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::cpu_worker_thread()
-{
-  while (!cpu_thread_terminate) {
-    {
-      std::unique_lock<std::mutex> lock(cpu_mutex);
-      cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; });
-
-      if (cpu_thread_terminate) break;
-
-      cpu_thread_done         = false;
-      cpu_thread_should_start = false;
-    }
-
-    static_cast<Derived*>(this)->run_worker();
-
-    {
-      std::lock_guard<std::mutex> lock(cpu_mutex);
-      cpu_thread_done = true;
-    }
-    cpu_cv.notify_all();
-  }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::request_termination()
-{
-  bool should_terminate = false;
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    if (cpu_thread_terminate) return;
-    cpu_thread_terminate = true;
-    should_terminate     = true;
-    static_cast<Derived*>(this)->on_terminate();
-  }
-
-  if (should_terminate) {
-    cpu_cv.notify_one();
-    join_worker();
-  }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::join_worker()
-{
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    if (!cpu_thread_terminate) { cpu_thread_terminate = true; }
-  }
-  cpu_cv.notify_one();
-
-  if (cpu_worker.joinable()) { cpu_worker.join(); }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::start_cpu_solver()
-{
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    cpu_thread_done         = false;
-    cpu_thread_should_start = true;
-    static_cast<Derived*>(this)->on_start();
-  }
-  cpu_cv.notify_one();
-}
-
-template <typename Derived>
-bool cpu_worker_thread_base_t<Derived>::wait_for_cpu_solver()
-{
-  std::unique_lock<std::mutex> lock(cpu_mutex);
-  cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; });
-
-  return static_cast<Derived*>(this)->get_result();
-}
-
-}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index e1b68bf88..b79026762 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -42,7 +42,7 @@ class omp_atomic_t {
     return new_val;
   }
 
-  operator T() { return load(); }
+  operator T() const { return load(); }
   T operator+=(T inc) { return fetch_add(inc) + inc; }
   T operator-=(T inc) { return fetch_sub(inc) - inc; }
 
diff --git a/cpp/src/utilities/pcg.hpp b/cpp/src/utilities/pcg.hpp
new file mode 100644
index 000000000..5422fd8df
--- /dev/null
+++ b/cpp/src/utilities/pcg.hpp
@@ -0,0 +1,158 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <stdint.h>
+#include <vector>
+
+// Copied from raft/PCGenerator (rng_device.cuh). We cannot use raft implementation
+// on the CPU (.cpp file) since the raft header includes CUDA code.
+// The original code is from https://www.pcg-random.org/.
+namespace cuopt {
+class PCG {
+ public:
+  static constexpr uint64_t default_seed   = 0x853c49e6748fea9bULL;
+  static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL;
+
+  /**
+   * @brief ctor. Initializes the PCG
+   * @param rng_state is the generator state used for initializing the generator
+   * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences
+   * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a
+   * unique set of random numbers. This can be achieved by initializing the generator with same
+   * rng_state for all the threads and diststreamt values for subsequence.
+   */
+  PCG(const uint64_t seed        = default_seed,
+      const uint64_t subsequence = default_stream,
+      uint64_t offset            = 0)
+  {
+    set_seed(seed, subsequence, offset);
+  }
+
+  // Set the seed, subsequence and offset of the PCG
+  void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0)
+  {
+    state  = uint64_t(0);
+    stream = (subsequence << 1u) | 1u;
+    uint32_t discard;
+    next(discard);
+    state += seed;
+    next(discard);
+    skipahead(offset);
+  }
+
+  // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
+  // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
+  void skipahead(uint64_t offset)
+  {
+    uint64_t G = 1;
+    uint64_t h = 6364136223846793005ULL;
+    uint64_t C = 0;
+    uint64_t f = stream;
+    while (offset) {
+      if (offset & 1) {
+        G = G * h;
+        C = C * h + f;
+      }
+      f = f * (h + 1);
+      h = h * h;
+      offset >>= 1;
+    }
+    state = state * G + C;
+  }
+
+  /**
+   * @defgroup NextRand Generate the next random number
+   * @brief This code is derived from PCG basic code
+   * @{
+   */
+  uint32_t next_u32()
+  {
+    uint32_t ret;
+    uint64_t oldstate   = state;
+    state               = oldstate * 6364136223846793005ULL + stream;
+    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
+    uint32_t rot        = oldstate >> 59u;
+    ret                 = (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
+    return ret;
+  }
+
+  uint64_t next_u64()
+  {
+    uint64_t ret;
+    uint32_t a, b;
+    a   = next_u32();
+    b   = next_u32();
+    ret = uint64_t(a) | (uint64_t(b) << 32);
+    return ret;
+  }
+
+  int32_t next_i32()
+  {
+    int32_t ret;
+    uint32_t val;
+    val = next_u32();
+    ret = int32_t(val & 0x7fffffff);
+    return ret;
+  }
+
+  int64_t next_i64()
+  {
+    int64_t ret;
+    uint64_t val;
+    val = next_u64();
+    ret = int64_t(val & 0x7fffffffffffffff);
+    return ret;
+  }
+
+  float next_float() { return static_cast<float>((next_u32() >> 8) * 0x1.0p-24); }
+
+  double next_double() { return static_cast<double>((next_u64() >> 11) * 0x1.0p-53); }
+
+  template <typename T>
+  T next()
+  {
+    T val;
+    next(val);
+    return val;
+  }
+
+  void next(uint32_t& ret) { ret = next_u32(); }
+  void next(uint64_t& ret) { ret = next_u64(); }
+  void next(int32_t& ret) { ret = next_i32(); }
+  void next(int64_t& ret) { ret = next_i64(); }
+  void next(float& ret) { ret = next_float(); }
+  void next(double& ret) { ret = next_double(); }
+
+  /// Draws a sample from a uniform distribution. The samples are uniformly distributed over
+  /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward
+  /// some numbers in the range (scaling by floating-point).
+  template <typename T>
+  T uniform(T low, T high)
+  {
+    double val = next_double();
+    T range    = high - low;
+    return low + (val * range);
+  }
+
+  // Shuffles the contents of a sequence using the Fisher–Yates algorithm.
+  template <typename T>
+  void shuffle(std::vector<T>& seq)
+  {
+    if (seq.empty()) { return; }
+    for (size_t i = 0; i < seq.size() - 1; ++i) {
+      size_t j = uniform(i, seq.size());
+      if (j != i) std::swap(seq[i], seq[j]);
+    }
+  }
+
+ private:
+  uint64_t state;
+  uint64_t stream;
+};
+}  // namespace cuopt