diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index 213e38e5e..71034ff12 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -147,6 +147,7 @@ int run_single_file(std::string file_path, int num_cpu_threads, bool write_log_file, bool log_to_console, + bool reliability_branching, double time_limit) { const raft::handle_t handle_{}; @@ -197,6 +198,8 @@ int run_single_file(std::string file_path, } } + CUOPT_LOG_INFO("Reliability branching: %d\n", reliability_branching); + settings.time_limit = time_limit; settings.heuristics_only = heuristics_only; settings.num_cpu_threads = num_cpu_threads; @@ -204,6 +207,7 @@ int run_single_file(std::string file_path, settings.tolerances.relative_tolerance = 1e-12; settings.tolerances.absolute_tolerance = 1e-6; settings.presolve = true; + settings.reliability_branching = reliability_branching; cuopt::linear_programming::benchmark_info_t benchmark_info; settings.benchmark_info_ptr = &benchmark_info; auto start_run_solver = std::chrono::high_resolution_clock::now(); @@ -256,6 +260,7 @@ void run_single_file_mp(std::string file_path, int num_cpu_threads, bool write_log_file, bool log_to_console, + bool reliability_branching, double time_limit) { std::cout << "running file " << file_path << " on gpu : " << device << std::endl; @@ -271,6 +276,7 @@ void run_single_file_mp(std::string file_path, num_cpu_threads, write_log_file, log_to_console, + reliability_branching, time_limit); // this is a bad design to communicate the result but better than adding complexity of IPC or // pipes @@ -354,6 +360,10 @@ int main(int argc, char* argv[]) .help("track allocations (t/f)") .default_value(std::string("f")); + program.add_argument("--reliability-branching") + .help("enable reliability branching (t/f)") + .default_value(std::string("t")); + // Parse arguments try { program.parse_args(argc, argv); @@ -376,12 +386,13 @@ int main(int argc, char* argv[]) std::string result_file; int batch_num = -1; - bool heuristics_only = program.get("--heuristics-only")[0] == 't'; - int num_cpu_threads = program.get("--num-cpu-threads"); - bool write_log_file = program.get("--write-log-file")[0] == 't'; - bool log_to_console = program.get("--log-to-console")[0] == 't'; - double memory_limit = program.get("--memory-limit"); - bool track_allocations = program.get("--track-allocations")[0] == 't'; + bool heuristics_only = program.get("--heuristics-only")[0] == 't'; + int num_cpu_threads = program.get("--num-cpu-threads"); + bool write_log_file = program.get("--write-log-file")[0] == 't'; + bool log_to_console = program.get("--log-to-console")[0] == 't'; + double memory_limit = program.get("--memory-limit"); + bool track_allocations = program.get("--track-allocations")[0] == 't'; + bool reliability_branching = program.get("--reliability-branching")[0] == 't'; if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; } @@ -469,6 +480,7 @@ int main(int argc, char* argv[]) num_cpu_threads, write_log_file, log_to_console, + reliability_branching, time_limit); } else if (sys_pid < 0) { std::cerr << "Fork failed!" << std::endl; @@ -509,6 +521,7 @@ int main(int argc, char* argv[]) num_cpu_threads, write_log_file, log_to_console, + reliability_branching, time_limit); } diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 4f6320752..f5cb1c80e 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -83,6 +83,9 @@ class mip_solver_settings_t { i_t num_cpu_threads = -1; // -1 means use default number of threads in branch and bound i_t num_gpus = 1; bool log_to_console = true; + + bool reliability_branching = true; + std::string log_file; std::string sol_file; std::string user_problem_file; diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt index af1415fa9..3e72e33cb 100644 --- a/cpp/src/dual_simplex/CMakeLists.txt +++ b/cpp/src/dual_simplex/CMakeLists.txt @@ -13,7 +13,6 @@ set(DUAL_SIMPLEX_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/crossover.cpp ${CMAKE_CURRENT_SOURCE_DIR}/folding.cpp ${CMAKE_CURRENT_SOURCE_DIR}/initial_basis.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/phase1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/phase2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/presolve.cpp diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp index afd4f4c9a..038db59b9 100644 --- a/cpp/src/dual_simplex/basis_updates.hpp +++ b/cpp/src/dual_simplex/basis_updates.hpp @@ -223,6 +223,9 @@ class basis_update_mpf_t { reset_stats(); } + basis_update_mpf_t(const basis_update_mpf_t& other) = default; + basis_update_mpf_t& operator=(const basis_update_mpf_t& other) = default; + void print_stats() const { i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_; diff --git a/cpp/src/dual_simplex/bnb_worker.hpp b/cpp/src/dual_simplex/bnb_worker.hpp new file mode 100644 index 000000000..e770edd84 --- /dev/null +++ b/cpp/src/dual_simplex/bnb_worker.hpp @@ -0,0 +1,274 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +constexpr int bnb_num_worker_types = 5; + +// Indicate the search and variable selection algorithms used by each thread +// in B&B (See [1]). +// +// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, +// Berlin, 2007. doi: 10.14279/depositonce-1634. +enum bnb_worker_type_t : int { + BEST_FIRST = 0, // Best-First + Plunging. + PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) + LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) + GUIDED_DIVING = 3, // Guided diving (9.2.3). + COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) +}; + +template +struct bnb_stats_t { + f_t start_time = 0.0; + omp_atomic_t total_lp_solve_time = 0.0; + omp_atomic_t nodes_explored = 0; + omp_atomic_t nodes_unexplored = 0; + omp_atomic_t total_lp_iters = 0; + omp_atomic_t nodes_since_last_log = 0; + omp_atomic_t last_log = 0.0; +}; + +template +class bnb_worker_data_t { + public: + const i_t worker_id; + omp_atomic_t worker_type; + omp_atomic_t is_active; + omp_atomic_t lower_bound; + + lp_problem_t leaf_problem; + lp_solution_t leaf_solution; + std::vector leaf_edge_norms; + + basis_update_mpf_t basis_factors; + std::vector basic_list; + std::vector nonbasic_list; + + bounds_strengthening_t node_presolver; + std::vector bounds_changed; + + std::vector start_lower; + std::vector start_upper; + mip_node_t* start_node; + + PCG rng; + + bool recompute_basis = true; + bool recompute_bounds = true; + + bnb_worker_data_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings) + : worker_id(worker_id), + worker_type(BEST_FIRST), + is_active(false), + lower_bound(-std::numeric_limits::infinity()), + leaf_problem(original_lp), + leaf_solution(original_lp.num_rows, original_lp.num_cols), + basis_factors(original_lp.num_rows, settings.refactor_frequency), + basic_list(original_lp.num_rows), + nonbasic_list(), + node_presolver(leaf_problem, Arow, {}, var_type), + bounds_changed(original_lp.num_cols, false), + rng(PCG::default_seed ^ worker_id, PCG::default_stream + worker_id) + { + } + + // Set the `start_node` for best-first search. + void init_best_first(mip_node_t* node, const lp_problem_t& original_lp) + { + start_node = node; + start_lower = original_lp.lower; + start_upper = original_lp.upper; + worker_type = BEST_FIRST; + lower_bound = node->lower_bound; + is_active = true; + } + + // Initialize the worker for diving, setting the `start_node`, `start_lower` and + // `start_upper`. Returns `true` if the starting node is feasible via + // bounds propagation. + bool init_diving(mip_node_t* node, + bnb_worker_type_t type, + const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings) + { + internal_node = node->detach_copy(); + start_node = &internal_node; + + start_lower = original_lp.lower; + start_upper = original_lp.upper; + worker_type = type; + lower_bound = node->lower_bound; + is_active = true; + + std::fill(bounds_changed.begin(), bounds_changed.end(), false); + node->get_variable_bounds(start_lower, start_upper, bounds_changed); + + return node_presolver.bounds_strengthening(start_lower, start_upper, bounds_changed, settings); + } + + // Set the variables bounds for the LP relaxation of the current node. + bool set_lp_variable_bounds_for(mip_node_t* node_ptr, + const simplex_solver_settings_t& settings) + { + // Reset the bound_changed markers + std::fill(bounds_changed.begin(), bounds_changed.end(), false); + + // Set the correct bounds for the leaf problem + if (recompute_bounds) { + leaf_problem.lower = start_lower; + leaf_problem.upper = start_upper; + node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed); + + } else { + node_ptr->update_branched_variable_bounds( + leaf_problem.lower, leaf_problem.upper, bounds_changed); + } + + return node_presolver.bounds_strengthening( + leaf_problem.lower, leaf_problem.upper, bounds_changed, settings); + } + + private: + // For diving, we need to store the full node instead of + // of just a pointer, since it is not store in the tree anymore. + // To keep the same interface across all worker types, + // this will be used as a temporary storage and + // will be pointed by `start_node`. + // For exploration, this will not be used. + mip_node_t internal_node; +}; + +template +class bnb_worker_pool_t { + public: + void init(i_t num_workers, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings) + { + workers_.resize(num_workers); + num_idle_workers_ = num_workers; + for (i_t i = 0; i < num_workers; ++i) { + workers_[i] = + std::make_unique>(i, original_lp, Arow, var_type, settings); + idle_workers_.push_front(i); + } + } + + // Here, we are assuming that the master is the only + // thread that can retrieve/pop an idle worker. + bnb_worker_data_t* get_idle_worker() + { + std::lock_guard lock(mutex_); + if (idle_workers_.empty()) { + return nullptr; + } else { + i_t idx = idle_workers_.front(); + return workers_[idx].get(); + } + } + + // Here, we are assuming that the master is the only + // thread that can retrieve/pop an idle worker. + void pop_idle_worker() + { + std::lock_guard lock(mutex_); + if (!idle_workers_.empty()) { + idle_workers_.pop_front(); + num_idle_workers_--; + } + } + + void return_worker_to_pool(bnb_worker_data_t* worker) + { + worker->is_active = false; + std::lock_guard lock(mutex_); + idle_workers_.push_back(worker->worker_id); + num_idle_workers_++; + } + + f_t get_lower_bounds() + { + f_t lower_bound = std::numeric_limits::infinity(); + + for (i_t i = 0; i < workers_.size(); ++i) { + if (workers_[i]->worker_type == BEST_FIRST && workers_[i]->is_active) { + lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); + } + } + + return lower_bound; + } + + i_t num_idle_workers() { return num_idle_workers_; } + + private: + // Worker pool + std::vector>> workers_; + + omp_mutex_t mutex_; + std::deque idle_workers_; + omp_atomic_t num_idle_workers_; +}; + +template +std::vector bnb_get_worker_types(diving_heuristics_settings_t settings) +{ + std::vector types; + types.reserve(bnb_num_worker_types); + types.push_back(BEST_FIRST); + if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } + if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } + if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } + if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } + return types; +} + +template +std::array bnb_get_max_workers( + i_t num_workers, std::vector worker_types) +{ + std::array max_num_workers; + max_num_workers.fill(0); + + i_t bfs_workers = std::max(worker_types.size() == 1 ? num_workers : num_workers / 4, 1); + max_num_workers[BEST_FIRST] = bfs_workers; + + i_t diving_workers = (num_workers - bfs_workers); + i_t m = worker_types.size() - 1; + + for (size_t i = 1, k = 0; i < worker_types.size(); ++i) { + i_t start = (double)k * diving_workers / m; + i_t end = (double)(k + 1) * diving_workers / m; + max_num_workers[worker_types[i]] = end - start; + ++k; + } + + return max_num_workers; +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp index 4114e7e09..712f25837 100644 --- a/cpp/src/dual_simplex/bounds_strengthening.cpp +++ b/cpp/src/dual_simplex/bounds_strengthening.cpp @@ -59,8 +59,7 @@ bounds_strengthening_t::bounds_strengthening_t( const csr_matrix_t& Arow, const std::vector& row_sense, const std::vector& var_types) - : bounds_changed(problem.num_cols, false), - A(problem.A), + : A(problem.A), Arow(Arow), var_types(var_types), delta_min_activity(problem.num_rows), @@ -93,6 +92,7 @@ template bool bounds_strengthening_t::bounds_strengthening( std::vector& lower_bounds, std::vector& upper_bounds, + const std::vector& bounds_changed, const simplex_solver_settings_t& settings) { const i_t m = A.m; diff --git a/cpp/src/dual_simplex/bounds_strengthening.hpp b/cpp/src/dual_simplex/bounds_strengthening.hpp index e7e218b82..6a43ef44c 100644 --- a/cpp/src/dual_simplex/bounds_strengthening.hpp +++ b/cpp/src/dual_simplex/bounds_strengthening.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -22,10 +22,9 @@ class bounds_strengthening_t { bool bounds_strengthening(std::vector& lower_bounds, std::vector& upper_bounds, + const std::vector& bounds_changed, const simplex_solver_settings_t& settings); - std::vector bounds_changed; - private: const csc_matrix_t& A; const csr_matrix_t& Arow; diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp index 7fbe90284..2ab39d1d9 100644 --- a/cpp/src/dual_simplex/branch_and_bound.cpp +++ b/cpp/src/dual_simplex/branch_and_bound.cpp @@ -250,18 +250,14 @@ f_t branch_and_bound_t::get_lower_bound() f_t lower_bound = lower_bound_ceiling_.load(); f_t heap_lower_bound = node_queue_.get_lower_bound(); lower_bound = std::min(heap_lower_bound, lower_bound); - - for (i_t i = 0; i < local_lower_bounds_.size(); ++i) { - lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound); - } - + lower_bound = std::min(worker_pool_.get_lower_bounds(), lower_bound); return std::isfinite(lower_bound) ? lower_bound : -inf; } template void branch_and_bound_t::report_heuristic(f_t obj) { - if (is_running) { + if (is_running_) { f_t user_obj = compute_user_objective(original_lp_, obj); f_t user_lower = compute_user_objective(original_lp_, get_lower_bound()); std::string user_gap = user_mip_gap(user_obj, user_lower); @@ -286,7 +282,7 @@ void branch_and_bound_t::report(char symbol, f_t obj, f_t lower_bound, i_t nodes_unexplored = exploration_stats_.nodes_unexplored; f_t user_obj = compute_user_objective(original_lp_, obj); f_t user_lower = compute_user_objective(original_lp_, lower_bound); - f_t iter_node = exploration_stats_.total_lp_iters / nodes_explored; + f_t iter_node = (f_t)exploration_stats_.total_lp_iters / nodes_explored; std::string user_gap = user_mip_gap(user_obj, user_lower); settings_.log.printf("%c %10d %10lu %+13.6e %+10.6e %6d %7.1e %s %9.2f\n", symbol, @@ -569,24 +565,37 @@ template branch_variable_t branch_and_bound_t::variable_selection( mip_node_t* node_ptr, const std::vector& fractional, - const std::vector& solution, - bnb_worker_type_t type) + bnb_worker_data_t* worker_data) { logger_t log; log.log = false; i_t branch_var = -1; rounding_direction_t round_dir = rounding_direction_t::NONE; std::vector current_incumbent; + std::vector& solution = worker_data->leaf_solution.x; - // If there is no incumbent, use pseudocost diving instead of guided diving - if (upper_bound_ == inf && type == bnb_worker_type_t::GUIDED_DIVING) { - type = bnb_worker_type_t::PSEUDOCOST_DIVING; - } - - switch (type) { + switch (worker_data->worker_type) { case bnb_worker_type_t::BEST_FIRST: - branch_var = pc_.variable_selection(fractional, solution, log); - round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]); + + if (settings_.reliability_branching_settings.enable) { + simplex_solver_settings_t rb_settings = settings_; + rb_settings.reliability_branching_settings.num_tasks = worker_pool_.num_idle_workers(); + + branch_var = pc_.reliable_variable_selection(node_ptr, + fractional, + solution, + rb_settings, + var_types_, + worker_data, + exploration_stats_, + upper_bound_, + log); + } else { + branch_var = pc_.variable_selection(fractional, solution, log); + } + + round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]); + return {branch_var, round_dir}; case bnb_worker_type_t::COEFFICIENT_DIVING: @@ -606,29 +615,19 @@ branch_variable_t branch_and_bound_t::variable_selection( return guided_diving(pc_, fractional, solution, current_incumbent, log); default: - log.debug("Unknown variable selection method: %d\n", type); + log.debug("Unknown variable selection method: %d\n", worker_data->worker_type); return {-1, rounding_direction_t::NONE}; } } template -dual::status_t branch_and_bound_t::solve_node_lp( - mip_node_t* node_ptr, - lp_problem_t& leaf_problem, - lp_solution_t& leaf_solution, - basis_update_mpf_t& basis_factors, - std::vector& basic_list, - std::vector& nonbasic_list, - bounds_strengthening_t& node_presolver, - bnb_worker_type_t thread_type, - bool recompute_bounds_and_basis, - const std::vector& root_lower, - const std::vector& root_upper, - bnb_stats_t& stats, - logger_t& log) +dual::status_t branch_and_bound_t::solve_node_lp(mip_node_t* node_ptr, + bnb_worker_data_t* worker_data, + bnb_stats_t& stats, + logger_t& log) { std::vector& leaf_vstatus = node_ptr->vstatus; - assert(leaf_vstatus.size() == leaf_problem.num_cols); + assert(leaf_vstatus.size() == worker_data->leaf_problem.num_cols); simplex_solver_settings_t lp_settings = settings_; lp_settings.set_log(false); @@ -637,10 +636,10 @@ dual::status_t branch_and_bound_t::solve_node_lp( lp_settings.time_limit = settings_.time_limit - toc(exploration_stats_.start_time); lp_settings.scale_columns = false; - if (thread_type != bnb_worker_type_t::BEST_FIRST) { - i_t bnb_lp_iters = exploration_stats_.total_lp_iters; + if (worker_data->worker_type != bnb_worker_type_t::BEST_FIRST) { + int64_t bnb_lp_iters = exploration_stats_.total_lp_iters; f_t factor = settings_.diving_settings.iteration_limit_factor; - i_t max_iter = factor * bnb_lp_iters; + int64_t max_iter = factor * bnb_lp_iters; lp_settings.iteration_limit = max_iter - stats.total_lp_iters; if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; } } @@ -667,56 +666,40 @@ dual::status_t branch_and_bound_t::solve_node_lp( node_ptr->vstatus[node_ptr->branch_var]); #endif - // Reset the bound_changed markers - std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false); - - // Set the correct bounds for the leaf problem - if (recompute_bounds_and_basis) { - leaf_problem.lower = root_lower; - leaf_problem.upper = root_upper; - node_ptr->get_variable_bounds( - leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed); - - } else { - node_ptr->update_branched_variable_bounds( - leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed); - } - - bool feasible = - node_presolver.bounds_strengthening(leaf_problem.lower, leaf_problem.upper, lp_settings); - - dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED; + bool feasible = worker_data->set_lp_variable_bounds_for(node_ptr, settings_); + dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED; + worker_data->leaf_edge_norms = edge_norms_; // = node.steepest_edge_norms; if (feasible) { - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; // = node.steepest_edge_norms; + i_t node_iter = 0; + f_t lp_start_time = tic(); lp_status = dual_phase2_with_advanced_basis(2, 0, - recompute_bounds_and_basis, + worker_data->recompute_basis, lp_start_time, - leaf_problem, + worker_data->leaf_problem, lp_settings, leaf_vstatus, - basis_factors, - basic_list, - nonbasic_list, - leaf_solution, + worker_data->basis_factors, + worker_data->basic_list, + worker_data->nonbasic_list, + worker_data->leaf_solution, node_iter, - leaf_edge_norms); + worker_data->leaf_edge_norms); if (lp_status == dual::status_t::NUMERICAL) { log.printf("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id); - lp_status_t second_status = solve_linear_program_with_advanced_basis(leaf_problem, - lp_start_time, - lp_settings, - leaf_solution, - basis_factors, - basic_list, - nonbasic_list, - leaf_vstatus, - leaf_edge_norms); + lp_status_t second_status = + solve_linear_program_with_advanced_basis(worker_data->leaf_problem, + lp_start_time, + lp_settings, + worker_data->leaf_solution, + worker_data->basis_factors, + worker_data->basic_list, + worker_data->nonbasic_list, + leaf_vstatus, + worker_data->leaf_edge_norms); lp_status = convert_lp_status_to_dual_status(second_status); } @@ -731,19 +714,18 @@ dual::status_t branch_and_bound_t::solve_node_lp( return lp_status; } - template std::pair branch_and_bound_t::update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, - lp_problem_t& leaf_problem, - lp_solution_t& leaf_solution, - bnb_worker_type_t thread_type, + bnb_worker_data_t* worker_data, dual::status_t lp_status, logger_t& log) { const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10; std::vector& leaf_vstatus = node_ptr->vstatus; + lp_problem_t& leaf_problem = worker_data->leaf_problem; + lp_solution_t& leaf_solution = worker_data->leaf_solution; if (lp_status == dual::status_t::DUAL_UNBOUNDED) { // Node was infeasible. Do not branch @@ -766,12 +748,12 @@ std::pair branch_and_bound_t::upd i_t leaf_num_fractional = fractional_variables(settings_, leaf_solution.x, var_types_, leaf_fractional); - f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x); - node_ptr->lower_bound = leaf_objective; + f_t leaf_objective = compute_objective(leaf_problem, leaf_solution.x); search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective); pc_.update_pseudo_costs(node_ptr, leaf_objective); + node_ptr->lower_bound = leaf_objective; - if (thread_type == bnb_worker_type_t::BEST_FIRST) { + if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) { if (settings_.node_processed_callback != nullptr) { std::vector original_x; uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x); @@ -781,15 +763,15 @@ std::pair branch_and_bound_t::upd if (leaf_num_fractional == 0) { // Found a integer feasible solution - add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type); + add_feasible_solution( + leaf_objective, leaf_solution.x, node_ptr->depth, worker_data->worker_type); search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective); search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE); return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE}; } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) { // Choose fractional variable to branch on - auto [branch_var, round_dir] = - variable_selection(node_ptr, leaf_fractional, leaf_solution.x, thread_type); + auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker_data); assert(leaf_vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); @@ -798,7 +780,7 @@ std::pair branch_and_bound_t::upd // Note that the exploration thread is the only one that can insert new nodes into the heap, // and thus, we only need to calculate the objective estimate here (it is used for // sorting the nodes for diving). - if (thread_type == bnb_worker_type_t::BEST_FIRST) { + if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) { logger_t pc_log; pc_log.log = false; node_ptr->objective_estimate = @@ -816,7 +798,7 @@ std::pair branch_and_bound_t::upd return {node_status_t::FATHOMED, rounding_direction_t::NONE}; } } else { - if (thread_type == bnb_worker_type_t::BEST_FIRST) { + if (worker_data->worker_type == bnb_worker_type_t::BEST_FIRST) { fetch_min(lower_bound_ceiling_, node_ptr->lower_bound); log.printf( "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set " @@ -834,133 +816,20 @@ std::pair branch_and_bound_t::upd } template -void branch_and_bound_t::exploration_ramp_up(mip_node_t* node, - i_t initial_heap_size) +void branch_and_bound_t::plunge_with(bnb_worker_data_t* worker_data, + mip_solve_mode_t mode) { - if (solver_status_ != mip_status_t::UNSET) { return; } - - // Note that we do not know which thread will execute the - // `exploration_ramp_up` task, so we allow to any thread - // to repair the heuristic solution. - repair_heuristic_solutions(); - - f_t lower_bound = node->lower_bound; - f_t upper_bound = upper_bound_; - f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); - f_t abs_gap = upper_bound - lower_bound; - - if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) { - search_tree_.graphviz_node(settings_.log, node, "cutoff", node->lower_bound); - search_tree_.update(node, node_status_t::FATHOMED); - --exploration_stats_.nodes_unexplored; - return; - } - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - - if (((exploration_stats_.nodes_since_last_log >= 10 || - abs_gap < 10 * settings_.absolute_mip_gap_tol) && - (time_since_last_log >= 1)) || - (time_since_last_log > 30) || now > settings_.time_limit) { - bool should_report = should_report_.exchange(false); - - if (should_report) { - report(' ', upper_bound, root_objective_, node->depth); - exploration_stats_.nodes_since_last_log = 0; - exploration_stats_.last_log = tic(); - should_report_ = true; - } - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - return; - } - - // Make a copy of the original LP. We will modify its bounds at each leaf - lp_problem_t leaf_problem = original_lp_; - std::vector row_sense; - bounds_strengthening_t node_presolver(leaf_problem, Arow_, row_sense, var_types_); - - const i_t m = leaf_problem.num_rows; - basis_update_mpf_t basis_factors(m, settings_.refactor_frequency); - std::vector basic_list(m); - std::vector nonbasic_list; - - lp_solution_t leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols); - dual::status_t lp_status = solve_node_lp(node, - leaf_problem, - leaf_solution, - basis_factors, - basic_list, - nonbasic_list, - node_presolver, - bnb_worker_type_t::BEST_FIRST, - true, - original_lp_.lower, - original_lp_.upper, - exploration_stats_, - settings_.log); - if (lp_status == dual::status_t::TIME_LIMIT) { - solver_status_ = mip_status_t::TIME_LIMIT; - return; - } - - ++exploration_stats_.nodes_since_last_log; - ++exploration_stats_.nodes_explored; - --exploration_stats_.nodes_unexplored; - - auto [node_status, round_dir] = update_tree(node, - search_tree_, - leaf_problem, - leaf_solution, - bnb_worker_type_t::BEST_FIRST, - lp_status, - settings_.log); - - if (node_status == node_status_t::HAS_CHILDREN) { - exploration_stats_.nodes_unexplored += 2; - - // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase - if (exploration_stats_.nodes_unexplored < initial_heap_size) { -#pragma omp task - exploration_ramp_up(node->get_down_child(), initial_heap_size); - -#pragma omp task - exploration_ramp_up(node->get_up_child(), initial_heap_size); - - } else { - // We've generated enough nodes, push further nodes onto the heap - node_queue_.push(node->get_down_child()); - node_queue_.push(node->get_up_child()); - } - } -} - -template -void branch_and_bound_t::plunge_from(i_t task_id, - mip_node_t* start_node, - lp_problem_t& leaf_problem, - bounds_strengthening_t& node_presolver, - basis_update_mpf_t& basis_factors, - std::vector& basic_list, - std::vector& nonbasic_list) -{ - bool recompute_bounds_and_basis = true; std::deque*> stack; - stack.push_front(start_node); - - while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) { - if (task_id == 0) { repair_heuristic_solutions(); } + stack.push_front(worker_data->start_node); + worker_data->recompute_basis = true; + worker_data->recompute_bounds = true; + while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) { mip_node_t* node_ptr = stack.front(); stack.pop_front(); f_t lower_bound = node_ptr->lower_bound; f_t upper_bound = upper_bound_; - f_t abs_gap = upper_bound - lower_bound; f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); // This is based on three assumptions: @@ -969,56 +838,29 @@ void branch_and_bound_t::plunge_from(i_t task_id, // - The current node and its siblings uses the lower bound of the parent before solving the LP // relaxation // - The lower bound of the parent is lower or equal to its children - assert(task_id < local_lower_bounds_.size()); - local_lower_bounds_[task_id] = lower_bound; + worker_data->lower_bound = lower_bound; if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) { search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound); search_tree_.update(node_ptr, node_status_t::FATHOMED); - recompute_bounds_and_basis = true; + worker_data->recompute_basis = true; + worker_data->recompute_bounds = true; --exploration_stats_.nodes_unexplored; continue; } - f_t now = toc(exploration_stats_.start_time); - - if (task_id == 0) { - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - - if (((exploration_stats_.nodes_since_last_log >= 1000 || - abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - report(' ', upper_bound, get_lower_bound(), node_ptr->depth); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - } - - if (now > settings_.time_limit) { + if (toc(exploration_stats_.start_time) > settings_.time_limit) { solver_status_ = mip_status_t::TIME_LIMIT; break; } + if (exploration_stats_.nodes_explored >= settings_.node_limit) { solver_status_ = mip_status_t::NODE_LIMIT; break; } - lp_solution_t leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols); - dual::status_t lp_status = solve_node_lp(node_ptr, - leaf_problem, - leaf_solution, - basis_factors, - basic_list, - nonbasic_list, - node_presolver, - bnb_worker_type_t::BEST_FIRST, - recompute_bounds_and_basis, - original_lp_.lower, - original_lp_.upper, - exploration_stats_, - settings_.log); + dual::status_t lp_status = + solve_node_lp(node_ptr, worker_data, exploration_stats_, settings_.log); if (lp_status == dual::status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; @@ -1031,15 +873,11 @@ void branch_and_bound_t::plunge_from(i_t task_id, ++exploration_stats_.nodes_explored; --exploration_stats_.nodes_unexplored; - auto [node_status, round_dir] = update_tree(node_ptr, - search_tree_, - leaf_problem, - leaf_solution, - bnb_worker_type_t::BEST_FIRST, - lp_status, - settings_.log); + auto [node_status, round_dir] = + update_tree(node_ptr, search_tree_, worker_data, lp_status, settings_.log); - recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN; + worker_data->recompute_basis = node_status != node_status_t::HAS_CHILDREN; + worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN; if (node_status == node_status_t::HAS_CHILDREN) { // The stack should only contain the children of the current parent. @@ -1055,102 +893,45 @@ void branch_and_bound_t::plunge_from(i_t task_id, exploration_stats_.nodes_unexplored += 2; if (round_dir == rounding_direction_t::UP) { - stack.push_front(node_ptr->get_down_child()); + if (node_queue_.best_first_queue_size() < min_node_queue_size_) { + node_queue_.push(node_ptr->get_down_child()); + } else { + stack.push_front(node_ptr->get_down_child()); + } + stack.push_front(node_ptr->get_up_child()); } else { - stack.push_front(node_ptr->get_up_child()); - stack.push_front(node_ptr->get_down_child()); - } - } - } -} - -template -void branch_and_bound_t::best_first_thread(i_t task_id) -{ - f_t lower_bound = -inf; - f_t abs_gap = inf; - f_t rel_gap = inf; - - // Make a copy of the original LP. We will modify its bounds at each leaf - lp_problem_t leaf_problem = original_lp_; - std::vector row_sense; - bounds_strengthening_t node_presolver(leaf_problem, Arow_, row_sense, var_types_); - - const i_t m = leaf_problem.num_rows; - basis_update_mpf_t basis_factors(m, settings_.refactor_frequency); - std::vector basic_list(m); - std::vector nonbasic_list; + if (node_queue_.best_first_queue_size() < min_node_queue_size_) { + node_queue_.push(node_ptr->get_up_child()); + } else { + stack.push_front(node_ptr->get_up_child()); + } - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && - (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) { - // In the current implementation, we are use the active number of subtree to decide - // when to stop the execution. We need to increment the counter at the same - // time as we pop a node from the queue to avoid some threads exiting - // the main loop thinking that the solver has already finished. - // This will be not needed in the master-worker model. - node_queue_.lock(); - // If there any node left in the heap, we pop the top node and explore it. - std::optional*> start_node = node_queue_.pop_best_first(); - if (start_node.has_value()) { active_subtrees_++; }; - node_queue_.unlock(); - - if (start_node.has_value()) { - if (upper_bound_ < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - active_subtrees_--; - continue; + stack.push_front(node_ptr->get_down_child()); } - - // Best-first search with plunging - plunge_from(task_id, - start_node.value(), - leaf_problem, - node_presolver, - basis_factors, - basic_list, - nonbasic_list); - - active_subtrees_--; } - - lower_bound = get_lower_bound(); - abs_gap = upper_bound_ - lower_bound; - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); } - is_running = false; - - // Check if it is the last thread that exited the loop and no - // timeout or numerical error has happen. - if (solver_status_ == mip_status_t::UNSET) { - if (active_subtrees_ > 0) { local_lower_bounds_[task_id] = inf; } + if (mode == mip_solve_mode_t::BNB_PARALLEL) { + worker_pool_.return_worker_to_pool(worker_data); + active_workers_per_type_[BEST_FIRST]--; } } template -void branch_and_bound_t::dive_from(mip_node_t& start_node, - const std::vector& start_lower, - const std::vector& start_upper, - lp_problem_t& leaf_problem, - bounds_strengthening_t& node_presolver, - basis_update_mpf_t& basis_factors, - std::vector& basic_list, - std::vector& nonbasic_list, - bnb_worker_type_t diving_type) +void branch_and_bound_t::dive_with(bnb_worker_data_t* worker_data) { logger_t log; log.log = false; + bnb_worker_type_t diving_type = worker_data->worker_type; const i_t diving_node_limit = settings_.diving_settings.node_limit; const i_t diving_backtrack_limit = settings_.diving_settings.backtrack_limit; - bool recompute_bounds_and_basis = true; - search_tree_t dive_tree(std::move(start_node)); + + worker_data->recompute_basis = true; + worker_data->recompute_bounds = true; + + search_tree_t dive_tree(std::move(*worker_data->start_node)); std::deque*> stack; stack.push_front(&dive_tree.root); @@ -1160,36 +941,25 @@ void branch_and_bound_t::dive_from(mip_node_t& start_node, dive_stats.nodes_explored = 0; dive_stats.nodes_unexplored = 1; - while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) { + while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running_) { mip_node_t* node_ptr = stack.front(); stack.pop_front(); - f_t lower_bound = node_ptr->lower_bound; - f_t upper_bound = upper_bound_; - f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); + f_t lower_bound = node_ptr->lower_bound; + f_t upper_bound = upper_bound_; + f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); + worker_data->lower_bound = lower_bound; if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) { - recompute_bounds_and_basis = true; + worker_data->recompute_basis = true; + worker_data->recompute_bounds = true; continue; } if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; } if (dive_stats.nodes_explored > diving_node_limit) { break; } - lp_solution_t leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols); - dual::status_t lp_status = solve_node_lp(node_ptr, - leaf_problem, - leaf_solution, - basis_factors, - basic_list, - nonbasic_list, - node_presolver, - diving_type, - recompute_bounds_and_basis, - start_lower, - start_upper, - dive_stats, - log); + dual::status_t lp_status = solve_node_lp(node_ptr, worker_data, dive_stats, log); if (lp_status == dual::status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; @@ -1200,9 +970,10 @@ void branch_and_bound_t::dive_from(mip_node_t& start_node, ++dive_stats.nodes_explored; - auto [node_status, round_dir] = - update_tree(node_ptr, dive_tree, leaf_problem, leaf_solution, diving_type, lp_status, log); - recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN; + auto [node_status, round_dir] = update_tree(node_ptr, dive_tree, worker_data, lp_status, log); + + worker_data->recompute_basis = node_status != node_status_t::HAS_CHILDREN; + worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN; if (node_status == node_status_t::HAS_CHILDREN) { if (round_dir == rounding_direction_t::UP) { @@ -1220,68 +991,220 @@ void branch_and_bound_t::dive_from(mip_node_t& start_node, stack.pop_back(); } } + + worker_pool_.return_worker_to_pool(worker_data); + active_workers_per_type_[diving_type]--; +} + +template +void branch_and_bound_t::run_scheduler() +{ + diving_heuristics_settings_t diving_settings = settings_.diving_settings; + const i_t num_workers = 2 * settings_.num_threads; + + if (!std::isfinite(upper_bound_)) { diving_settings.guided_diving = false; } + std::vector worker_types = bnb_get_worker_types(diving_settings); + std::array max_num_workers_per_type = + bnb_get_max_workers(num_workers, worker_types); + +#ifdef CUOPT_LOG_DEBUG + for (auto type : worker_types) { + settings_.log.debug("%c%d: max num of workers = %d", + feasible_solution_symbol(type), + type, + max_num_workers_per_type[type]); + } +#endif + + worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_); + active_workers_per_type_.fill(0); + + f_t lower_bound = get_lower_bound(); + f_t abs_gap = upper_bound_ - lower_bound; + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + i_t last_node_depth = 0; + + is_running_ = true; + + settings_.log.printf( + " | Explored | Unexplored | Objective | Bound | Depth | Iter/Node | Gap " + "| Time |\n"); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && + (active_workers_per_type_[0] > 0 || node_queue_.best_first_queue_size() > 0)) { + bool launched_any_task = false; + lower_bound = get_lower_bound(); + abs_gap = upper_bound_ - lower_bound; + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + repair_heuristic_solutions(); + + // If the guided diving was disabled previously due to the lack of an incumbent solution, + // re-enable as soon as a new incumbent is found. + if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) { + if (std::isfinite(upper_bound_)) { + diving_settings.guided_diving = settings_.diving_settings.guided_diving; + worker_types = bnb_get_worker_types(diving_settings); + max_num_workers_per_type = bnb_get_max_workers(num_workers, worker_types); + +#ifdef CUOPT_LOG_DEBUG + for (auto type : worker_types) { + settings_.log.debug("%c%d: max num of workers = %d", + feasible_solution_symbol(type), + type, + max_num_workers_per_type[type]); + } +#endif + } + } + + f_t now = toc(exploration_stats_.start_time); + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + i_t depth = + node_queue_.best_first_queue_size() > 0 ? node_queue_.bfs_top()->depth : last_node_depth; + report(' ', upper_bound_, lower_bound, depth); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; + } + + if (now > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + break; + } + + for (auto type : worker_types) { + if (active_workers_per_type_[type] >= max_num_workers_per_type[type]) { continue; } + + // Get an idle worker. + bnb_worker_data_t* worker = worker_pool_.get_idle_worker(); + if (worker == nullptr) { break; } + + if (type == BEST_FIRST) { + // If there any node left in the heap, we pop the top node and explore it. + std::optional*> start_node = node_queue_.pop_best_first(); + + if (!start_node.has_value()) { continue; } + if (upper_bound_ < start_node.value()->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node( + settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); + search_tree_.update(start_node.value(), node_status_t::FATHOMED); + continue; + } + + // Remove the worker from the idle list. + worker_pool_.pop_idle_worker(); + worker->init_best_first(start_node.value(), original_lp_); + last_node_depth = start_node.value()->depth; + active_workers_per_type_[type]++; + launched_any_task = true; + +#pragma omp task affinity(worker) + plunge_with(worker, mip_solve_mode_t::BNB_PARALLEL); + + } else { + std::optional*> start_node = node_queue_.pop_diving(); + + if (!start_node.has_value()) { continue; } + if (upper_bound_ < start_node.value()->lower_bound || + start_node.value()->depth < diving_settings.min_node_depth) { + continue; + } + + bool is_feasible = worker->init_diving(start_node.value(), type, original_lp_, settings_); + if (!is_feasible) { continue; } + + // Remove the worker from the idle list. + worker_pool_.pop_idle_worker(); + active_workers_per_type_[type]++; + launched_any_task = true; + +#pragma omp task affinity(worker) + dive_with(worker); + } + } + + // If no new task was launched in this iteration, suspend temporarily the + // execution of the master. As of 8/Jan/2026, GCC does not + // implement taskyield, but LLVM does. + if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } + } + +#pragma omp taskwait + is_running_ = false; } template -void branch_and_bound_t::diving_thread(bnb_worker_type_t diving_type) +void branch_and_bound_t::single_threaded_solve() { - // Make a copy of the original LP. We will modify its bounds at each leaf - lp_problem_t leaf_problem = original_lp_; - std::vector row_sense; - bounds_strengthening_t node_presolver(leaf_problem, Arow_, row_sense, var_types_); - - const i_t m = leaf_problem.num_rows; - basis_update_mpf_t basis_factors(m, settings_.refactor_frequency); - std::vector basic_list(m); - std::vector nonbasic_list; - - std::vector start_lower; - std::vector start_upper; - bool reset_starting_bounds = true; - - while (solver_status_ == mip_status_t::UNSET && is_running && - (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) { - if (reset_starting_bounds) { - start_lower = original_lp_.lower; - start_upper = original_lp_.upper; - std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false); - reset_starting_bounds = false; + bnb_worker_data_t worker(0, original_lp_, Arow_, var_types_, settings_); + + f_t lower_bound = get_lower_bound(); + f_t abs_gap = upper_bound_ - lower_bound; + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + i_t last_node_depth = 0; + + is_running_ = true; + + settings_.log.printf( + " | Explored | Unexplored | Objective | Bound | Depth | Iter/Node | Gap " + "| Time |\n"); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { + bool launched_any_task = false; + lower_bound = get_lower_bound(); + abs_gap = upper_bound_ - lower_bound; + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + repair_heuristic_solutions(); + + f_t now = toc(exploration_stats_.start_time); + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + i_t depth = + node_queue_.best_first_queue_size() > 0 ? node_queue_.bfs_top()->depth : last_node_depth; + report(' ', upper_bound_, lower_bound, depth); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; } - // In the current implementation, multiple threads can pop the nodes - // from the queue, so we need to initialize the lower and upper bound here - // to avoid other thread fathoming the node (i.e., deleting) before we can read - // the variable bounds from the tree. - // This will be not needed in the master-worker model. - node_queue_.lock(); - std::optional*> node_ptr = node_queue_.pop_diving(); - std::optional> start_node = std::nullopt; - - if (node_ptr.has_value()) { - node_ptr.value()->get_variable_bounds( - start_lower, start_upper, node_presolver.bounds_changed); - start_node = node_ptr.value()->detach_copy(); + if (now > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + break; } - node_queue_.unlock(); - - if (start_node.has_value()) { - reset_starting_bounds = true; - - if (upper_bound_ < start_node->lower_bound) { continue; } - bool is_feasible = node_presolver.bounds_strengthening(start_lower, start_upper, settings_); - if (!is_feasible) { continue; } - - dive_from(start_node.value(), - start_lower, - start_upper, - leaf_problem, - node_presolver, - basis_factors, - basic_list, - nonbasic_list, - diving_type); + + // If there any node left in the heap, we pop the top node and explore it. + std::optional*> start_node = node_queue_.pop_best_first(); + + if (!start_node.has_value()) { continue; } + if (upper_bound_ < start_node.value()->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node( + settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); + search_tree_.update(start_node.value(), node_status_t::FATHOMED); + continue; } + + worker.init_best_first(start_node.value(), original_lp_); + plunge_with(&worker, mip_solve_mode_t::BNB_SINGLE_THREADED); } + + is_running_ = false; } template @@ -1295,15 +1218,19 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Root node path lp_status_t root_status; - std::future root_status_future; - root_status_future = std::async(std::launch::async, - &solve_linear_program_advanced, - std::ref(original_lp_), - exploration_stats_.start_time, - std::ref(lp_settings), - std::ref(root_relax_soln_), - std::ref(root_vstatus_), - std::ref(edge_norms_)); + + // Note that we need to explicitly declared `root_status` as a shared variable here since + // it is local to the thread that are executing the enclosing task. +#pragma omp task shared(root_status) + { + root_status = solve_linear_program_advanced(original_lp_, + exploration_stats_.start_time, + lp_settings, + root_relax_soln_, + root_vstatus_, + edge_norms_); + } + // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex // to finish while (!root_crossover_solution_set_.load(std::memory_order_acquire) && @@ -1346,7 +1273,7 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Check if crossover was stopped by dual simplex if (crossover_status == crossover_status_t::OPTIMAL) { set_root_concurrent_halt(1); // Stop dual simplex - root_status = root_status_future.get(); +#pragma omp taskwait // Override the root relaxation solution with the crossover solution root_relax_soln_ = root_crossover_soln_; @@ -1357,13 +1284,13 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name = "Barrier/PDLP and Crossover"; } else { - root_status = root_status_future.get(); +#pragma omp taskwait user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; solver_name = "Dual Simplex"; } } else { - root_status = root_status_future.get(); +#pragma omp taskwait user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; solver_name = "Dual Simplex"; @@ -1377,7 +1304,7 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name.c_str()); settings_.log.printf("Root relaxation objective %+.8e\n", user_objective); } else { - settings_.log.printf("Root relaxation returned status: %s\n", + settings_.log.printf("Root relaxation returned: %s\n", lp_status_to_string(root_status).c_str()); } @@ -1388,41 +1315,18 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( } template -mip_status_t branch_and_bound_t::solve(mip_solution_t& solution) +mip_status_t branch_and_bound_t::solve(mip_solution_t& solution, + mip_solve_mode_t solve_mode) { logger_t log; log.log = false; log.log_prefix = settings_.log.log_prefix; solver_status_ = mip_status_t::UNSET; - is_running = false; + is_running_ = false; exploration_stats_.nodes_unexplored = 0; exploration_stats_.nodes_explored = 0; original_lp_.A.to_compressed_row(Arow_); - std::vector diving_strategies; - diving_strategies.reserve(4); - - if (settings_.diving_settings.pseudocost_diving != 0) { - diving_strategies.push_back(bnb_worker_type_t::PSEUDOCOST_DIVING); - } - - if (settings_.diving_settings.line_search_diving != 0) { - diving_strategies.push_back(bnb_worker_type_t::LINE_SEARCH_DIVING); - } - - if (settings_.diving_settings.guided_diving != 0) { - diving_strategies.push_back(bnb_worker_type_t::GUIDED_DIVING); - } - - if (settings_.diving_settings.coefficient_diving != 0) { - diving_strategies.push_back(bnb_worker_type_t::COEFFICIENT_DIVING); - calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); - } - - if (diving_strategies.empty()) { - settings_.log.printf("Warning: All diving heuristics are disabled!\n"); - } - if (guess_.size() != 0) { std::vector crushed_guess; crush_primal_solution(original_problem_, original_lp_, guess_, new_slacks_, crushed_guess); @@ -1446,6 +1350,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut simplex_solver_settings_t lp_settings = settings_; lp_settings.inside_mip = 1; lp_settings.concurrent_halt = get_root_concurrent_halt(); + // RINS/SUBMIP path if (!enable_concurrent_lp_root_solve()) { settings_.log.printf("\nSolving LP root relaxation with dual simplex\n"); @@ -1492,7 +1397,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut set_uninitialized_steepest_edge_norms(edge_norms_); root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); - local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_); if (settings_.set_simplex_solution_callback != nullptr) { std::vector original_x; @@ -1567,59 +1471,28 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_vstatus_, original_lp_, log); + node_queue_.push(search_tree_.root.get_down_child()); + node_queue_.push(search_tree_.root.get_up_child()); - settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n\n", - settings_.num_threads, - settings_.num_bfs_workers, - settings_.num_threads - settings_.num_bfs_workers); + settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads); exploration_stats_.nodes_explored = 0; exploration_stats_.nodes_unexplored = 2; exploration_stats_.nodes_since_last_log = 0; exploration_stats_.last_log = tic(); - active_subtrees_ = 0; - is_running = true; lower_bound_ceiling_ = inf; - should_report_ = true; - - settings_.log.printf( - " | Explored | Unexplored | Objective | Bound | Depth | Iter/Node | Gap " - "| Time |\n"); - -#pragma omp parallel num_threads(settings_.num_threads) - { -#pragma omp master - { - auto down_child = search_tree_.root.get_down_child(); - auto up_child = search_tree_.root.get_up_child(); - i_t initial_size = 2 * settings_.num_threads; - const i_t num_strategies = diving_strategies.size(); - -#pragma omp taskgroup - { -#pragma omp task - exploration_ramp_up(down_child, initial_size); - -#pragma omp task - exploration_ramp_up(up_child, initial_size); - } + min_node_queue_size_ = 2 * settings_.num_threads; - for (i_t i = 0; i < settings_.num_bfs_workers; i++) { -#pragma omp task - best_first_thread(i); - } + if (settings_.diving_settings.coefficient_diving != 0) { + calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); + } - if (!diving_strategies.empty()) { - for (i_t k = 0; k < settings_.diving_settings.num_diving_workers; k++) { - const bnb_worker_type_t diving_type = diving_strategies[k % num_strategies]; -#pragma omp task - diving_thread(diving_type); - } - } - } + if (solve_mode == mip_solve_mode_t::BNB_PARALLEL) { + run_scheduler(); + } else { + single_threaded_solve(); } - is_running = false; f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound() : search_tree_.root.lower_bound; set_final_solution(solution, lower_bound); diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp index 327f99bc4..3efaaa6d6 100644 --- a/cpp/src/dual_simplex/branch_and_bound.hpp +++ b/cpp/src/dual_simplex/branch_and_bound.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -35,38 +36,14 @@ enum class mip_status_t { UNSET = 6, // The status is not set }; -// Indicate the search and variable selection algorithms used by each thread -// in B&B (See [1]). -// -// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, -// Berlin, 2007. doi: 10.14279/depositonce-1634. -enum class bnb_worker_type_t { - BEST_FIRST = 0, // Best-First + Plunging. - PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) - LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) - GUIDED_DIVING = 3, // Guided diving (9.2.3). If no incumbent is found yet, use pseudocost diving. - COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) +enum class mip_solve_mode_t { + BNB_PARALLEL = 0, // Parallel B&B (default) + BNB_SINGLE_THREADED = 1, // Single threaded B&B for SubMIP and RINS }; -template -class bounds_strengthening_t; - template void upper_bound_callback(f_t upper_bound); -template -struct bnb_stats_t { - f_t start_time = 0.0; - omp_atomic_t total_lp_solve_time = 0.0; - omp_atomic_t nodes_explored = 0; - omp_atomic_t nodes_unexplored = 0; - omp_atomic_t total_lp_iters = 0; - - // This should only be used by the main thread - omp_atomic_t last_log = 0.0; - omp_atomic_t nodes_since_last_log = 0; -}; - template class branch_and_bound_t { public: @@ -114,7 +91,7 @@ class branch_and_bound_t { lp_status_t solve_root_relaxation(simplex_solver_settings_t const& lp_settings); // The main entry routine. Returns the solver status and populates solution with the incumbent. - mip_status_t solve(mip_solution_t& solution); + mip_status_t solve(mip_solution_t& solution, mip_solve_mode_t solve_mode); private: const user_problem_t& original_problem_; @@ -135,9 +112,6 @@ class branch_and_bound_t { std::vector var_up_locks_; std::vector var_down_locks_; - // Local lower bounds for each thread - std::vector> local_lower_bounds_; - // Mutex for upper bound omp_mutex_t mutex_upper_; @@ -175,14 +149,21 @@ class branch_and_bound_t { // Search tree search_tree_t search_tree_; - // Count the number of subtrees that are currently being explored. - omp_atomic_t active_subtrees_; + // Count the number of workers per type that either are being executed or + // are waiting to be executed. + std::array, bnb_num_worker_types> active_workers_per_type_; + + // Worker pool + bnb_worker_pool_t worker_pool_; // Global status of the solver. omp_atomic_t solver_status_; - omp_atomic_t is_running{false}; + omp_atomic_t is_running_{false}; - omp_atomic_t should_report_; + // Minimum number of node in the queue. When the queue size is less than + // this variable, the nodes are added directly to the queue instead of + // the local stack. This also determines the end of the ramp-up phase. + i_t min_node_queue_size_; // In case, a best-first thread encounters a numerical issue when solving a node, // its blocks the progression of the lower bound. @@ -204,72 +185,43 @@ class branch_and_bound_t { // Repairs low-quality solutions from the heuristics, if it is applicable. void repair_heuristic_solutions(); - // Ramp-up phase of the solver, where we greedily expand the tree until - // there is enough unexplored nodes. This is done recursively using OpenMP tasks. - void exploration_ramp_up(mip_node_t* node, i_t initial_heap_size); - // We use best-first to pick the `start_node` and then perform a depth-first search // from this node (i.e., a plunge). It can only backtrack to a sibling node. // Unexplored nodes in the subtree are inserted back into the global heap. - void plunge_from(i_t task_id, - mip_node_t* start_node, - lp_problem_t& leaf_problem, - bounds_strengthening_t& node_presolver, - basis_update_mpf_t& basis_update, - std::vector& basic_list, - std::vector& nonbasic_list); - - // Each "main" thread pops a node from the global heap and then performs a plunge - // (i.e., a shallow dive) into the subtree determined by the node. - void best_first_thread(i_t task_id); + void plunge_with(bnb_worker_data_t* worker_data, mip_solve_mode_t mode); // Perform a deep dive in the subtree determined by the `start_node` in order // to find integer feasible solutions. - void dive_from(mip_node_t& start_node, - const std::vector& start_lower, - const std::vector& start_upper, - lp_problem_t& leaf_problem, - bounds_strengthening_t& node_presolver, - basis_update_mpf_t& basis_update, - std::vector& basic_list, - std::vector& nonbasic_list, - bnb_worker_type_t diving_type); - - // Each diving thread pops the first node from the dive queue and then performs - // a deep dive into the subtree determined by the node. - void diving_thread(bnb_worker_type_t diving_type); + void dive_with(bnb_worker_data_t* worker_data); + + // Run the scheduler (aka the master) whose will schedule and manage + // all the other workers. + void run_scheduler(); + + // Run the branch-and-bound algorithm in single threaded mode. + // This disable all diving heuristics. + void single_threaded_solve(); // Solve the LP relaxation of a leaf node dual::status_t solve_node_lp(mip_node_t* node_ptr, - lp_problem_t& leaf_problem, - lp_solution_t& leaf_solution, - basis_update_mpf_t& basis_factors, - std::vector& basic_list, - std::vector& nonbasic_list, - bounds_strengthening_t& node_presolver, - bnb_worker_type_t thread_type, - bool recompute_bounds_and_basis, - const std::vector& root_lower, - const std::vector& root_upper, + bnb_worker_data_t* worker_data, bnb_stats_t& stats, logger_t& log); // Update the tree based on the LP relaxation. Returns the status // of the node and, if appropriated, the preferred rounding direction // when visiting the children. - std::pair update_tree(mip_node_t* node_ptr, - search_tree_t& search_tree, - lp_problem_t& leaf_problem, - lp_solution_t& leaf_solution, - bnb_worker_type_t thread_type, - dual::status_t lp_status, - logger_t& log); + std::pair update_tree( + mip_node_t* node_ptr, + search_tree_t& search_tree, + bnb_worker_data_t* worker_data, + dual::status_t lp_status, + logger_t& log); // Selects the variable to branch on. branch_variable_t variable_selection(mip_node_t* node_ptr, const std::vector& fractional, - const std::vector& solution, - bnb_worker_type_t type); + bnb_worker_data_t* worker_data); }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp index a56b4cce3..44101c0c6 100644 --- a/cpp/src/dual_simplex/diving_heuristics.cpp +++ b/cpp/src/dual_simplex/diving_heuristics.cpp @@ -71,7 +71,6 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, const std::vector& root_solution, logger_t& log) { - std::lock_guard lock(pc.mutex); i_t branch_var = -1; f_t max_score = std::numeric_limits::lowest(); rounding_direction_t round_dir = rounding_direction_t::NONE; @@ -146,7 +145,6 @@ branch_variable_t guided_diving(pseudo_costs_t& pc, const std::vector& incumbent, logger_t& log) { - std::lock_guard lock(pc.mutex); i_t branch_var = -1; f_t max_score = std::numeric_limits::lowest(); rounding_direction_t round_dir = rounding_direction_t::NONE; diff --git a/cpp/src/dual_simplex/mip_node.cpp b/cpp/src/dual_simplex/mip_node.cpp deleted file mode 100644 index 547950fad..000000000 --- a/cpp/src/dual_simplex/mip_node.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -namespace cuopt::linear_programming::dual_simplex { - -bool inactive_status(node_status_t status) -{ - return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || - status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); -} - -} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp index de147132a..610fa66e5 100644 --- a/cpp/src/dual_simplex/mip_node.hpp +++ b/cpp/src/dual_simplex/mip_node.hpp @@ -29,7 +29,11 @@ enum class node_status_t : int { enum class rounding_direction_t : int8_t { NONE = -1, DOWN = 0, UP = 1 }; -bool inactive_status(node_status_t status); +inline bool inactive_status(node_status_t status) +{ + return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || + status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); +} template class mip_node_t { diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp index 28072795a..b5c9a9ea7 100644 --- a/cpp/src/dual_simplex/node_queue.hpp +++ b/cpp/src/dual_simplex/node_queue.hpp @@ -114,26 +114,29 @@ class node_queue_t { std::optional*> pop_best_first() { + std::lock_guard lock(mutex); auto entry = best_first_heap.pop(); + if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); } + return std::nullopt; } std::optional*> pop_diving() { + std::lock_guard lock(mutex); + while (!diving_heap.empty()) { auto entry = diving_heap.pop(); + if (entry.has_value()) { if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; } } } + return std::nullopt; } - void lock() { mutex.lock(); } - - void unlock() { mutex.unlock(); } - i_t diving_queue_size() { std::lock_guard lock(mutex); diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index bbfe18d61..179979a32 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -621,7 +621,7 @@ void convert_user_problem(const user_problem_t& user_problem, } constexpr bool run_bounds_strengthening = false; - if (run_bounds_strengthening) { + if constexpr (run_bounds_strengthening) { csr_matrix_t Arow(1, 1, 1); problem.A.to_compressed_row(Arow); @@ -629,8 +629,8 @@ void convert_user_problem(const user_problem_t& user_problem, // Empty var_types means that all variables are continuous bounds_strengthening_t strengthening(problem, Arow, row_sense, {}); - std::fill(strengthening.bounds_changed.begin(), strengthening.bounds_changed.end(), true); - strengthening.bounds_strengthening(problem.lower, problem.upper, settings); + std::vector bounds_changed(problem.num_cols, true); + strengthening.bounds_strengthening(problem.lower, problem.upper, bounds_changed, settings); } settings.log.debug( diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp index 0a8f660e9..31bed6756 100644 --- a/cpp/src/dual_simplex/pseudo_costs.cpp +++ b/cpp/src/dual_simplex/pseudo_costs.cpp @@ -133,6 +133,77 @@ void strong_branch_helper(i_t start, } } +template +f_t trial_branching(const lp_problem_t& original_lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + omp_atomic_t& total_lp_iter) +{ + lp_problem_t child_problem = original_lp; + child_problem.lower[branch_var] = branch_var_lower; + child_problem.upper[branch_var] = branch_var_upper; + + simplex_solver_settings_t child_settings = settings; + child_settings.set_log(false); + i_t lp_iter_upper = settings.reliability_branching_settings.upper_max_lp_iter; + i_t lp_iter_lower = settings.reliability_branching_settings.lower_max_lp_iter; + child_settings.iteration_limit = std::clamp(bnb_lp_iter_per_node, lp_iter_lower, lp_iter_upper); + child_settings.cut_off = upper_bound + settings.dual_tol; + child_settings.inside_mip = 2; + child_settings.scale_columns = false; + + lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); + i_t iter = 0; + std::vector child_vstatus = vstatus; + std::vector child_edge_norms = edge_norms; + std::vector child_basic_list = basic_list; + std::vector child_nonbasic_list = nonbasic_list; + basis_update_mpf_t child_basis_factors = basis_factors; + + dual::status_t status = dual_phase2_with_advanced_basis(2, + 0, + false, + start_time, + child_problem, + child_settings, + child_vstatus, + child_basis_factors, + child_basic_list, + child_nonbasic_list, + solution, + iter, + child_edge_norms); + total_lp_iter += iter; + settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n", + branch_var, + child_problem.lower[branch_var], + child_problem.upper[branch_var], + iter, + dual::status_to_string(status).c_str(), + compute_objective(child_problem, solution.x)); + + if (status == dual::status_t::DUAL_UNBOUNDED) { + // LP was infeasible + return std::numeric_limits::infinity(); + } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || + status == dual::status_t::CUTOFF) { + return compute_objective(child_problem, solution.x); + } else { + return std::numeric_limits::quiet_NaN(); + } +} + } // namespace template @@ -156,40 +227,37 @@ void strong_branching(const lp_problem_t& original_lp, settings.num_threads, fractional.size()); -#pragma omp parallel num_threads(settings.num_threads) - { - i_t n = std::min(4 * settings.num_threads, fractional.size()); - - // Here we are creating more tasks than the number of threads - // such that they can be scheduled dynamically to the threads. -#pragma omp for schedule(dynamic, 1) - for (i_t k = 0; k < n; k++) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); - - constexpr bool verbose = false; - if (verbose) { - settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", - omp_get_thread_num(), - k, - start, - end, - end - start); - } - - strong_branch_helper(start, - end, - start_time, - original_lp, - settings, - var_types, - fractional, - root_obj, - root_soln, - root_vstatus, - edge_norms, - pc); + // Here we are creating more tasks than the number of threads + // to overcome any load unbalanced between the iterations + i_t n = std::min(4 * settings.num_threads, fractional.size()); + +#pragma omp taskloop grainsize(1) + for (i_t k = 0; k < n; k++) { + i_t start = std::floor(k * fractional.size() / n); + i_t end = std::floor((k + 1) * fractional.size() / n); + + constexpr bool verbose = false; + if (verbose) { + settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", + omp_get_thread_num(), + k, + start, + end, + end - start); } + + strong_branch_helper(start, + end, + start_time, + original_lp, + settings, + var_types, + fractional, + root_obj, + root_soln, + root_vstatus, + edge_norms, + pc); } pc.update_pseudo_costs_from_strong_branching(fractional, root_soln); @@ -199,7 +267,6 @@ template void pseudo_costs_t::update_pseudo_costs(mip_node_t* node_ptr, f_t leaf_objective) { - std::lock_guard lock(mutex); const f_t change_in_obj = leaf_objective - node_ptr->lower_bound; const f_t frac = node_ptr->branch_dir == rounding_direction_t::DOWN ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val) @@ -217,7 +284,7 @@ template void pseudo_costs_t::initialized(i_t& num_initialized_down, i_t& num_initialized_up, f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const + f_t& pseudo_cost_up_avg) { num_initialized_down = 0; num_initialized_up = 0; @@ -257,8 +324,6 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio const std::vector& solution, logger_t& log) { - std::lock_guard lock(mutex); - const i_t num_fractional = fractional.size(); std::vector pseudo_cost_up(num_fractional); std::vector pseudo_cost_down(num_fractional); @@ -279,6 +344,7 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio for (i_t k = 0; k < num_fractional; k++) { const i_t j = fractional[k]; + if (pseudo_cost_num_down[j] != 0) { pseudo_cost_down[k] = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j]; } else { @@ -290,6 +356,7 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio } else { pseudo_cost_up[k] = pseudo_cost_up_avg; } + constexpr f_t eps = 1e-6; const f_t f_down = solution[j] - std::floor(solution[j]); const f_t f_up = std::ceil(solution[j]) - solution[j]; @@ -300,6 +367,7 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio i_t branch_var = fractional[0]; f_t max_score = -1; i_t select = -1; + for (i_t k = 0; k < num_fractional; k++) { if (score[k] > max_score) { max_score = score[k]; @@ -308,8 +376,193 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio } } + log.debug("Pseudocost branching on %d. Value %e. Score %e.\n", + branch_var, + solution[branch_var], + score[select]); + + return branch_var; +} + +template +i_t pseudo_costs_t::reliable_variable_selection( + mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + bnb_worker_data_t* worker_data, + const bnb_stats_t& bnb_stats, + f_t upper_bound, + logger_t& log) +{ + f_t start_time = bnb_stats.start_time; + i_t branch_var = fractional[0]; + f_t max_score = -1; + i_t num_initialized_down; + i_t num_initialized_up; + f_t pseudo_cost_down_avg; + f_t pseudo_cost_up_avg; + + initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); + + log.printf("PC: num initialized down %d up %d avg down %e up %e\n", + num_initialized_down, + num_initialized_up, + pseudo_cost_down_avg, + pseudo_cost_up_avg); + + const int64_t bnb_total_lp_iter = bnb_stats.total_lp_iters; + const int64_t bnb_nodes_explored = bnb_stats.nodes_explored; + const i_t bnb_lp_iter_per_node = bnb_total_lp_iter / bnb_stats.nodes_explored; + + i_t reliable_threshold = settings.reliability_branching_settings.reliable_threshold; + if (reliable_threshold < 0) { + const i_t max_threshold = settings.reliability_branching_settings.max_reliable_threshold; + const i_t min_threshold = settings.reliability_branching_settings.min_reliable_threshold; + const f_t iter_factor = settings.reliability_branching_settings.bnb_lp_factor; + const i_t iter_offset = settings.reliability_branching_settings.bnb_lp_offset; + const int64_t alpha = iter_factor * bnb_total_lp_iter; + const int64_t max_iter = alpha + settings.reliability_branching_settings.bnb_lp_offset; + + f_t gamma = (max_iter - sb_total_lp_iter) / (sb_total_lp_iter + 1.0); + gamma = std::min(1.0, gamma); + gamma = std::max((alpha - sb_total_lp_iter) / (sb_total_lp_iter + 1.0), gamma); + reliable_threshold = (1 - gamma) * min_threshold + gamma * max_threshold; + reliable_threshold = sb_total_lp_iter < max_iter ? reliable_threshold : 0; + } + + std::vector unreliable_list; + omp_mutex_t score_mutex; + + for (auto j : fractional) { + if (pseudo_cost_num_down[j] < reliable_threshold || + pseudo_cost_num_up[j] < reliable_threshold) { + unreliable_list.push_back(j); + continue; + } + + f_t pc_up = pseudo_cost_num_up[j] > 0 ? pseudo_cost_sum_up[j] / pseudo_cost_num_up[j] + : pseudo_cost_up_avg; + f_t pc_down = pseudo_cost_sum_down[j] > 0 ? pseudo_cost_sum_down[j] / pseudo_cost_num_down[j] + : pseudo_cost_down_avg; + + constexpr f_t eps = 1e-6; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + if (unreliable_list.empty()) { + log.printf( + "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); + + return branch_var; + } + + const int num_tasks = std::max(settings.reliability_branching_settings.num_tasks, 1); + const int task_priority = settings.reliability_branching_settings.task_priority; + const i_t max_num_candidates = settings.reliability_branching_settings.max_num_candidates; + const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); + + assert(task_priority > 0); + assert(max_num_candidates > 0); + assert(num_candidates > 0); + + log.printf( + "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n", + sb_total_lp_iter.load(), + bnb_total_lp_iter, + unreliable_list.size(), + num_tasks, + reliable_threshold); + + // Shuffle the unreliable list so every variable has the same chance to be selected. + if (unreliable_list.size() > max_num_candidates) { worker_data->rng.shuffle(unreliable_list); } + +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) + for (i_t i = 0; i < num_candidates; ++i) { + const i_t j = unreliable_list[i]; + std::lock_guard lock(pseudo_cost_mutex[j]); + + if (toc(start_time) > settings.time_limit) { continue; } + if (pseudo_cost_num_down[j] < reliable_threshold) { + // Do trial branching on the down branch + f_t obj = trial_branching(worker_data->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker_data->leaf_edge_norms, + worker_data->basis_factors, + worker_data->basic_list, + worker_data->nonbasic_list, + j, + worker_data->leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + bnb_lp_iter_per_node, + start_time, + sb_total_lp_iter); + + if (!std::isnan(obj)) { + f_t change_in_obj = obj - node_ptr->lower_bound; + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } + } + + if (toc(start_time) > settings.time_limit) { continue; } + if (pseudo_cost_num_up[j] < reliable_threshold) { + f_t obj = trial_branching(worker_data->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker_data->leaf_edge_norms, + worker_data->basis_factors, + worker_data->basic_list, + worker_data->nonbasic_list, + j, + std::ceil(solution[j]), + worker_data->leaf_problem.upper[j], + upper_bound, + bnb_lp_iter_per_node, + start_time, + sb_total_lp_iter); + + if (!std::isnan(obj)) { + f_t change_in_obj = obj - node_ptr->lower_bound; + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } + } + + if (toc(start_time) > settings.time_limit) { continue; } + f_t pc_up = pseudo_cost_num_up[j] > 0 ? pseudo_cost_sum_up[j] / pseudo_cost_num_up[j] + : pseudo_cost_up_avg; + f_t pc_down = pseudo_cost_sum_down[j] > 0 ? pseudo_cost_sum_down[j] / pseudo_cost_num_down[j] + : pseudo_cost_down_avg; + + constexpr f_t eps = 1e-6; + const f_t f_down = solution[j] - std::floor(solution[j]); + const f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); + + std::lock_guard score_lock(score_mutex); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + log.printf( - "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]); + "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); return branch_var; } @@ -320,8 +573,6 @@ f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, f_t lower_bound, logger_t& log) { - std::lock_guard lock(mutex); - const i_t num_fractional = fractional.size(); f_t estimate = lower_bound; @@ -333,7 +584,8 @@ f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); for (i_t k = 0; k < num_fractional; k++) { - const i_t j = fractional[k]; + const i_t j = fractional[k]; + f_t pseudo_cost_down = 0; f_t pseudo_cost_up = 0; @@ -348,10 +600,12 @@ f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, } else { pseudo_cost_up = pseudo_cost_up_avg; } + constexpr f_t eps = 1e-6; const f_t f_down = solution[j] - std::floor(solution[j]); const f_t f_up = std::ceil(solution[j]) - solution[j]; - estimate += std::min(pseudo_cost_down * f_down, pseudo_cost_up * f_up); + estimate += + std::min(std::max(pseudo_cost_down * f_down, eps), std::max(pseudo_cost_up * f_up, eps)); } log.printf("pseudocost estimate = %e\n", estimate); diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp index 4bab438fa..8d566483a 100644 --- a/cpp/src/dual_simplex/pseudo_costs.hpp +++ b/cpp/src/dual_simplex/pseudo_costs.hpp @@ -7,11 +7,14 @@ #pragma once +#include +#include #include #include #include #include #include +#include #include @@ -24,7 +27,8 @@ class pseudo_costs_t { : pseudo_cost_sum_down(num_variables), pseudo_cost_sum_up(num_variables), pseudo_cost_num_down(num_variables), - pseudo_cost_num_up(num_variables) + pseudo_cost_num_up(num_variables), + pseudo_cost_mutex(num_variables) { } @@ -36,12 +40,13 @@ class pseudo_costs_t { pseudo_cost_sum_up.assign(num_variables, 0); pseudo_cost_num_down.assign(num_variables, 0); pseudo_cost_num_up.assign(num_variables, 0); + pseudo_cost_mutex.resize(num_variables); } void initialized(i_t& num_initialized_down, i_t& num_initialized_up, f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const; + f_t& pseudo_cost_up_avg); f_t obj_estimate(const std::vector& fractional, const std::vector& solution, @@ -52,17 +57,27 @@ class pseudo_costs_t { const std::vector& solution, logger_t& log); + i_t reliable_variable_selection(mip_node_t* node_ptr, + const std::vector& fractional, + const std::vector& solution, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + bnb_worker_data_t* worker_data, + const bnb_stats_t& bnb_stats, + f_t upper_bound, + logger_t& log); + void update_pseudo_costs_from_strong_branching(const std::vector& fractional, const std::vector& root_soln); - std::vector pseudo_cost_sum_up; - std::vector pseudo_cost_sum_down; - std::vector pseudo_cost_num_up; - std::vector pseudo_cost_num_down; + std::vector> pseudo_cost_sum_up; + std::vector> pseudo_cost_sum_down; + std::vector> pseudo_cost_num_up; + std::vector> pseudo_cost_num_down; std::vector strong_branch_down; std::vector strong_branch_up; - - omp_mutex_t mutex; + std::vector pseudo_cost_mutex; omp_atomic_t num_strong_branches_completed = 0; + omp_atomic_t sb_total_lp_iter = 0; }; template diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index d86f84c39..2fdaf0356 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -22,18 +22,69 @@ namespace cuopt::linear_programming::dual_simplex { template struct diving_heuristics_settings_t { - i_t num_diving_workers = -1; - // -1 automatic, 0 disabled, 1 enabled i_t line_search_diving = -1; i_t pseudocost_diving = -1; i_t guided_diving = -1; i_t coefficient_diving = -1; - i_t min_node_depth = 10; - i_t node_limit = 500; + // The minimum depth to start diving from. + i_t min_node_depth = 10; + + // The maximum number of nodes when performing a dive. + i_t node_limit = 500; + + // The maximum number of dual simplex iteration allowed + // in a single dive. This set in terms of the total number of + // iterations in the best-first threads. f_t iteration_limit_factor = 0.05; - i_t backtrack_limit = 5; + + // The maximum backtracking allowed. + i_t backtrack_limit = 5; +}; + +template +struct reliability_branching_settings_t { + // Enable or disable reliability branching + bool enable = false; + + // Lower bound for the maximum number of LP iterations for a single trial branching + i_t lower_max_lp_iter = 10; + + // Upper bound for the maximum number of LP iterations for a single trial branching + i_t upper_max_lp_iter = 500; + + // Priority of the tasks created when running the trial branching in parallel. + // Set to 1 to have the same priority as the other tasks. + i_t task_priority = 5; + + // The number of tasks spawned for performing strong branching. + i_t num_tasks = -1; + + // The maximum number of candidates initialized by strong branching in a single + // node + i_t max_num_candidates = 100; + + // Define the maximum number of iteration spent in strong branching. + // Let `bnb_lp_iter` = total number of iterations in B&B, then + // `max iter in strong branching = bnb_lp_factor * bnb_lp_iter + bnb_lp_offset`. + // This is used for determining the `reliable_threshold`. + f_t bnb_lp_factor = 0.5; + i_t bnb_lp_offset = 100000; + + // Threshold for determining for the number of pseudocost updates. Used for + // determining if the pseudocost is reliable or not. + // - <0: automatic + // - 0: disable (use pseudocost branching instead) + // - >0: will use the value for the threshold. + i_t reliable_threshold = -1; + + // Maximum and minimum points of the curve to determine the value + // of the `reliable_threshold` based on the current number of LP + // iterations in strong branching and B&B. + // Only used when `reliable_threshold` is negative + i_t max_reliable_threshold = 5; + i_t min_reliable_threshold = 1; }; template @@ -87,14 +138,12 @@ struct simplex_solver_settings_t { iteration_log_frequency(1000), first_iteration_log(2), num_threads(omp_get_max_threads() - 1), - num_bfs_workers(std::max(num_threads / 4, 1)), random_seed(0), inside_mip(0), solution_callback(nullptr), heuristic_preemption_callback(nullptr), concurrent_halt(nullptr) { - diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1); } void set_log(bool logging) const { log.log = logging; } @@ -154,9 +203,10 @@ struct simplex_solver_settings_t { i_t first_iteration_log; // number of iterations to log at beginning of solve i_t num_threads; // number of threads to use i_t random_seed; // random seed - i_t num_bfs_workers; // number of threads dedicated to the best-first search diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics + reliability_branching_settings_t + reliability_branching_settings; // Settings for reliability branching i_t inside_mip; // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node std::function&, f_t)> solution_callback; diff --git a/cpp/src/dual_simplex/solution.hpp b/cpp/src/dual_simplex/solution.hpp index d1d745cbd..4e8bcaf75 100644 --- a/cpp/src/dual_simplex/solution.hpp +++ b/cpp/src/dual_simplex/solution.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -72,8 +72,8 @@ class mip_solution_t { std::vector x; f_t objective; f_t lower_bound; - i_t nodes_explored; - i_t simplex_iterations; + int64_t nodes_explored; + int64_t simplex_iterations; bool has_incumbent; }; diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 1f31a757d..cec1eb92d 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -599,7 +599,7 @@ i_t solve(const user_problem_t& problem, if (is_mip(problem) && !settings.relaxation) { branch_and_bound_t branch_and_bound(problem, settings); mip_solution_t mip_solution(problem.num_cols); - mip_status_t mip_status = branch_and_bound.solve(mip_solution); + mip_status_t mip_status = branch_and_bound.solve(mip_solution, mip_solve_mode_t::BNB_PARALLEL); if (mip_status == mip_status_t::OPTIMAL) { status = 0; } else { @@ -638,7 +638,7 @@ i_t solve_mip_with_guess(const user_problem_t& problem, if (is_mip(problem)) { branch_and_bound_t branch_and_bound(problem, settings); branch_and_bound.set_initial_guess(guess); - mip_status_t mip_status = branch_and_bound.solve(solution); + mip_status_t mip_status = branch_and_bound.solve(solution, mip_solve_mode_t::BNB_PARALLEL); if (mip_status == mip_status_t::OPTIMAL) { status = 0; } else { diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu index 7456b59ed..041447204 100644 --- a/cpp/src/mip/diversity/lns/rins.cu +++ b/cpp/src/mip/diversity/lns/rins.cu @@ -33,19 +33,6 @@ rins_t::rins_t(mip_solver_context_t& context_, time_limit = settings.default_time_limit; } -template -rins_thread_t::~rins_thread_t() -{ - this->request_termination(); -} - -template -void rins_thread_t::run_worker() -{ - raft::common::nvtx::range fun_scope("Running RINS"); - rins_ptr->run_rins(); -} - template void rins_t::new_best_incumbent_callback(const std::vector& solution) { @@ -56,19 +43,16 @@ template void rins_t::node_callback(const std::vector& solution, f_t objective) { if (!enabled) return; - node_count++; if (node_count - node_count_at_last_improvement < settings.nodes_after_later_improvement) return; - if (node_count - node_count_at_last_rins > settings.node_freq) { - // opportunistic early test w/ atomic to avoid having to take the lock - if (!rins_thread->cpu_thread_done) return; - std::lock_guard lock(rins_mutex); - if (rins_thread->cpu_thread_done && dm.population.current_size() > 0 && - dm.population.is_feasible()) { - lp_optimal_solution = solution; - rins_thread->start_cpu_solver(); + // If multiple threads call this function, only the first one will launch + // a new task for RINS. The other are ignored. + if (!launch_new_task.exchange(false)) return; + if (dm.population.current_size() > 0 && dm.population.is_feasible()) { +#pragma omp task + run_rins(solution); } } } @@ -76,8 +60,6 @@ void rins_t::node_callback(const std::vector& solution, f_t objec template void rins_t::enable() { - rins_thread = std::make_unique>(); - rins_thread->rins_ptr = this; seed = cuopt::seed_generator::get_seed(); problem_copy = std::make_unique>(*problem_ptr); problem_copy->handle_ptr = &rins_handle; @@ -85,19 +67,16 @@ void rins_t::enable() } template -void rins_t::stop_rins() +void rins_t::run_rins(std::vector lp_optimal_solution) { - enabled = false; - if (rins_thread) rins_thread->request_termination(); - rins_thread.reset(); -} + raft::common::nvtx::range fun_scope("Running RINS"); -template -void rins_t::run_rins() -{ if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); - if (!dm.population.is_feasible()) return; + if (!dm.population.is_feasible()) { + launch_new_task = true; + return; + } cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); @@ -127,10 +106,13 @@ void rins_t::run_rins() cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch"); cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible"); - if (!best_sol.get_feasible()) { return; } + if (!best_sol.get_feasible()) { + launch_new_task = true; + return; + } i_t sol_size_before_rins = best_sol.assignment.size(); - auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream()); + auto lp_opt_device = cuopt::device_copy(lp_optimal_solution, rins_handle.get_stream()); cuopt_assert(lp_opt_device.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(best_sol.assignment.size() == problem_copy->n_variables, "Assignment size mismatch"); @@ -151,6 +133,7 @@ void rins_t::run_rins() // abort if the fractional ratio is too low if (fractional_ratio < settings.min_fractional_ratio) { CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting"); + launch_new_task = true; return; } @@ -175,6 +158,7 @@ void rins_t::run_rins() if (n_to_fix == 0) { CUOPT_LOG_DEBUG("RINS no variables to fix"); + launch_new_task = true; return; } @@ -220,21 +204,25 @@ void rins_t::run_rins() mip_solver_context_t fj_context( &rins_handle, &fixed_problem, context.settings, context.scaling); fj_t fj(fj_context); + fj_t* fj_ptr = &fj; solution_t fj_solution(fixed_problem); fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream())); std::vector default_weights(fixed_problem.n_constraints, 1.); - cpu_fj_thread_t cpu_fj_thread; - cpu_fj_thread.fj_cpu = fj.create_cpu_climber(fj_solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); - cpu_fj_thread.fj_ptr = &fj; - cpu_fj_thread.fj_cpu->log_prefix = "[RINS] "; - cpu_fj_thread.time_limit = time_limit; - cpu_fj_thread.start_cpu_solver(); + + std::unique_ptr> fj_cpu = + fj.create_cpu_climber(fj_solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); + auto fj_cpu_ptr = fj_cpu.get(); + + fj_cpu->log_prefix = "[RINS] "; + +#pragma omp task + fj_ptr->cpu_solve(*fj_cpu_ptr, time_limit); f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound() : -std::numeric_limits::infinity(); @@ -256,26 +244,21 @@ void rins_t::run_rins() branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = std::min(current_mip_gap, (f_t)settings.target_mip_gap); - branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; - branch_and_bound_settings.num_threads = 2; - branch_and_bound_settings.num_bfs_workers = 1; - - // In the future, let RINS use all the diving heuristics. For now, - // restricting to guided diving. - branch_and_bound_settings.diving_settings.num_diving_workers = 1; - branch_and_bound_settings.diving_settings.line_search_diving = 0; - branch_and_bound_settings.diving_settings.coefficient_diving = 0; - branch_and_bound_settings.diving_settings.pseudocost_diving = 0; - branch_and_bound_settings.log.log = false; - branch_and_bound_settings.log.log_prefix = "[RINS] "; + branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; + branch_and_bound_settings.num_threads = 1; + branch_and_bound_settings.reliability_branching_settings.enable = false; + branch_and_bound_settings.log.log = false; + branch_and_bound_settings.log.log_prefix = "[RINS] "; branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector& solution, f_t objective) { rins_solution_queue.push_back(solution); }; + dual_simplex::branch_and_bound_t branch_and_bound(branch_and_bound_problem, branch_and_bound_settings); branch_and_bound.set_initial_guess(cuopt::host_copy(fixed_assignment, rins_handle.get_stream())); - branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution); + branch_and_bound_status = branch_and_bound.solve( + branch_and_bound_solution, dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED); if (!std::isnan(branch_and_bound_solution.objective)) { CUOPT_LOG_DEBUG("RINS submip solution found. Objective %.16e. Status %d", @@ -305,15 +288,13 @@ void rins_t::run_rins() time_limit = std::min(time_limit + 2, settings.max_time_limit); } - cpu_fj_thread.stop_cpu_solver(); - bool fj_solution_found = cpu_fj_thread.wait_for_cpu_solver(); - CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", cpu_fj_thread.fj_cpu->iterations); - if (fj_solution_found) { - CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", - cpu_fj_thread.fj_cpu->h_best_objective); - rins_solution_queue.push_back(cpu_fj_thread.fj_cpu->h_best_assignment); +#pragma omp taskwait + + CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations); + if (fj_cpu->feasible_found) { + CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", fj_cpu->h_best_objective); + rins_solution_queue.push_back(fj_cpu->h_best_assignment); } - // Thread will be automatically terminated and joined by destructor bool improvement_found = false; for (auto& fixed_sol : rins_solution_queue) { @@ -348,15 +329,14 @@ void rins_t::run_rins() if (improvement_found) total_success++; CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success); + launch_new_task = true; } #if MIP_INSTANTIATE_FLOAT -template class rins_thread_t; template class rins_t; #endif #if MIP_INSTANTIATE_DOUBLE -template class rins_thread_t; template class rins_t; #endif diff --git a/cpp/src/mip/diversity/lns/rins.cuh b/cpp/src/mip/diversity/lns/rins.cuh index 3d16875ce..65808325b 100644 --- a/cpp/src/mip/diversity/lns/rins.cuh +++ b/cpp/src/mip/diversity/lns/rins.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights * reserved. SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,16 +20,10 @@ #include #include #include -#include +#include #include -#include -#include -#include -#include -#include -#include #include namespace cuopt::linear_programming::detail { @@ -55,18 +49,6 @@ struct rins_settings_t { template class rins_t; -template -struct rins_thread_t : public cpu_worker_thread_base_t> { - ~rins_thread_t(); - - void run_worker(); - void on_terminate() {} - void on_start() {} - bool get_result() { return true; } - - rins_t* rins_ptr{nullptr}; -}; - template class rins_t { public: @@ -77,9 +59,10 @@ class rins_t { void node_callback(const std::vector& solution, f_t objective); void new_best_incumbent_callback(const std::vector& solution); void enable(); - void stop_rins(); - void run_rins(); + // We need to copy the vector since many tasks may be created + // for running this routine. + void run_rins(std::vector lp_optimal_solution); mip_solver_context_t& context; problem_t* problem_ptr; @@ -91,23 +74,19 @@ class rins_t { std::unique_ptr> problem_copy; raft::handle_t rins_handle; - std::vector lp_optimal_solution; - f_t fixrate{0.5}; i_t total_calls{0}; i_t total_success{0}; f_t time_limit{10.}; i_t seed; - std::atomic enabled{false}; - std::atomic lower_bound{0.}; - - std::atomic node_count{0}; - std::atomic node_count_at_last_rins{0}; - std::atomic node_count_at_last_improvement{0}; - std::mutex rins_mutex; + omp_atomic_t enabled{false}; + omp_atomic_t lower_bound{0.}; - std::unique_ptr> rins_thread; + omp_atomic_t node_count{0}; + omp_atomic_t node_count_at_last_rins{0}; + omp_atomic_t node_count_at_last_improvement{0}; + omp_atomic_t launch_new_task{true}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh index 82670437a..340e0b9ff 100644 --- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh +++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh @@ -101,16 +101,9 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_settings.print_presolve_stats = false; branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; - branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; - branch_and_bound_settings.num_threads = 2; - branch_and_bound_settings.num_bfs_workers = 1; - - // In the future, let SubMIP use all the diving heuristics. For now, - // restricting to guided diving. - branch_and_bound_settings.diving_settings.num_diving_workers = 1; - branch_and_bound_settings.diving_settings.line_search_diving = 0; - branch_and_bound_settings.diving_settings.coefficient_diving = 0; - branch_and_bound_settings.diving_settings.pseudocost_diving = 0; + branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; + branch_and_bound_settings.num_threads = 1; + branch_and_bound_settings.reliability_branching_settings.enable = false; branch_and_bound_settings.solution_callback = [this](std::vector& solution, f_t objective) { this->solution_callback(solution, objective); @@ -120,7 +113,8 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_settings.log.log = false; dual_simplex::branch_and_bound_t branch_and_bound(branch_and_bound_problem, branch_and_bound_settings); - branch_and_bound_status = branch_and_bound.solve(branch_and_bound_solution); + branch_and_bound_status = branch_and_bound.solve( + branch_and_bound_solution, dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED); if (solution_vector.size() > 0) { cuopt_assert(fixed_assignment.size() == branch_and_bound_solution.x.size(), "Assignment size mismatch"); diff --git a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh index 36d50af32..5e3bfd123 100644 --- a/cpp/src/mip/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip/feasibility_jump/feasibility_jump.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -210,7 +210,7 @@ class fj_t { std::atomic& preemption_flag, fj_settings_t settings = fj_settings_t{}, bool randomize_params = false); - bool cpu_solve(fj_cpu_climber_t& fj_cpu, + void cpu_solve(fj_cpu_climber_t& fj_cpu, f_t time_limit = +std::numeric_limits::infinity()); i_t alloc_max_climbers(i_t desired_climbers); void resize_vectors(const raft::handle_t* handle_ptr); diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cu b/cpp/src/mip/feasibility_jump/fj_cpu.cu index 8d534dfff..14f0271af 100644 --- a/cpp/src/mip/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip/feasibility_jump/fj_cpu.cu @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -934,7 +934,7 @@ std::unique_ptr> fj_t::create_cpu_climber( } template -bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_limit) +void fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_limit) { raft::common::nvtx::range scope("fj_cpu"); @@ -1050,50 +1050,14 @@ bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_l CUOPT_LOG_TRACE("\n=== Final Timing Statistics ===\n"); print_timing_stats(fj_cpu); #endif - - return fj_cpu.feasible_found; -} - -template -cpu_fj_thread_t::~cpu_fj_thread_t() -{ - this->request_termination(); -} - -template -void cpu_fj_thread_t::run_worker() -{ - bool solution_found = fj_ptr->cpu_solve(*fj_cpu, time_limit); - cpu_fj_solution_found = solution_found; -} - -template -void cpu_fj_thread_t::on_terminate() -{ - if (fj_cpu) fj_cpu->halted = true; -} - -template -void cpu_fj_thread_t::on_start() -{ - cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null"); - fj_cpu->halted = false; -} - -template -void cpu_fj_thread_t::stop_cpu_solver() -{ - fj_cpu->halted = true; } #if MIP_INSTANTIATE_FLOAT template class fj_t; -template class cpu_fj_thread_t; #endif #if MIP_INSTANTIATE_DOUBLE template class fj_t; -template class cpu_fj_thread_t; #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/feasibility_jump/fj_cpu.cuh b/cpp/src/mip/feasibility_jump/fj_cpu.cuh index 4b9cfc0cc..1d2d63ea7 100644 --- a/cpp/src/mip/feasibility_jump/fj_cpu.cuh +++ b/cpp/src/mip/feasibility_jump/fj_cpu.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -8,16 +8,12 @@ #pragma once #include -#include #include -#include -#include -#include #include #include #include -#include +#include namespace cuopt::linear_programming::detail { @@ -90,7 +86,7 @@ struct fj_cpu_climber_t { // vector is actually likely beneficial here since we're memory bound std::vector flip_move_computed; - ; + // CSR nnz offset -> (delta, score) std::vector> cached_mtm_moves; @@ -119,21 +115,4 @@ struct fj_cpu_climber_t { std::atomic& preemption_flag; }; -template -struct cpu_fj_thread_t : public cpu_worker_thread_base_t> { - ~cpu_fj_thread_t(); - - void run_worker(); - void on_terminate(); - void on_start(); - bool get_result() { return cpu_fj_solution_found; } - - void stop_cpu_solver(); - - std::atomic cpu_fj_solution_found{false}; - f_t time_limit{+std::numeric_limits::infinity()}; - std::unique_ptr> fj_cpu; - fj_t* fj_ptr{nullptr}; -}; - } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/local_search/local_search.cu b/cpp/src/mip/local_search/local_search.cu index ecd277065..805057485 100644 --- a/cpp/src/mip/local_search/local_search.cu +++ b/cpp/src/mip/local_search/local_search.cu @@ -20,8 +20,6 @@ #include -#include - namespace cuopt::linear_programming::detail { template @@ -44,13 +42,9 @@ local_search_t::local_search_t(mip_solver_context_t& context rng(cuopt::seed_generator::get_seed()), problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr) { - for (auto& cpu_fj : ls_cpu_fj) { - cpu_fj.fj_ptr = &fj; - } - for (auto& cpu_fj : scratch_cpu_fj) { - cpu_fj.fj_ptr = &fj; - } - scratch_cpu_fj_on_lp_opt.fj_ptr = &fj; + fj_ptr = &fj; + ls_cpu_fj.resize(num_threads_for_cpu_fj); + scratch_cpu_fj.resize(num_threads_for_scratch_cpu_fj); } static double local_search_best_obj = std::numeric_limits::max(); @@ -72,16 +66,16 @@ void local_search_t::start_cpufj_scratch_threads(population_t 0) solution.assign_random_within_bounds(0.4); - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - /*randomize=*/counter > 0); - - cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": "; - cpu_fj.fj_cpu->improvement_callback = [&population](f_t obj, const std::vector& h_vec) { + cpu_fj = fj_ptr->create_cpu_climber(solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + /*randomize=*/counter > 0); + + cpu_fj->log_prefix = "******* scratch " + std::to_string(counter) + ": "; + cpu_fj->improvement_callback = [&population](f_t obj, const std::vector& h_vec) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); if (obj < local_search_best_obj) { CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g", @@ -95,8 +89,9 @@ void local_search_t::start_cpufj_scratch_threads(population_tcpu_solve(*scratch_cpu_fj[i]); } } @@ -112,34 +107,37 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp.copy_new_assignment( host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream())); solution_lp.round_random_nearest(500); - scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber( + scratch_cpu_fj_on_lp_opt = fj.create_cpu_climber( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); - scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; - scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = - [this, &population](f_t obj, const std::vector& h_vec) { - population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - if (obj < local_search_best_obj) { - CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", - context.problem_ptr->get_user_obj_from_solver_obj(obj), - context.problem_ptr->get_user_obj_from_solver_obj( - population.is_feasible() ? population.best_feasible().get_objective() - : std::numeric_limits::max())); - local_search_best_obj = obj; - } - }; + scratch_cpu_fj_on_lp_opt->log_prefix = "******* scratch on LP optimal: "; + scratch_cpu_fj_on_lp_opt->improvement_callback = [this, &population]( + f_t obj, const std::vector& h_vec) { + population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); + if (obj < local_search_best_obj) { + CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", + context.problem_ptr->get_user_obj_from_solver_obj(obj), + context.problem_ptr->get_user_obj_from_solver_obj( + population.is_feasible() ? population.best_feasible().get_objective() + : std::numeric_limits::max())); + local_search_best_obj = obj; + } + }; // default weights cudaDeviceSynchronize(); - scratch_cpu_fj_on_lp_opt.start_cpu_solver(); + +#pragma omp task + fj_ptr->cpu_solve(*scratch_cpu_fj_on_lp_opt); } template void local_search_t::stop_cpufj_scratch_threads() { for (auto& cpu_fj : scratch_cpu_fj) { - cpu_fj.request_termination(); + cpu_fj->halted = true; } - scratch_cpu_fj_on_lp_opt.request_termination(); + + if (scratch_cpu_fj_on_lp_opt) scratch_cpu_fj_on_lp_opt->halted = true; } template @@ -155,20 +153,22 @@ bool local_search_t::do_fj_solve(solution_t& solution, auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); for (auto& cpu_fj : ls_cpu_fj) { - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - h_weights, - h_weights, - h_objective_weight, - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); + cpu_fj = fj_ptr->create_cpu_climber(solution, + h_weights, + h_weights, + h_objective_weight, + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); } auto solution_copy = solution; // Start CPU solver in background thread - for (auto& cpu_fj : ls_cpu_fj) { - cpu_fj.start_cpu_solver(); + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + auto ptr = &ls_cpu_fj[i]; +#pragma omp task depend(inout : ptr) + fj_ptr->cpu_solve(*ls_cpu_fj[i]); } // Run GPU solver and measure execution time @@ -176,9 +176,8 @@ bool local_search_t::do_fj_solve(solution_t& solution, in_fj.settings.time_limit = timer.remaining_time(); in_fj.solve(solution); - // Stop CPU solver - for (auto& cpu_fj : ls_cpu_fj) { - cpu_fj.stop_cpu_solver(); + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + ls_cpu_fj[i]->halted = true; } auto gpu_fj_end = std::chrono::high_resolution_clock::now(); @@ -188,13 +187,15 @@ bool local_search_t::do_fj_solve(solution_t& solution, f_t best_cpu_obj = std::numeric_limits::max(); // // Wait for CPU solver to finish - for (auto& cpu_fj : ls_cpu_fj) { - bool cpu_sol_found = cpu_fj.wait_for_cpu_solver(); - if (cpu_sol_found) { - f_t cpu_obj = cpu_fj.fj_cpu->h_best_objective; + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + auto ptr = &ls_cpu_fj[i]; +#pragma omp taskwait depend(inout : ptr) + + if (ls_cpu_fj[i]->feasible_found) { + f_t cpu_obj = ls_cpu_fj[i]->h_best_objective; if (cpu_obj < best_cpu_obj) { best_cpu_obj = cpu_obj; - solution_cpu.copy_new_assignment(cpu_fj.fj_cpu->h_best_assignment); + solution_cpu.copy_new_assignment(ls_cpu_fj[i]->h_best_assignment); solution_cpu.compute_feasibility(); } } diff --git a/cpp/src/mip/local_search/local_search.cuh b/cpp/src/mip/local_search/local_search.cuh index 6fdf4ac72..5264c9372 100644 --- a/cpp/src/mip/local_search/local_search.cuh +++ b/cpp/src/mip/local_search/local_search.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -13,13 +13,9 @@ #include #include #include -#include -#include -#include -#include -#include -#include +#include +#include namespace cuopt::linear_programming::detail { @@ -117,9 +113,13 @@ class local_search_t { feasibility_pump_t fp; std::mt19937 rng; - std::array, 8> ls_cpu_fj; - std::array, 1> scratch_cpu_fj; - cpu_fj_thread_t scratch_cpu_fj_on_lp_opt; + const i_t num_threads_for_cpu_fj = 8; + const i_t num_threads_for_scratch_cpu_fj = 1; + + fj_t* fj_ptr; + std::vector>> ls_cpu_fj; + std::vector>> scratch_cpu_fj; + std::unique_ptr> scratch_cpu_fj_on_lp_opt; problem_t problem_with_objective_cut; bool cutting_plane_added_for_active_run{false}; }; diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu index 08e1806b9..adb191f1f 100644 --- a/cpp/src/mip/solver.cu +++ b/cpp/src/mip/solver.cu @@ -87,6 +87,8 @@ struct branch_and_bound_solution_helper_t { template solution_t mip_solver_t::run_solver() { + solution_t sol(*context.problem_ptr); + if (context.settings.get_mip_callbacks().size() > 0) { for (auto callback : context.settings.get_mip_callbacks()) { callback->template setup(context.problem_ptr->original_problem_ptr->get_n_variables()); @@ -101,7 +103,6 @@ solution_t mip_solver_t::run_solver() if (context.problem_ptr->empty) { CUOPT_LOG_INFO("Problem fully reduced in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); context.problem_ptr->post_process_solution(sol); return sol; @@ -112,14 +113,12 @@ solution_t mip_solver_t::run_solver() bool presolve_success = dm.run_presolve(timer_.remaining_time()); if (!presolve_success) { CUOPT_LOG_INFO("Problem proven infeasible in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); context.problem_ptr->post_process_solution(sol); return sol; } if (context.problem_ptr->empty) { CUOPT_LOG_INFO("Problem full reduced in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); context.problem_ptr->post_process_solution(sol); return sol; @@ -147,97 +146,107 @@ solution_t mip_solver_t::run_solver() return sol; } - namespace dual_simplex = cuopt::linear_programming::dual_simplex; - std::future branch_and_bound_status_future; + namespace dual_simplex = cuopt::linear_programming::dual_simplex; + dual_simplex::mip_status_t bb_status = dual_simplex::mip_status_t::UNSET; dual_simplex::user_problem_t branch_and_bound_problem(context.problem_ptr->handle_ptr); dual_simplex::simplex_solver_settings_t branch_and_bound_settings; std::unique_ptr> branch_and_bound; branch_and_bound_solution_helper_t solution_helper(&dm, branch_and_bound_settings); dual_simplex::mip_solution_t branch_and_bound_solution(1); - if (!context.settings.heuristics_only) { - // Convert the presolved problem to dual_simplex::user_problem_t - op_problem_.get_host_user_problem(branch_and_bound_problem); - // Resize the solution now that we know the number of columns/variables - branch_and_bound_solution.resize(branch_and_bound_problem.num_cols); - - // Fill in the settings for branch and bound - branch_and_bound_settings.time_limit = timer_.remaining_time(); - branch_and_bound_settings.print_presolve_stats = false; - branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; - branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; - branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; - - if (context.settings.num_cpu_threads < 0) { - branch_and_bound_settings.num_threads = omp_get_max_threads() - 1; - } else { - branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads); - } + i_t num_threads = 0; + if (context.settings.num_cpu_threads < 0) { + num_threads = omp_get_max_threads(); + } else { + num_threads = std::max(1, context.settings.num_cpu_threads); + } - i_t num_threads = branch_and_bound_settings.num_threads; - i_t num_bfs_workers = std::max(1, num_threads / 4); - i_t num_diving_workers = std::max(1, num_threads - num_bfs_workers); - branch_and_bound_settings.num_bfs_workers = num_bfs_workers; - branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers; - - // Set the branch and bound -> primal heuristics callback - branch_and_bound_settings.solution_callback = - std::bind(&branch_and_bound_solution_helper_t::solution_callback, - &solution_helper, - std::placeholders::_1, - std::placeholders::_2); - branch_and_bound_settings.heuristic_preemption_callback = std::bind( - &branch_and_bound_solution_helper_t::preempt_heuristic_solver, &solution_helper); - - branch_and_bound_settings.set_simplex_solution_callback = - std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution, - &solution_helper, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3); - - branch_and_bound_settings.node_processed_callback = - std::bind(&branch_and_bound_solution_helper_t::node_processed_callback, - &solution_helper, - std::placeholders::_1, - std::placeholders::_2); - - // Create the branch and bound object - branch_and_bound = std::make_unique>( - branch_and_bound_problem, branch_and_bound_settings); - context.branch_and_bound_ptr = branch_and_bound.get(); - branch_and_bound->set_concurrent_lp_root_solve(true); - - // Set the primal heuristics -> branch and bound callback - context.problem_ptr->branch_and_bound_callback = - std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, - branch_and_bound.get(), - std::placeholders::_1); - context.problem_ptr->set_root_relaxation_solution_callback = - std::bind(&dual_simplex::branch_and_bound_t::set_root_relaxation_solution, - branch_and_bound.get(), - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3, - std::placeholders::_4, - std::placeholders::_5, - std::placeholders::_6); - - // Fork a thread for branch and bound - // std::async and std::future allow us to get the return value of bb::solve() - // without having to manually manage the thread - // std::future.get() performs a join() operation to wait until the return status is available - branch_and_bound_status_future = std::async(std::launch::async, - &dual_simplex::branch_and_bound_t::solve, - branch_and_bound.get(), - std::ref(branch_and_bound_solution)); +#pragma omp parallel num_threads(num_threads) + { +#pragma omp master + { + if (!context.settings.heuristics_only) { + // Convert the presolved problem to dual_simplex::user_problem_t + op_problem_.get_host_user_problem(branch_and_bound_problem); + // Resize the solution now that we know the number of columns/variables + branch_and_bound_solution.resize(branch_and_bound_problem.num_cols); + + // Fill in the settings for branch and bound + branch_and_bound_settings.time_limit = timer_.remaining_time(); + branch_and_bound_settings.num_threads = std::max(num_threads - 3, 1); + branch_and_bound_settings.print_presolve_stats = false; + branch_and_bound_settings.absolute_mip_gap_tol = + context.settings.tolerances.absolute_mip_gap; + branch_and_bound_settings.relative_mip_gap_tol = + context.settings.tolerances.relative_mip_gap; + branch_and_bound_settings.integer_tol = context.settings.tolerances.integrality_tolerance; + branch_and_bound_settings.reliability_branching_settings.enable = + solver_settings_.reliability_branching; + + dual_simplex::mip_solve_mode_t solve_mode = + branch_and_bound_settings.num_threads == 1 + ? dual_simplex::mip_solve_mode_t::BNB_SINGLE_THREADED + : dual_simplex::mip_solve_mode_t::BNB_PARALLEL; + + // Set the branch and bound -> primal heuristics callback + branch_and_bound_settings.solution_callback = + std::bind(&branch_and_bound_solution_helper_t::solution_callback, + &solution_helper, + std::placeholders::_1, + std::placeholders::_2); + branch_and_bound_settings.heuristic_preemption_callback = + std::bind(&branch_and_bound_solution_helper_t::preempt_heuristic_solver, + &solution_helper); + + branch_and_bound_settings.set_simplex_solution_callback = + std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution, + &solution_helper, + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3); + + branch_and_bound_settings.node_processed_callback = + std::bind(&branch_and_bound_solution_helper_t::node_processed_callback, + &solution_helper, + std::placeholders::_1, + std::placeholders::_2); + + // Create the branch and bound object + branch_and_bound = std::make_unique>( + branch_and_bound_problem, branch_and_bound_settings); + context.branch_and_bound_ptr = branch_and_bound.get(); + branch_and_bound->set_concurrent_lp_root_solve(true); + + // Set the primal heuristics -> branch and bound callback + context.problem_ptr->branch_and_bound_callback = + std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, + branch_and_bound.get(), + std::placeholders::_1); + context.problem_ptr->set_root_relaxation_solution_callback = + std::bind(&dual_simplex::branch_and_bound_t::set_root_relaxation_solution, + branch_and_bound.get(), + std::placeholders::_1, + std::placeholders::_2, + std::placeholders::_3, + std::placeholders::_4, + std::placeholders::_5, + std::placeholders::_6); + +#pragma omp task + { + bb_status = branch_and_bound->solve(branch_and_bound_solution, solve_mode); + } + } + +#pragma omp task + { + // Start the primal heuristics + sol = dm.run_solver(); + } + } } - // Start the primal heuristics - auto sol = dm.run_solver(); if (!context.settings.heuristics_only) { - // Wait for the branch and bound to finish - auto bb_status = branch_and_bound_status_future.get(); if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { context.stats.solution_bound = context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound); @@ -257,7 +266,6 @@ solution_t mip_solver_t::run_solver() return sol; } context.problem_ptr->post_process_solution(sol); - dm.rins.stop_rins(); return sol; } diff --git a/cpp/src/mip/utilities/cpu_worker_thread.cuh b/cpp/src/mip/utilities/cpu_worker_thread.cuh deleted file mode 100644 index 60bd5685b..000000000 --- a/cpp/src/mip/utilities/cpu_worker_thread.cuh +++ /dev/null @@ -1,141 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights - * reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace cuopt::linear_programming::detail { - -template -class cpu_worker_thread_base_t { - public: - cpu_worker_thread_base_t(); - ~cpu_worker_thread_base_t(); - - void start_cpu_solver(); - bool wait_for_cpu_solver(); - - // Derived classes MUST call this in their destructor before the base destructor runs. - // This ensures on_terminate() is called while the derived object is still fully alive. - void request_termination(); - - // Internal method for thread management - safe to call during destruction - void join_worker(); - void cpu_worker_thread(); - - std::thread cpu_worker; - std::mutex cpu_mutex; - std::condition_variable cpu_cv; - std::atomic should_stop{false}; - std::atomic cpu_thread_should_start{false}; - std::atomic cpu_thread_done{true}; - std::atomic cpu_thread_terminate{false}; -}; - -template -cpu_worker_thread_base_t::cpu_worker_thread_base_t() -{ - cpu_worker = std::thread(&cpu_worker_thread_base_t::cpu_worker_thread, this); -} - -template -cpu_worker_thread_base_t::~cpu_worker_thread_base_t() -{ - // Note: We don't call on_terminate() here since the derived object is already destroyed. - join_worker(); -} - -template -void cpu_worker_thread_base_t::cpu_worker_thread() -{ - while (!cpu_thread_terminate) { - { - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; }); - - if (cpu_thread_terminate) break; - - cpu_thread_done = false; - cpu_thread_should_start = false; - } - - static_cast(this)->run_worker(); - - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = true; - } - cpu_cv.notify_all(); - } -} - -template -void cpu_worker_thread_base_t::request_termination() -{ - bool should_terminate = false; - { - std::lock_guard lock(cpu_mutex); - if (cpu_thread_terminate) return; - cpu_thread_terminate = true; - should_terminate = true; - static_cast(this)->on_terminate(); - } - - if (should_terminate) { - cpu_cv.notify_one(); - join_worker(); - } -} - -template -void cpu_worker_thread_base_t::join_worker() -{ - { - std::lock_guard lock(cpu_mutex); - if (!cpu_thread_terminate) { cpu_thread_terminate = true; } - } - cpu_cv.notify_one(); - - if (cpu_worker.joinable()) { cpu_worker.join(); } -} - -template -void cpu_worker_thread_base_t::start_cpu_solver() -{ - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = false; - cpu_thread_should_start = true; - static_cast(this)->on_start(); - } - cpu_cv.notify_one(); -} - -template -bool cpu_worker_thread_base_t::wait_for_cpu_solver() -{ - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; }); - - return static_cast(this)->get_result(); -} - -} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index e1b68bf88..b79026762 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -42,7 +42,7 @@ class omp_atomic_t { return new_val; } - operator T() { return load(); } + operator T() const { return load(); } T operator+=(T inc) { return fetch_add(inc) + inc; } T operator-=(T inc) { return fetch_sub(inc) - inc; } diff --git a/cpp/src/utilities/pcg.hpp b/cpp/src/utilities/pcg.hpp new file mode 100644 index 000000000..5422fd8df --- /dev/null +++ b/cpp/src/utilities/pcg.hpp @@ -0,0 +1,158 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +// Copied from raft/PCGenerator (rng_device.cuh). We cannot use raft implementation +// on the CPU (.cpp file) since the raft header includes CUDA code. +// The original code is from https://www.pcg-random.org/. +namespace cuopt { +class PCG { + public: + static constexpr uint64_t default_seed = 0x853c49e6748fea9bULL; + static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL; + + /** + * @brief ctor. Initializes the PCG + * @param rng_state is the generator state used for initializing the generator + * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences + * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a + * unique set of random numbers. This can be achieved by initializing the generator with same + * rng_state for all the threads and diststreamt values for subsequence. + */ + PCG(const uint64_t seed = default_seed, + const uint64_t subsequence = default_stream, + uint64_t offset = 0) + { + set_seed(seed, subsequence, offset); + } + + // Set the seed, subsequence and offset of the PCG + void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0) + { + state = uint64_t(0); + stream = (subsequence << 1u) | 1u; + uint32_t discard; + next(discard); + state += seed; + next(discard); + skipahead(offset); + } + + // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown + // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf + void skipahead(uint64_t offset) + { + uint64_t G = 1; + uint64_t h = 6364136223846793005ULL; + uint64_t C = 0; + uint64_t f = stream; + while (offset) { + if (offset & 1) { + G = G * h; + C = C * h + f; + } + f = f * (h + 1); + h = h * h; + offset >>= 1; + } + state = state * G + C; + } + + /** + * @defgroup NextRand Generate the next random number + * @brief This code is derived from PCG basic code + * @{ + */ + uint32_t next_u32() + { + uint32_t ret; + uint64_t oldstate = state; + state = oldstate * 6364136223846793005ULL + stream; + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + ret = (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); + return ret; + } + + uint64_t next_u64() + { + uint64_t ret; + uint32_t a, b; + a = next_u32(); + b = next_u32(); + ret = uint64_t(a) | (uint64_t(b) << 32); + return ret; + } + + int32_t next_i32() + { + int32_t ret; + uint32_t val; + val = next_u32(); + ret = int32_t(val & 0x7fffffff); + return ret; + } + + int64_t next_i64() + { + int64_t ret; + uint64_t val; + val = next_u64(); + ret = int64_t(val & 0x7fffffffffffffff); + return ret; + } + + float next_float() { return static_cast((next_u32() >> 8) * 0x1.0p-24); } + + double next_double() { return static_cast((next_u64() >> 11) * 0x1.0p-53); } + + template + T next() + { + T val; + next(val); + return val; + } + + void next(uint32_t& ret) { ret = next_u32(); } + void next(uint64_t& ret) { ret = next_u64(); } + void next(int32_t& ret) { ret = next_i32(); } + void next(int64_t& ret) { ret = next_i64(); } + void next(float& ret) { ret = next_float(); } + void next(double& ret) { ret = next_double(); } + + /// Draws a sample from a uniform distribution. The samples are uniformly distributed over + /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward + /// some numbers in the range (scaling by floating-point). + template + T uniform(T low, T high) + { + double val = next_double(); + T range = high - low; + return low + (val * range); + } + + // Shuffles the contents of a sequence using the Fisher–Yates algorithm. + template + void shuffle(std::vector& seq) + { + if (seq.empty()) { return; } + for (size_t i = 0; i < seq.size() - 1; ++i) { + size_t j = uniform(i, seq.size()); + if (j != i) std::swap(seq[i], seq[j]); + } + } + + private: + uint64_t state; + uint64_t stream; +}; +} // namespace cuopt