From 04de628fbc86e1b9e4c2cf89953336779d1477cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 06:31:05 +0000 Subject: [PATCH 1/4] Initial plan From 4e9aca58c5e3f1182a5973f1ebd310c4b7721e1c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 06:44:04 +0000 Subject: [PATCH 2/4] Add 6 CLI binaries for statistical model training and prediction Add command-line utilities for stochastic forests, conditional random forests, and conditional inference trees (train + predict for each). All binaries use ArgumentHandler for CLI args, support CSV/TSV input with auto-detection, and follow existing repository conventions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- binaries/CMakeLists.txt | 79 ++++++- binaries/Ygor_CI_Tree_Predict.cc | 109 ++++++++++ binaries/Ygor_CI_Tree_Train.cc | 162 +++++++++++++++ binaries/Ygor_Conditional_Forest_Predict.cc | 109 ++++++++++ binaries/Ygor_Conditional_Forest_Train.cc | 217 ++++++++++++++++++++ binaries/Ygor_Stochastic_Forest_Predict.cc | 109 ++++++++++ binaries/Ygor_Stochastic_Forest_Train.cc | 193 +++++++++++++++++ 7 files changed, 977 insertions(+), 1 deletion(-) create mode 100644 binaries/Ygor_CI_Tree_Predict.cc create mode 100644 binaries/Ygor_CI_Tree_Train.cc create mode 100644 binaries/Ygor_Conditional_Forest_Predict.cc create mode 100644 binaries/Ygor_Conditional_Forest_Train.cc create mode 100644 binaries/Ygor_Stochastic_Forest_Predict.cc create mode 100644 binaries/Ygor_Stochastic_Forest_Train.cc diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index 19c8483..00a45ac 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -48,13 +48,90 @@ target_link_libraries(parse_TAR_files Threads::Threads ) +add_executable(ygor_stochastic_forest_train + Ygor_Stochastic_Forest_Train.cc +) +target_include_directories(ygor_stochastic_forest_train + 
SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_stochastic_forest_train + ygor + m + Threads::Threads +) + +add_executable(ygor_stochastic_forest_predict + Ygor_Stochastic_Forest_Predict.cc +) +target_include_directories(ygor_stochastic_forest_predict + SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_stochastic_forest_predict + ygor + m + Threads::Threads +) + +add_executable(ygor_conditional_forest_train + Ygor_Conditional_Forest_Train.cc +) +target_include_directories(ygor_conditional_forest_train + SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_conditional_forest_train + ygor + m + Threads::Threads +) + +add_executable(ygor_conditional_forest_predict + Ygor_Conditional_Forest_Predict.cc +) +target_include_directories(ygor_conditional_forest_predict + SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_conditional_forest_predict + ygor + m + Threads::Threads +) + +add_executable(ygor_ci_tree_train + Ygor_CI_Tree_Train.cc +) +target_include_directories(ygor_ci_tree_train + SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_ci_tree_train + ygor + m + Threads::Threads +) + +add_executable(ygor_ci_tree_predict + Ygor_CI_Tree_Predict.cc +) +target_include_directories(ygor_ci_tree_predict + SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(ygor_ci_tree_predict + ygor + m + Threads::Threads +) + install(TARGETS fits_replace_nans twot_pvalue regex_tester parse_TAR_files + ygor_stochastic_forest_train + ygor_stochastic_forest_predict + ygor_conditional_forest_train + ygor_conditional_forest_predict + ygor_ci_tree_train + ygor_ci_tree_predict ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) - diff --git a/binaries/Ygor_CI_Tree_Predict.cc b/binaries/Ygor_CI_Tree_Predict.cc new file mode 100644 index 0000000..35dd04f --- /dev/null +++ 
b/binaries/Ygor_CI_Tree_Predict.cc @@ -0,0 +1,109 @@ +//Ygor_CI_Tree_Predict.cc -- A command-line utility to predict using a trained conditional inference tree model. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsCITrees.h" + +int main(int argc, char **argv){ + + std::string model_file; + std::string input_file; + bool has_header = false; + + ArgumentHandler arger; + arger.description = "Predict using a trained conditional inference tree model."; + + arger.push_back(std::make_tuple(1, 'm', "model", true, "", + "Trained model file to load.", + [&](const std::string &optarg) -> void { + model_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file with feature values.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + + arger.Launch(argc, argv); + + if(model_file.empty()){ + throw std::runtime_error("A model file must be specified via -m or --model."); + } + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + + // Load the model. + Stats::ConditionalInferenceTrees model; + { + std::ifstream fm(model_file); + if(!fm.good()){ + throw std::runtime_error("Unable to open model file '" + model_file + "'."); + } + if(!model.read_from(fm)){ + throw std::runtime_error("Failed to read model from '" + model_file + "'."); + } + } + + // Read the input file. 
+ std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(vals.empty()) continue; + + const int64_t n_features = static_cast(vals.size()); + num_array x(1, n_features, 0.0); + for(int64_t c = 0; c < n_features; ++c){ + x.coeff(0, c) = vals[c]; + } + + double prediction = model.predict(x); + std::cout << prediction << std::endl; + } + + return 0; +} diff --git a/binaries/Ygor_CI_Tree_Train.cc b/binaries/Ygor_CI_Tree_Train.cc new file mode 100644 index 0000000..721898f --- /dev/null +++ b/binaries/Ygor_CI_Tree_Train.cc @@ -0,0 +1,162 @@ +//Ygor_CI_Tree_Train.cc -- A command-line utility to train a conditional inference tree model. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsCITrees.h" + +int main(int argc, char **argv){ + + std::string input_file; + std::string output_file; + bool has_header = false; + int64_t max_depth = 10; + int64_t min_samples_split = 2; + double alpha = 0.05; + int64_t n_permutations = 1000; + uint64_t random_seed = 42; + + ArgumentHandler arger; + arger.description = "Train a conditional inference tree model from tabular data (CSV/TSV)."; + + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file. 
Last column is the response variable.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'o', "output", true, "", + "Output file for the trained model.", + [&](const std::string &optarg) -> void { + output_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + arger.push_back(std::make_tuple(2, 'd', "max-depth", true, "", + "Maximum tree depth (default: 10).", + [&](const std::string &optarg) -> void { + max_depth = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 's', "min-samples-split", true, "", + "Minimum samples to split a node (default: 2).", + [&](const std::string &optarg) -> void { + min_samples_split = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'a', "alpha", true, "", + "Significance level for conditional inference tests (default: 0.05).", + [&](const std::string &optarg) -> void { + alpha = std::stod(optarg); + })); + arger.push_back(std::make_tuple(2, 'p', "n-permutations", true, "", + "Number of permutations for hypothesis tests (default: 1000).", + [&](const std::string &optarg) -> void { + n_permutations = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'r', "random-seed", true, "", + "Random seed (default: 42).", + [&](const std::string &optarg) -> void { + random_seed = std::stoull(optarg); + })); + + arger.Launch(argc, argv); + + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + if(output_file.empty()){ + throw std::runtime_error("An output file must be specified via -o or --output."); + } + + // Read the input file. 
+ std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::vector> rows; + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(!vals.empty()){ + rows.push_back(vals); + } + } + + if(rows.empty()){ + throw std::runtime_error("No data rows found in input file."); + } + + const int64_t n_rows = static_cast(rows.size()); + const int64_t n_cols = static_cast(rows.front().size()); + if(n_cols < 2){ + throw std::runtime_error("Input must have at least two columns (features + response)."); + } + const int64_t n_features = n_cols - 1; + + num_array X(n_rows, n_features, 0.0); + num_array y(n_rows, 1, 0.0); + for(int64_t r = 0; r < n_rows; ++r){ + if(static_cast(rows[r].size()) != n_cols){ + throw std::runtime_error("Row " + std::to_string(r) + " has inconsistent number of columns."); + } + for(int64_t c = 0; c < n_features; ++c){ + X.coeff(r, c) = rows[r][c]; + } + y.coeff(r, 0) = rows[r][n_features]; + } + + std::cout << "Training conditional inference tree with " << n_rows << " samples and " + << n_features << " features." << std::endl; + + // Train the model. + Stats::ConditionalInferenceTrees model(max_depth, min_samples_split, alpha, + n_permutations, random_seed); + model.fit(X, y); + + // Save the model. 
+ std::ofstream fo(output_file); + if(!fo.good()){ + throw std::runtime_error("Unable to open output file '" + output_file + "'."); + } + if(!model.write_to(fo)){ + throw std::runtime_error("Failed to write model to output file."); + } + std::cout << "Model saved to '" << output_file << "'." << std::endl; + + return 0; +} diff --git a/binaries/Ygor_Conditional_Forest_Predict.cc b/binaries/Ygor_Conditional_Forest_Predict.cc new file mode 100644 index 0000000..ff5c079 --- /dev/null +++ b/binaries/Ygor_Conditional_Forest_Predict.cc @@ -0,0 +1,109 @@ +//Ygor_Conditional_Forest_Predict.cc -- A command-line utility to predict using a trained conditional random forest model. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsConditionalForests.h" + +int main(int argc, char **argv){ + + std::string model_file; + std::string input_file; + bool has_header = false; + + ArgumentHandler arger; + arger.description = "Predict using a trained conditional random forest model."; + + arger.push_back(std::make_tuple(1, 'm', "model", true, "", + "Trained model file to load.", + [&](const std::string &optarg) -> void { + model_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file with feature values.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + + arger.Launch(argc, argv); + + if(model_file.empty()){ + throw std::runtime_error("A model file must be specified via -m or --model."); + } + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + + // Load the model. 
+ Stats::ConditionalRandomForests model; + { + std::ifstream fm(model_file); + if(!fm.good()){ + throw std::runtime_error("Unable to open model file '" + model_file + "'."); + } + if(!model.read_from(fm)){ + throw std::runtime_error("Failed to read model from '" + model_file + "'."); + } + } + + // Read the input file. + std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(vals.empty()) continue; + + const int64_t n_features = static_cast(vals.size()); + num_array x(1, n_features, 0.0); + for(int64_t c = 0; c < n_features; ++c){ + x.coeff(0, c) = vals[c]; + } + + double prediction = model.predict(x); + std::cout << prediction << std::endl; + } + + return 0; +} diff --git a/binaries/Ygor_Conditional_Forest_Train.cc b/binaries/Ygor_Conditional_Forest_Train.cc new file mode 100644 index 0000000..fe07eab --- /dev/null +++ b/binaries/Ygor_Conditional_Forest_Train.cc @@ -0,0 +1,217 @@ +//Ygor_Conditional_Forest_Train.cc -- A command-line utility to train a conditional random forest model. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsConditionalForests.h" + +int main(int argc, char **argv){ + + std::string input_file; + std::string output_file; + bool has_header = false; + int64_t n_trees = 100; + int64_t max_depth = 10; + int64_t min_samples_split = 2; + double alpha = 0.05; + int64_t n_permutations = 1000; + int64_t max_features = -1; + double subsample_fraction = 0.632; + double correlation_threshold = 0.20; + uint64_t random_seed = 42; + std::string importance_str = "none"; + + ArgumentHandler arger; + arger.description = "Train a conditional random forest model from tabular data (CSV/TSV)."; + + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file. Last column is the response variable.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'o', "output", true, "", + "Output file for the trained model.", + [&](const std::string &optarg) -> void { + output_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + arger.push_back(std::make_tuple(2, 't', "n-trees", true, "", + "Number of trees (default: 100).", + [&](const std::string &optarg) -> void { + n_trees = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'd', "max-depth", true, "", + "Maximum tree depth (default: 10).", + [&](const std::string &optarg) -> void { + max_depth = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 's', "min-samples-split", true, "", + "Minimum samples to split a node (default: 2).", + [&](const std::string &optarg) -> void { + min_samples_split = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'a', "alpha", true, "", + "Significance level for 
conditional inference tests (default: 0.05).", + [&](const std::string &optarg) -> void { + alpha = std::stod(optarg); + })); + arger.push_back(std::make_tuple(2, 'p', "n-permutations", true, "", + "Number of permutations for hypothesis tests (default: 1000).", + [&](const std::string &optarg) -> void { + n_permutations = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'f', "max-features", true, "", + "Maximum features per split; -1 for all (default: -1).", + [&](const std::string &optarg) -> void { + max_features = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'S', "subsample-fraction", true, "", + "Fraction of samples to draw per tree (default: 0.632).", + [&](const std::string &optarg) -> void { + subsample_fraction = std::stod(optarg); + })); + arger.push_back(std::make_tuple(2, 'c', "correlation-threshold", true, "", + "Correlation threshold for conditional importance (default: 0.20).", + [&](const std::string &optarg) -> void { + correlation_threshold = std::stod(optarg); + })); + arger.push_back(std::make_tuple(2, 'r', "random-seed", true, "", + "Random seed (default: 42).", + [&](const std::string &optarg) -> void { + random_seed = std::stoull(optarg); + })); + arger.push_back(std::make_tuple(2, 'I', "importance", true, "", + "Feature importance method: none, permutation, or conditional (default: none).", + [&](const std::string &optarg) -> void { + importance_str = optarg; + })); + + arger.Launch(argc, argv); + + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + if(output_file.empty()){ + throw std::runtime_error("An output file must be specified via -o or --output."); + } + + // Parse the importance method. 
+ Stats::ConditionalImportanceMethod importance_method = Stats::ConditionalImportanceMethod::none; + if(importance_str == "none"){ + importance_method = Stats::ConditionalImportanceMethod::none; + }else if(importance_str == "permutation"){ + importance_method = Stats::ConditionalImportanceMethod::permutation; + }else if(importance_str == "conditional"){ + importance_method = Stats::ConditionalImportanceMethod::conditional; + }else{ + throw std::runtime_error("Unknown importance method '" + importance_str + "'. Use none, permutation, or conditional."); + } + + // Read the input file. + std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::vector> rows; + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(!vals.empty()){ + rows.push_back(vals); + } + } + + if(rows.empty()){ + throw std::runtime_error("No data rows found in input file."); + } + + const int64_t n_rows = static_cast(rows.size()); + const int64_t n_cols = static_cast(rows.front().size()); + if(n_cols < 2){ + throw std::runtime_error("Input must have at least two columns (features + response)."); + } + const int64_t n_features = n_cols - 1; + + num_array X(n_rows, n_features, 0.0); + num_array y(n_rows, 1, 0.0); + for(int64_t r = 0; r < n_rows; ++r){ + if(static_cast(rows[r].size()) != n_cols){ + throw std::runtime_error("Row " + std::to_string(r) + " has inconsistent number of columns."); + } + for(int64_t c = 0; c 
< n_features; ++c){ + X.coeff(r, c) = rows[r][c]; + } + y.coeff(r, 0) = rows[r][n_features]; + } + + std::cout << "Training conditional random forest with " << n_rows << " samples and " + << n_features << " features." << std::endl; + + // Train the model. + Stats::ConditionalRandomForests model(n_trees, max_depth, min_samples_split, + alpha, n_permutations, max_features, + subsample_fraction, correlation_threshold, + random_seed); + model.set_importance_method(importance_method); + model.fit(X, y); + + // Compute and display feature importances. + if(importance_method != Stats::ConditionalImportanceMethod::none){ + model.compute_importance(X, y); + std::vector importances = model.get_feature_importances(); + std::cout << "Feature importances:" << std::endl; + for(int64_t c = 0; c < static_cast(importances.size()); ++c){ + std::cout << " feature " << c << ": " << importances[c] << std::endl; + } + } + + // Save the model. + std::ofstream fo(output_file); + if(!fo.good()){ + throw std::runtime_error("Unable to open output file '" + output_file + "'."); + } + if(!model.write_to(fo)){ + throw std::runtime_error("Failed to write model to output file."); + } + std::cout << "Model saved to '" << output_file << "'." << std::endl; + + return 0; +} diff --git a/binaries/Ygor_Stochastic_Forest_Predict.cc b/binaries/Ygor_Stochastic_Forest_Predict.cc new file mode 100644 index 0000000..3aac094 --- /dev/null +++ b/binaries/Ygor_Stochastic_Forest_Predict.cc @@ -0,0 +1,109 @@ +//Ygor_Stochastic_Forest_Predict.cc -- A command-line utility to predict using a trained stochastic forest model. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsStochasticForests.h" + +int main(int argc, char **argv){ + + std::string model_file; + std::string input_file; + bool has_header = false; + + ArgumentHandler arger; + arger.description = "Predict using a trained stochastic forest model."; + + arger.push_back(std::make_tuple(1, 'm', "model", true, "", + "Trained model file to load.", + [&](const std::string &optarg) -> void { + model_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file with feature values.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + + arger.Launch(argc, argv); + + if(model_file.empty()){ + throw std::runtime_error("A model file must be specified via -m or --model."); + } + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + + // Load the model. + Stats::StochasticForests model; + { + std::ifstream fm(model_file); + if(!fm.good()){ + throw std::runtime_error("Unable to open model file '" + model_file + "'."); + } + if(!model.read_from(fm)){ + throw std::runtime_error("Failed to read model from '" + model_file + "'."); + } + } + + // Read the input file. 
+ std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(vals.empty()) continue; + + const int64_t n_features = static_cast(vals.size()); + num_array x(1, n_features, 0.0); + for(int64_t c = 0; c < n_features; ++c){ + x.coeff(0, c) = vals[c]; + } + + double prediction = model.predict(x); + std::cout << prediction << std::endl; + } + + return 0; +} diff --git a/binaries/Ygor_Stochastic_Forest_Train.cc b/binaries/Ygor_Stochastic_Forest_Train.cc new file mode 100644 index 0000000..0c04822 --- /dev/null +++ b/binaries/Ygor_Stochastic_Forest_Train.cc @@ -0,0 +1,193 @@ +//Ygor_Stochastic_Forest_Train.cc -- A command-line utility to train a stochastic forest model. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "YgorArguments.h" +#include "YgorMath.h" +#include "YgorStatsStochasticForests.h" + +int main(int argc, char **argv){ + + std::string input_file; + std::string output_file; + bool has_header = false; + int64_t n_trees = 100; + int64_t max_depth = 10; + int64_t min_samples_split = 2; + int64_t max_features = -1; + uint64_t random_seed = 42; + std::string importance_str = "none"; + + ArgumentHandler arger; + arger.description = "Train a stochastic forest model from tabular data (CSV/TSV)."; + + arger.push_back(std::make_tuple(1, 'i', "input", true, "", + "Input CSV/TSV file. Last column is the response variable.", + [&](const std::string &optarg) -> void { + input_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'o', "output", true, "", + "Output file for the trained model.", + [&](const std::string &optarg) -> void { + output_file = optarg; + })); + arger.push_back(std::make_tuple(1, 'H', "header", false, "", + "Indicate that the first row is a header (will be skipped).", + [&](const std::string &) -> void { + has_header = true; + })); + arger.push_back(std::make_tuple(2, 't', "n-trees", true, "", + "Number of trees (default: 100).", + [&](const std::string &optarg) -> void { + n_trees = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'd', "max-depth", true, "", + "Maximum tree depth (default: 10).", + [&](const std::string &optarg) -> void { + max_depth = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 's', "min-samples-split", true, "", + "Minimum samples to split a node (default: 2).", + [&](const std::string &optarg) -> void { + min_samples_split = std::stoll(optarg); + })); + arger.push_back(std::make_tuple(2, 'f', "max-features", true, "", + "Maximum features per split; -1 for all (default: -1).", + [&](const std::string &optarg) -> void { + max_features = std::stoll(optarg); + })); + 
arger.push_back(std::make_tuple(2, 'r', "random-seed", true, "", + "Random seed (default: 42).", + [&](const std::string &optarg) -> void { + random_seed = std::stoull(optarg); + })); + arger.push_back(std::make_tuple(2, 'I', "importance", true, "", + "Feature importance method: none, gini, or permutation (default: none).", + [&](const std::string &optarg) -> void { + importance_str = optarg; + })); + + arger.Launch(argc, argv); + + if(input_file.empty()){ + throw std::runtime_error("An input file must be specified via -i or --input."); + } + if(output_file.empty()){ + throw std::runtime_error("An output file must be specified via -o or --output."); + } + + // Parse the importance method. + Stats::ImportanceMethod importance_method = Stats::ImportanceMethod::none; + if(importance_str == "none"){ + importance_method = Stats::ImportanceMethod::none; + }else if(importance_str == "gini"){ + importance_method = Stats::ImportanceMethod::gini; + }else if(importance_str == "permutation"){ + importance_method = Stats::ImportanceMethod::permutation; + }else{ + throw std::runtime_error("Unknown importance method '" + importance_str + "'. Use none, gini, or permutation."); + } + + // Read the input file. + std::ifstream fi(input_file); + if(!fi.good()){ + throw std::runtime_error("Unable to open input file '" + input_file + "'."); + } + + std::vector> rows; + std::string line; + bool first_data_line = true; + char delimiter = ','; + + while(std::getline(fi, line)){ + if(line.empty()) continue; + if(has_header && first_data_line){ + has_header = false; + // Auto-detect delimiter from header line. 
+ if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + continue; + } + if(first_data_line){ + if(line.find('\t') != std::string::npos){ + delimiter = '\t'; + } + first_data_line = false; + } + + std::vector vals; + std::stringstream ss(line); + std::string token; + while(std::getline(ss, token, delimiter)){ + vals.push_back(std::stod(token)); + } + if(!vals.empty()){ + rows.push_back(vals); + } + } + + if(rows.empty()){ + throw std::runtime_error("No data rows found in input file."); + } + + const int64_t n_rows = static_cast(rows.size()); + const int64_t n_cols = static_cast(rows.front().size()); + if(n_cols < 2){ + throw std::runtime_error("Input must have at least two columns (features + response)."); + } + const int64_t n_features = n_cols - 1; + + num_array X(n_rows, n_features, 0.0); + num_array y(n_rows, 1, 0.0); + for(int64_t r = 0; r < n_rows; ++r){ + if(static_cast(rows[r].size()) != n_cols){ + throw std::runtime_error("Row " + std::to_string(r) + " has inconsistent number of columns."); + } + for(int64_t c = 0; c < n_features; ++c){ + X.coeff(r, c) = rows[r][c]; + } + y.coeff(r, 0) = rows[r][n_features]; + } + + std::cout << "Training stochastic forest with " << n_rows << " samples and " + << n_features << " features." << std::endl; + + // Train the model. + Stats::StochasticForests model(n_trees, max_depth, min_samples_split, max_features, random_seed); + model.set_importance_method(importance_method); + model.fit(X, y); + + // Compute and display feature importances. + if(importance_method == Stats::ImportanceMethod::permutation){ + model.compute_permutation_importance(X, y); + } + if(importance_method != Stats::ImportanceMethod::none){ + std::vector importances = model.get_feature_importances(); + std::cout << "Feature importances:" << std::endl; + for(int64_t c = 0; c < static_cast(importances.size()); ++c){ + std::cout << " feature " << c << ": " << importances[c] << std::endl; + } + } + + // Save the model. 
+    std::ofstream fo(output_file);
+    if(!fo.good()){
+        throw std::runtime_error("Unable to open output file '" + output_file + "'.");
+    }
+    if(!model.write_to(fo)){
+        throw std::runtime_error("Failed to write model to output file.");
+    }
+    std::cout << "Model saved to '" << output_file << "'." << std::endl;
+
+    return 0;
+}

From 9ad386d58478544bc6e6c0f75e8c3ebe52f51a3e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 25 Mar 2026 06:49:33 +0000
Subject: [PATCH 3/4] Add CLI binaries for statistical models and serialization for CITrees

- Add write_to/read_from serialization to ConditionalInferenceTrees
- Create ygor_stochastic_forest_train and ygor_stochastic_forest_predict
- Create ygor_conditional_forest_train and ygor_conditional_forest_predict
- Create ygor_ci_tree_train and ygor_ci_tree_predict
- Update binaries/CMakeLists.txt to build and install all new binaries

Co-authored-by: hdclark <934858+hdclark@users.noreply.github.com>
Agent-Logs-Url: https://github.com/hdclark/Ygor/sessions/a5269fa4-9646-45da-9023-4ba7802d958c
---
 src/YgorStatsCITrees.cc | 195 ++++++++++++++++++++++++++++++++++++++++
 src/YgorStatsCITrees.h  |  30 ++++++++
 2 files changed, 225 insertions(+)

diff --git a/src/YgorStatsCITrees.cc b/src/YgorStatsCITrees.cc
index c13f7a0..500b41a 100644
--- a/src/YgorStatsCITrees.cc
+++ b/src/YgorStatsCITrees.cc
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -565,3 +566,197 @@ int64_t Stats::ConditionalInferenceTrees::get_n_permutations() const {
     template int64_t Stats::ConditionalInferenceTrees::get_n_permutations() const;
     template int64_t Stats::ConditionalInferenceTrees::get_n_permutations() const;
 #endif
+
+
+// Writes the subtree rooted at 'node' in pre-order, one node per line: a leaf as "L <value>", an internal
+// node as "I <split_feature> <split_threshold>" followed by its left and then its right subtree.
+// Returns false if 'node' is null or the stream is in a fail state afterwards.
+template
+bool Stats::ConditionalInferenceTrees::write_tree_node(std::ostream &os, const TreeNode *node) const {
+    if(node == nullptr){
+        return false;
+    }
+    if(node->is_leaf){
+        os << "L " << node->value << "\n";
+    }else{
+        os << "I " << node->split_feature << " " << node->split_threshold << "\n";
+        if(!write_tree_node(os, node->left.get())) return false;
+        if(!write_tree_node(os, node->right.get())) return false;
+    }
+    return (!os.fail());
+}
+#ifndef YGOR_STATS_CI_TREES_DISABLE_ALL_SPECIALIZATIONS
+    template bool Stats::ConditionalInferenceTrees::write_tree_node(std::ostream &, const TreeNode *) const;
+    template bool Stats::ConditionalInferenceTrees::write_tree_node(std::ostream &, const TreeNode *) const;
+#endif
+
+
+// Reads one subtree in the pre-order format emitted by write_tree_node(). Returns nullptr on stream failure,
+// on an unrecognised node tag, on a missing child, or when a numeric token does not parse in full.
+// NOTE: recursion depth follows the depth of the encoded tree and is not bounded here.
+template
+std::unique_ptr::TreeNode>
+Stats::ConditionalInferenceTrees::read_tree_node(std::istream &is) {
+    std::string node_type;
+    is >> node_type;
+    if(is.fail()) return nullptr;
+
+    // True only if the whole token parses as a number; std::stold on its own accepts a numeric prefix such as
+    // "1.5x". Conversion failures surface as exceptions, which the handlers below turn into nullptr.
+    const auto is_whole_number = [](const std::string &token){
+        std::size_t used = 0;
+        static_cast<void>(std::stold(token, &used));
+        return (used == token.size());
+    };
+
+    auto node = std::make_unique();
+    try{
+        if(node_type == "L"){
+            node->is_leaf = true;
+            std::string val_str;
+            is >> val_str;
+            if(is.fail()) return nullptr;
+            if(!is_whole_number(val_str)) return nullptr;
+            node->value = static_cast(std::stold(val_str));
+        }else if(node_type == "I"){
+            node->is_leaf = false;
+            is >> node->split_feature;
+            std::string thresh_str;
+            is >> thresh_str;
+            if(is.fail()) return nullptr;
+            if(!is_whole_number(thresh_str)) return nullptr;
+            node->split_threshold = static_cast(std::stold(thresh_str));
+            node->left = read_tree_node(is);
+            node->right = read_tree_node(is);
+            if(!node->left || !node->right) return nullptr;
+        }else{
+            return nullptr;
+        }
+    }catch(const std::invalid_argument &){
+        return nullptr;
+    }catch(const std::out_of_range &){
+        return nullptr;
+    }
+    return node;
+}
+#ifndef YGOR_STATS_CI_TREES_DISABLE_ALL_SPECIALIZATIONS
+    template std::unique_ptr::TreeNode>
+        Stats::ConditionalInferenceTrees::read_tree_node(std::istream &);
+    template std::unique_ptr::TreeNode>
+        Stats::ConditionalInferenceTrees::read_tree_node(std::istream &);
+#endif
+
+
+// Serializes the parameters and the tree as text, using max_digits10 precision for floating point output.
+// The caller's stream precision is restored on return. Returns false for a null root or a failed stream.
+template
+bool Stats::ConditionalInferenceTrees::write_to(std::ostream &os) const {
+    // A null root cannot be serialized (write_tree_node rejects it); stop before emitting a partial header.
+    if(this->root == nullptr) return false;
+
+    const auto original_precision = os.precision();
+    os.precision( std::numeric_limits::max_digits10 );
+
+    // RAII guard to restore stream precision on all exit paths.
+    struct precision_guard {
+        std::ostream &s;
+        std::streamsize p;
+        ~precision_guard(){ s.precision(p); }
+    } guard{os, original_precision};
+
+    os << "ConditionalInferenceTrees_v1" << "\n";
+    os << "max_depth " << this->max_depth << "\n";
+    os << "min_samples_split " << this->min_samples_split << "\n";
+    os << "alpha " << this->alpha << "\n";
+    os << "n_permutations " << this->n_permutations << "\n";
+    os << "n_features_trained " << this->n_features_trained << "\n";
+    os << "random_seed " << this->random_seed << "\n";
+
+    // Write tree.
+    os << "begin_tree\n";
+    if(!write_tree_node(os, this->root.get())) return false;
+    os << "end_tree\n";
+
+    os.flush();
+    return (!os.fail());
+}
+#ifndef YGOR_STATS_CI_TREES_DISABLE_ALL_SPECIALIZATIONS
+    template bool Stats::ConditionalInferenceTrees::write_to(std::ostream &) const;
+    template bool Stats::ConditionalInferenceTrees::write_to(std::ostream &) const;
+#endif
+
+
+// Restores a model from the text format produced by write_to(). All fields are parsed into temporaries and
+// committed only once the closing "end_tree" marker has been read, so a malformed or truncated stream
+// leaves the current parameters and tree untouched.
+template
+bool Stats::ConditionalInferenceTrees::read_from(std::istream &is) {
+    try{
+        std::string label;
+
+        // Read and validate header.
+        is >> label;
+        if(is.fail() || label != "ConditionalInferenceTrees_v1") return false;
+
+        // Read parameters into temporaries typed after the members they will replace.
+        decltype(this->max_depth) in_max_depth{};
+        is >> label >> in_max_depth;
+        if(is.fail() || label != "max_depth") return false;
+
+        decltype(this->min_samples_split) in_min_samples_split{};
+        is >> label >> in_min_samples_split;
+        if(is.fail() || label != "min_samples_split") return false;
+
+        decltype(this->alpha) in_alpha{};
+        {
+            std::string val_str;
+            is >> label >> val_str;
+            if(is.fail() || label != "alpha") return false;
+            // Require the whole token to be numeric; std::stold would otherwise accept e.g. "0.05x".
+            std::size_t used = 0;
+            const long double parsed = std::stold(val_str, &used);
+            if(used != val_str.size()) return false;
+            in_alpha = static_cast<decltype(in_alpha)>(parsed);
+        }
+
+        decltype(this->n_permutations) in_n_permutations{};
+        is >> label >> in_n_permutations;
+        if(is.fail() || label != "n_permutations") return false;
+
+        decltype(this->n_features_trained) in_n_features_trained{};
+        is >> label >> in_n_features_trained;
+        if(is.fail() || label != "n_features_trained") return false;
+
+        decltype(this->random_seed) in_random_seed{};
+        is >> label >> in_random_seed;
+        if(is.fail() || label != "random_seed") return false;
+
+        // Read tree.
+        is >> label;
+        if(is.fail() || label != "begin_tree") return false;
+
+        auto in_root = read_tree_node(is);
+        if(!in_root) return false;
+
+        is >> label;
+        if(is.fail() || label != "end_tree") return false;
+
+        // Commit. Only plain assignments remain, so the model is replaced as a whole or not at all.
+        this->max_depth = in_max_depth;
+        this->min_samples_split = in_min_samples_split;
+        this->alpha = in_alpha;
+        this->n_permutations = in_n_permutations;
+        this->n_features_trained = in_n_features_trained;
+        this->random_seed = in_random_seed;
+        this->root = std::move(in_root);
+        return true;
+    }catch(const std::invalid_argument &){
+        return false;
+    }catch(const std::out_of_range &){
+        return false;
+    }
+}
+#ifndef YGOR_STATS_CI_TREES_DISABLE_ALL_SPECIALIZATIONS
+    template bool Stats::ConditionalInferenceTrees::read_from(std::istream &);
+    template bool Stats::ConditionalInferenceTrees::read_from(std::istream &);
+#endif
diff --git a/src/YgorStatsCITrees.h b/src/YgorStatsCITrees.h
index 9dc3269..3c3d273 100644
--- a/src/YgorStatsCITrees.h
+++ b/src/YgorStatsCITrees.h
@@ -13,6 +13,7 @@
 #define YGOR_STATS_CI_TREES_HDR_GRD_H
 
 #include
+#include
 #include
 #include
 #include
@@ -109,6 +110,10 @@ class ConditionalInferenceTrees {
         // Predict using the tree from a given node.
         T predict_tree(const TreeNode *node, const num_array &x) const;
 
+        // Serialization helpers.
+        bool write_tree_node(std::ostream &os, const TreeNode *node) const;
+        std::unique_ptr read_tree_node(std::istream &is);
+
     public:
         // Constructor.
         //
@@ -160,6 +165,31 @@ class ConditionalInferenceTrees {
 
         // Get the number of permutations.
         int64_t get_n_permutations() const;
+
+        // Write the model to a text stream.
+        //
+        // Serializes all data members, parameters, and tree structure to a human-readable
+        // text format. The model can be restored exactly using read_from() without any loss
+        // in function or accuracy. Floating point values are written with maximum precision.
+        //
+        // Parameters:
+        //   os: Output stream to write to.
+        //
+        // Returns:
+        //   true on success, false if the stream enters a fail state.
+        bool write_to(std::ostream &os) const;
+
+        // Read a model from a text stream.
+        //
+        // Restores a model previously written by write_to(). All parameters and tree
+        // structure are restored exactly.
+        //
+        // Parameters:
+        //   is: Input stream to read from.
+ // + // Returns: + // true on success, false if the stream format is invalid or enters a fail state. + bool read_from(std::istream &is); }; } //namespace Stats. From c57bb9132ff10f78f1a3f5d17b85cf5cb74f27d2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 06:51:17 +0000 Subject: [PATCH 4/4] Refactor CSV/TSV header parsing to avoid confusing mutation of has_header flag Co-authored-by: hdclark <934858+hdclark@users.noreply.github.com> Agent-Logs-Url: https://github.com/hdclark/Ygor/sessions/a5269fa4-9646-45da-9023-4ba7802d958c --- binaries/Ygor_CI_Tree_Predict.cc | 11 ++++------- binaries/Ygor_CI_Tree_Train.cc | 11 ++++------- binaries/Ygor_Conditional_Forest_Predict.cc | 11 ++++------- binaries/Ygor_Conditional_Forest_Train.cc | 11 ++++------- binaries/Ygor_Stochastic_Forest_Predict.cc | 11 ++++------- binaries/Ygor_Stochastic_Forest_Train.cc | 13 +++++-------- 6 files changed, 25 insertions(+), 43 deletions(-) diff --git a/binaries/Ygor_CI_Tree_Predict.cc b/binaries/Ygor_CI_Tree_Predict.cc index 35dd04f..264591e 100644 --- a/binaries/Ygor_CI_Tree_Predict.cc +++ b/binaries/Ygor_CI_Tree_Predict.cc @@ -73,17 +73,14 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - if(line.find('\t') != std::string::npos){ - delimiter = '\t'; - } - continue; - } if(first_data_line){ if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; } diff --git a/binaries/Ygor_CI_Tree_Train.cc b/binaries/Ygor_CI_Tree_Train.cc index 721898f..1bf35ab 100644 --- a/binaries/Ygor_CI_Tree_Train.cc +++ b/binaries/Ygor_CI_Tree_Train.cc @@ -92,17 +92,14 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - if(line.find('\t') != 
std::string::npos){ - delimiter = '\t'; - } - continue; - } if(first_data_line){ if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; } diff --git a/binaries/Ygor_Conditional_Forest_Predict.cc b/binaries/Ygor_Conditional_Forest_Predict.cc index ff5c079..e232f27 100644 --- a/binaries/Ygor_Conditional_Forest_Predict.cc +++ b/binaries/Ygor_Conditional_Forest_Predict.cc @@ -73,17 +73,14 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - if(line.find('\t') != std::string::npos){ - delimiter = '\t'; - } - continue; - } if(first_data_line){ if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; } diff --git a/binaries/Ygor_Conditional_Forest_Train.cc b/binaries/Ygor_Conditional_Forest_Train.cc index fe07eab..633bae1 100644 --- a/binaries/Ygor_Conditional_Forest_Train.cc +++ b/binaries/Ygor_Conditional_Forest_Train.cc @@ -134,17 +134,14 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - if(line.find('\t') != std::string::npos){ - delimiter = '\t'; - } - continue; - } if(first_data_line){ if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; } diff --git a/binaries/Ygor_Stochastic_Forest_Predict.cc b/binaries/Ygor_Stochastic_Forest_Predict.cc index 3aac094..48659a3 100644 --- a/binaries/Ygor_Stochastic_Forest_Predict.cc +++ b/binaries/Ygor_Stochastic_Forest_Predict.cc @@ -73,17 +73,14 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - if(line.find('\t') != std::string::npos){ - delimiter = 
'\t'; - } - continue; - } if(first_data_line){ if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; } diff --git a/binaries/Ygor_Stochastic_Forest_Train.cc b/binaries/Ygor_Stochastic_Forest_Train.cc index 0c04822..a84bb2a 100644 --- a/binaries/Ygor_Stochastic_Forest_Train.cc +++ b/binaries/Ygor_Stochastic_Forest_Train.cc @@ -110,18 +110,15 @@ int main(int argc, char **argv){ while(std::getline(fi, line)){ if(line.empty()) continue; - if(has_header && first_data_line){ - has_header = false; - // Auto-detect delimiter from header line. - if(line.find('\t') != std::string::npos){ - delimiter = '\t'; - } - continue; - } if(first_data_line){ + // Auto-detect delimiter from first non-empty line. if(line.find('\t') != std::string::npos){ delimiter = '\t'; } + if(has_header){ + first_data_line = false; + continue; + } first_data_line = false; }