---
# gpredomics v0.9.0 — Complete parameter reference
# All fields have sensible defaults. Only override what you need.
# Unless specifically indicated, all fields accept a single value.
general:
  seed: 42  # Random seed for reproducibility. Same seed = same results.
  algo: ga  # Algorithm: ga, beam, aco, sa, ils, lasso, mcmc
  cv: false  # Should cross-validation be enabled?
  thread_number: 8  # The number of threads used in feature selection and fit computation.
  gpu: false  # Should Gpredomics use the GPU? (GA and beam only)
  language: bin,ratio,pow2,ter  # Possible values are ter, bin, ratio, pow2. See README.md for details. A comma-separated list (no spaces) is accepted; the initial population will be split among the listed languages.
  data_type: raw,prev,log  # Possible values: raw, prev, log, zscore. See docs/individual.md. Comma-separated list accepted. zscore standardizes features using training mean/std (propagated to test).
  epsilon: 1.0e-5  # Used for data_type 'prev' (threshold) and 'log' (floor for values below threshold). Not used for 'zscore'. (Written with a mantissa dot so YAML 1.1 parsers also resolve it as a float.)
  fit: auc  # Possible values are auc, specificity, sensitivity, mcc, f1_score, g_mean (classification). See README.md for details.
  k_penalty: 0.0001  # This penalty is derived from the fit function and multiplied by k (the number of variables used in the model).
  fr_penalty: 0.0  # Used only when fit is specificity or sensitivity. Subtract (1 - symmetrical_metrics) × fr_penalty from the fit during threshold optimization.
  bias_penalty: 0.00  # Penalize model fit with specificity/sensitivity < 0.5: fit - (1.0 - bad_metrics) * bias_penalty.
  threshold_ci_n_bootstrap: 0  # Number of bootstrap samples to compute the threshold confidence interval (0 = disable CI).
  threshold_ci_penalty: 0.5  # If a threshold confidence interval exists (see above), penalize evolution according to rejection_rate * penalty.
  threshold_ci_alpha: 0.05  # If a threshold confidence interval exists (see above), alpha to construct the CI around the threshold (allows models to reject a sample).
  threshold_ci_frac_bootstrap: 1.0  # Should the bootstrap be based on random draw with replacement (on the whole dataset, frac=1, Efron method) or a random draw without replacement on a random subset (0<frac<1, Politis & Romano method)?
  user_feature_penalties_weight: 1.0  # Weight of user feature penalties defined in feature_annotations, if specified.
  #log_base:  # Uncomment to print log and results in log_base.txt
  log_level: info  # Possible values are trace, debug, info, warning, or error.
  n_model_to_display: 30  # Number of models to display in the last generation (default 10, 0 means all models).
  display_colorful: true  # Should the terminal results be colored to make them easier to read?
  keep_trace: true  # Should every metric be kept in memory?
  #save_exp: exp.mp  # Uncomment to save experiment in timestamp-save_exp, which can be reloaded with --load timestamp-save_exp. Extension should be .json, .mp/.msgpack or .bin
data:
  X: "samples/Qin2014/Xtrain.tsv"  # The features of the training dataset.
  y: "samples/Qin2014/Ytrain.tsv"  # The class labels of the training dataset (0 = class 0, 1 = class 1 (positive), 2 = unknown).
  Xtest: "samples/Qin2014/Xtest.tsv"  # The features of the test dataset.
  ytest: "samples/Qin2014/Ytest.tsv"  # The class labels of the test dataset.
  holdout_ratio: 0.20  # If >0 and <1, this percentage of samples from X/y will be held out as test data (except Xtest/ytest if provided).
  feature_annotations:  # Optional (empty = none): path to a TSV file containing feature annotations (see data.md for details).
  sample_annotations:  # Optional (empty = none): path to a TSV file containing sample annotations (see data.md for details).
  features_in_rows: true  # Are the features arranged in rows (legacy Predomics) or columns (standard in ML)?
  inverse_classes: false  # If true, the negative class becomes the objective.
  classes:  # List of class literal labels.
    - "healthy"  # Class 0 label
    - "cirrhosis"  # Class 1 label
    - "unknown"  # Class 2 label (ignored during training and testing)
  max_features_per_class: 0  # 0: all significant features; otherwise take the first X significant features sorted by p-value/log_abs_bayes_factor.
  feature_minimal_prevalence_pct: 10  # Per class: features are retained if any class reaches this prevalence (percentage).
  feature_minimal_feature_value: 1.0e-4  # Features whose mean is below this value are discarded. (Mantissa dot keeps this a float under YAML 1.1 parsers too.)
  feature_selection_method: wilcoxon  # Possible values are wilcoxon, student_t (Student's t-test), and bayesian_fisher. Wilcoxon is recommended in most cases.
  feature_maximal_adj_pvalue: 1  # BH-FDR alpha. Features with adjusted p-value above this are removed. Adaptive: if < 10 features pass, alpha relaxes (0.1→0.2→0.5) with warnings. Set to 1.0 to keep all.
  feature_minimal_log_abs_bayes_factor: 2  # Features with a lower log absolute Bayes factor will be removed (Bayesian method only).
cv:
  inner_folds: 5  # Number of folds used to penalize overfitting if overfit_penalty > 0.
  overfit_penalty: 0  # This penalty is derived from the fit function (fit -= mean(fit on k-1 - abs(delta with last fold) * overfit_penalty)).
  resampling_inner_folds_epochs: 0  # To avoid learning about inner folds, resplit them every x epochs.
  outer_folds: 5  # Number of folds used for outer cross-validation (run the algorithm on each set of k-1 folds then merge Families of Best Models).
  fit_on_valid: true  # If true, FBM is based on validation fold fit (favoring generalization); otherwise on k-1 folds. DO NOT enable without external validation data.
  cv_best_models_ci_alpha: 0.05  # Alpha for the Family of Best Models confidence interval based on the best fit on validation fold. Smaller alpha -> larger best_model range.
  cv_fbm_ci_method: wilson  # CI method for CV FBM selection: wald, wald_continuity (blaise), wilson, agresti_coull, clopper_pearson.
  stratify_by:  # Optional (empty = none): if sample annotations are provided, stratify folds according to classes AND to this annotation (must exist in sample_annotations file).
# NOTE(review): reconstructed as a top-level section from a whitespace-stripped paste — confirm it is not meant to nest under `cv:`.
importance:
  compute_importance: false  # Should importance be computed?
  n_permutations_mda: 100  # Number of permutations per feature for MDA importance.
  scaled_importance: true  # Scale importance by feature prevalence inside folds.
  importance_aggregation: mean  # Aggregation method for importances: "mean" or "median".
voting:
  vote: false  # Should voting be activated?
  fbm_ci_alpha: 0.05  # Alpha for the Family of Best Models confidence interval (>0 and <1). Smaller alpha -> larger best_model range.
  fbm_ci_method: wilson  # CI method for FBM selection: wald, wald_continuity (blaise), wilson, agresti_coull, clopper_pearson.
  prune_before_voting: false  # Should models be pruned before voting according to feature importances (MDA < 0)?
  min_perf: 0.50  # Required sensitivity AND specificity to be judged; >=0.5 avoids "single-choice oriented" judges.
  min_diversity: 10  # Required diversity between judges.
  method: Majority  # Majority: class 1 if votes > threshold, else class 0 (if equal, no classification). Consensus: no classification (rejection) if threshold is not reached.
  method_threshold: 0.5  # Typically 0.5 for majority (for equal distribution). If set to 0, optimize via Youden's maximum. Typically 1 for consensus (classify samples only when all experts agree).
  threshold_windows_pct: 5  # Majority only: if provided, samples with votes within threshold ± threshold_windows_pct% are not classified (e.g., 10% -> votes in [40%, 60%] are unclassified).
  complete_display: false  # If true, display complete results.
ga:
  population_size: 5000  # The target number of models per generation (NB: the real number may be lower because of clone removal).
  max_epochs: 100  # The maximum number of generations before stopping (you can stop manually as well).
  min_epochs: 1  # The minimum number of generations to run.
  max_age_best_model: 100  # Stopping before max_epochs (after min_epochs) occurs only if the best model reaches this age.
  k_min: 1  # The minimal number of variables used in a model.
  k_max: 200  # The maximum number of variables used in a model (0 removes any maximum).
  select_elite_pct: 2  # Percentage of best models from previous generation retained: lower values are more elitist.
  select_niche_pct: 20  # (optional, default 0) Percentage of best models retained but split per language/data type (helps maintain competition between languages/data types).
  select_random_pct: 10  # Percentage of opportunistic models retained from the previous generation; split among all languages/data_types present.
  mutated_children_pct: 80  # Percentage of children submitted to mutation.
  mutated_features_pct: 20  # Percentage of mutation per "gene" (potential variable). Note: most mutations are nonsensical (e.g., removing a variable).
  mutation_non_null_chance_pct: 20  # Percentage of "meaningful" mutations (likelihood that a mutation may add a new variable).
  forced_diversity_pct: 0  # If >0%, the population is filtered every forced_diversity_epochs according to this value. A feature is considered different if it is present with a given sign in individual A but not in B. Linear models (Bin, Ter, Pow2) are grouped and filtered together, while ratio models are compared separately.
  forced_diversity_epochs: 10  # If forced_diversity_pct > 0%, the epoch gap between two diversity filters.
  random_sampling_pct: 0  # If >0%, each generation is fitted on only a percentage of random samples to reduce overfitting.
  random_sampling_epochs: 1  # If random_sampling_pct > 0, number of epochs during which the same randomized dataset is kept.
beam:
  method: LimitedExhaustive  # LimitedExhaustive: generate all combinations (k out of features_to_keep). ParallelForward: extend each extendable model by one feature chosen from features_to_keep.
  k_start: 1  # Number of variables used in the initial population.
  k_stop: 100  # Maximum number of variables to consider in a single model (variable count limit for beam algorithm).
  best_models_criterion: 10  # If ≤1: alpha for FBM confidence interval based on best fit (smaller = larger range). If >1: percentage of best models to keep (e.g., 5 = top 5%, R-like approach with frequency-based feature selection).
  fbm_ci_method: wilson  # CI method for beam FBM selection: wald, wald_continuity (blaise), wilson, agresti_coull, clopper_pearson.
  max_nb_of_models: 20000  # Limits the number of features_to_keep at each epoch according to the number of models made possible by them (truncated according to significance).
mcmc:
  method: gibbs  # "gibbs" (joint variable selection, default) or "sbs" (sequential backward selection).
  n_iter: 1000  # Number of MCMC iterations. Gibbs: more = better posterior. SBS: per elimination step.
  n_burn: 500  # Burn-in period: first n_burn iterations discarded (chain hasn't converged yet).
  lambda: 0.001  # L2 regularization on beta coefficients (prior strength). Higher = stronger shrinkage.
  nmin: 10  # SBS only: minimum features to keep after elimination. 0 = disable SBS.
  p0: 0.1  # Gibbs only: prior inclusion probability per feature. Lower = sparser models. E[k] ≈ p0 × n_features.
  n_chains: 1  # Number of parallel MCMC chains. >1 requires large n_iter for each chain to converge.
  #save_trace_outdir:  # Uncomment to export MCMC trace to this directory.
# ── Ant Colony Optimization ────────────────────────────────────────
aco:
  n_ants: 100  # Number of ants per iteration. Each constructs one model. More ants = broader search.
  max_iterations: 200  # Maximum iterations before stopping.
  min_iterations: 10  # Minimum iterations before early stopping can trigger.
  alpha: 1.0  # Pheromone importance in probability formula: P ∝ τ^α × η^β.
  beta: 2.0  # Heuristic (feature significance) importance: higher = more greedy toward significant features.
  rho: 0.1  # Evaporation rate [0,1]. Higher = faster forgetting, more exploration.
  tau_min: 0.01  # Minimum pheromone (MMAS). Prevents convergence lock.
  tau_max: 1.0  # Maximum pheromone (MMAS).
  elite_weight: 2.0  # Extra pheromone deposit for the global best ant.
  k_min: 1  # Minimum features per model.
  k_max: 200  # Maximum features per model.
  max_age_best_model: 10  # Early stop if best model hasn't improved for this many iterations.
# ── Simulated Annealing ───────────────────────────────────────────
sa:
  initial_temperature: 1.0  # Starting temperature. Higher = more random exploration initially.
  cooling_rate: 0.995  # Temperature multiplier per iteration: T(n+1) = T(n) × cooling_rate. Slower (0.999) = more thorough.
  min_temperature: 0.001  # Stop when temperature drops below this (effectively greedy).
  max_iterations: 10000  # Maximum iterations.
  snapshot_interval: 100  # Save population snapshot every N iterations (for generation_tracking).
  k_min: 1  # Minimum features per model.
  k_max: 200  # Maximum features per model.
# ── Iterated Local Search ─────────────────────────────────────────
ils:
  max_iterations: 100  # Number of perturb-then-local-search cycles.
  perturbation_size: 3  # Number of random feature changes per perturbation (larger = bigger jumps).
  local_search_steps: 50  # Max single-feature moves per local search phase.
  max_no_improve: 20  # Stop if no improvement for this many iterations.
  snapshot_interval: 100  # Save snapshot every N iterations.
  k_min: 1  # Minimum features per model.
  k_max: 200  # Maximum features per model.
# ── LASSO / Elastic Net ───────────────────────────────────────────
lasso:
  alpha_min: 0.001  # Minimum regularization (many features). The path goes from alpha_max to alpha_min.
  alpha_max: 1.0  # Maximum regularization (few/no features).
  n_alphas: 100  # Number of alpha values along the regularization path.
  max_iter: 1000  # Max coordinate descent iterations per alpha.
  tolerance: 0.0001  # Convergence tolerance for coordinate descent.
  l1_ratio: 1.0  # 1.0 = pure LASSO (L1), 0.5 = Elastic Net (L1+L2), 0.0 = pure Ridge (L2).
gpu:
  fallback_to_cpu: true  # Execute the code on the CPU (integrated graphics) if there is no GPU available (recommended).
  memory_policy: Strict  # [Strict: panic if limits are not available | Adaptive: adapt if limits are not available | Performance: use all available GPU memory regardless of limits]
  max_total_memory_mb: 256  # Limit in MB defining the maximum amount of GPU memory used by all buffers.
  max_buffer_size_mb: 128  # Limit in MB defining the maximum amount of GPU memory used by a single buffer.