-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
225 lines (197 loc) · 10.4 KB
/
config.example.yaml
File metadata and controls
225 lines (197 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
---
# $schema-compatible
# Validated at load time by boa_forecaster.config_schema.BoaConfig.
# ──────────────────────────────────────────────────────────────────────────────
# boa-sarima-forecaster – Example configuration file
#
# Copy this file to config.yaml and fill in your values.
# config.yaml is listed in .gitignore so secrets are never committed.
# ──────────────────────────────────────────────────────────────────────────────

# ── Data Ingestion ─────────────────────────────────────────────────────────────
data:
  # Path to the input Excel workbook (relative or absolute).
  # See docs/methodology.md for the expected column structure.
  # Required columns: Date, CS
  # Optional columns (auto-defaulted if absent): SKU (→ 1), Country (→ "_")
  input_path: "data/input/sales.xlsx"

  # Name of the worksheet containing the time-series data.
  sheet_name: "Data"

  # Number of meta-header rows to skip before the column-name row.
  # Set to 0 if your file has no extra header rows.
  skip_rows: 2

  # strptime-compatible format string for the Date column.
  # "%Y%m" matches values like "202201" (January 2022).
  date_format: "%Y%m"

  # Extend the time axis up to this date when filling missing periods.
  end_date: "2026-01-01"

  # Pandas offset alias for the time-series sampling frequency.
  # Common values: "MS" (month start), "W" (weekly), "D" (daily), "h" (hourly).
  # When changing freq, also update model.sarima.seasonal_period to match the
  # seasonality cycle you care about — e.g. "W" → 52 (annual), "D" → 7 (weekly).
  freq: "MS"
# ── Optimisation ──────────────────────────────────────────────────────────────
optimization:
  # Inclusive integer ranges for the SARIMA(p,d,q)(P,D,Q,m) search space.
  p_range: [0, 3]  # autoregressive order
  d_range: [0, 2]  # degree of differencing
  q_range: [0, 3]  # moving-average order
  P_range: [0, 2]  # seasonal AR order
  D_range: [0, 1]  # seasonal differencing order
  Q_range: [0, 2]  # seasonal MA order

  # Total number of Optuna trials per time series.
  # Higher values improve quality but increase runtime.
  n_calls: 50

  # Number of parallel Optuna workers.
  # Use -1 to auto-detect CPU cores; use 1 for deterministic single-thread runs.
  n_jobs: 1
# ── Model ──────────────────────────────────────────────────────────────────────
model:
  sarima:
    # Fixed seasonal period (m) — NOT optimised by the Bayesian search.
    # Must match the seasonality cycle of your data:
    #   freq "MS" (monthly) → seasonal_period 12 (annual cycle)
    #   freq "W"  (weekly)  → seasonal_period 52 (annual cycle)
    #   freq "D"  (daily)   → seasonal_period 7  (weekly cycle)
    #   freq "h"  (hourly)  → seasonal_period 24 (daily cycle)
    seasonal_period: 12

    # Complexity constraints applied before fitting each trial (penalty = 1e6):
    #   (p + q) <= 4 — limits non-seasonal parameter count
    #   (P + Q) <= 3 — limits seasonal parameter count
    search_space:
      p: [0, 3]
      d: [0, 2]
      q: [0, 3]
      P: [0, 2]
      D: [0, 1]
      Q: [0, 2]
# ── Standardization ────────────────────────────────────────────────────────────
standardization:
  # Rolling window length (in time steps) used by the standardization step.
  window: 6
  sigma_threshold: 2.5  # ±Nσ clipping. Use 1.0-2.0 for clean data, 2.5-3.0 for promo-heavy series
# ── Metrics ────────────────────────────────────────────────────────────────────
metrics:
  # Weighted objective minimised by the Bayesian optimiser.
  # Each component specifies a registered metric name and its scalar weight.
  # Available metrics: smape, rmsle, mae, rmse, mape
  # Weights do not need to sum to 1 but it is recommended for interpretability.
  #
  # Default — demand forecasting profile (relative accuracy + log-scale stability):
  components:
    - metric: smape
      weight: 0.7
    - metric: rmsle
      weight: 0.3
  #
  # Alternative examples:
  #   Revenue / price forecasting (absolute scale matters):
  #     components:
  #       - metric: mae
  #         weight: 0.6
  #       - metric: rmse
  #         weight: 0.4
  #
  #   Pure percentage accuracy:
  #     components:
  #       - metric: mape
  #         weight: 1.0
# ── Forecast ──────────────────────────────────────────────────────────────────
forecast:
  # Number of months to forecast into the future.
  n_periods: 12

  # Significance level for SARIMA confidence intervals (0.05 → 95% CI).
  alpha: 0.05
# ── Output ────────────────────────────────────────────────────────────────────
output:
  # Directory where Excel result files will be written.
  output_path: "data/output/"

  # Run identifier prepended to every output filename.
  # Increment this for each production run to avoid overwriting previous results.
  run_id: "RUN-2025-01"
# ── Logging ───────────────────────────────────────────────────────────────────
logging:
  level: "INFO"  # DEBUG | INFO | WARNING | ERROR | CRITICAL
# ── v2.0 Model registry (ML models) ──────────────────────────────────────────
# Select which model the main pipeline uses and configure its search space.
# Options: "sarima", "random_forest", "xgboost", "lightgbm"
models:
  active: sarima

  sarima:
    enabled: true
    seasonal_period: 12
    # Per-parameter inclusive integer bounds for the Optuna search.
    search_space:
      p: {low: 0, high: 3}
      d: {low: 0, high: 2}
      q: {low: 0, high: 3}
      P: {low: 0, high: 2}
      D: {low: 0, high: 1}
      Q: {low: 0, high: 2}
    # Complexity caps applied to each trial before fitting.
    constraints:
      max_p_plus_q: 4
      max_P_plus_Q: 3
    # Known-good configurations enqueued before the random/Bayesian search.
    warm_starts:
      - {p: 1, d: 1, q: 1, P: 1, D: 1, Q: 1}
      - {p: 1, d: 1, q: 0, P: 0, D: 0, Q: 0}

  random_forest:
    enabled: false
    forecast_horizon: 12
    search_space:
      n_estimators: {type: int, low: 50, high: 500, log: true}
      max_depth: {type: int, low: 2, high: 20}
      min_samples_split: {type: float, low: 0.01, high: 0.3, log: true}
      min_samples_leaf: {type: int, low: 1, high: 20}
      max_features: {type: categorical, choices: ["sqrt", "log2", 0.5, 0.8, 1.0]}
    warm_starts:
      - {n_estimators: 100, max_depth: 5, min_samples_split: 0.1, min_samples_leaf: 1, max_features: sqrt}
      - {n_estimators: 200, max_depth: 10, min_samples_split: 0.05, min_samples_leaf: 3, max_features: log2}

  xgboost:
    enabled: false
    forecast_horizon: 12
    early_stopping_rounds: 20
    search_space:
      n_estimators: {type: int, low: 50, high: 1000, log: true}
      max_depth: {type: int, low: 2, high: 10}
      learning_rate: {type: float, low: 0.005, high: 0.3, log: true}
      subsample: {type: float, low: 0.5, high: 1.0}
      colsample_bytree: {type: float, low: 0.5, high: 1.0}
      min_child_weight: {type: int, low: 1, high: 20}
      reg_alpha: {type: float, low: 1.0e-8, high: 10.0, log: true}
      reg_lambda: {type: float, low: 1.0e-8, high: 10.0, log: true}
      gamma: {type: float, low: 0.0, high: 5.0}
    warm_starts:
      - {n_estimators: 100, max_depth: 6, learning_rate: 0.1, subsample: 0.8,
         colsample_bytree: 0.8, min_child_weight: 1, reg_alpha: 0.0, reg_lambda: 1.0, gamma: 0.0}
      - {n_estimators: 300, max_depth: 4, learning_rate: 0.05, subsample: 0.7,
         colsample_bytree: 0.7, min_child_weight: 5, reg_alpha: 0.1, reg_lambda: 1.0, gamma: 0.1}

  lightgbm:
    enabled: false
    forecast_horizon: 12
    early_stopping_rounds: 20
    search_space:
      n_estimators: {type: int, low: 50, high: 1000, log: true}
      num_leaves: {type: int, low: 8, high: 256, log: true}
      max_depth: {type: int, low: -1, high: 15}
      learning_rate: {type: float, low: 0.005, high: 0.3, log: true}
      subsample: {type: float, low: 0.5, high: 1.0}
      colsample_bytree: {type: float, low: 0.5, high: 1.0}
      min_child_samples: {type: int, low: 5, high: 100}
      reg_alpha: {type: float, low: 1.0e-8, high: 10.0, log: true}
      reg_lambda: {type: float, low: 1.0e-8, high: 10.0, log: true}
    warm_starts:
      - {n_estimators: 100, num_leaves: 31, max_depth: -1, learning_rate: 0.05,
         subsample: 0.8, colsample_bytree: 0.8, min_child_samples: 20,
         reg_alpha: 0.0, reg_lambda: 1.0}
      - {n_estimators: 300, num_leaves: 63, max_depth: 7, learning_rate: 0.02,
         subsample: 0.7, colsample_bytree: 0.7, min_child_samples: 10,
         reg_alpha: 0.1, reg_lambda: 1.0}
# ── Feature Engineering (ML models) ──────────────────────────────────────────
# Consumed by RandomForest, XGBoost, and LightGBM. Ignored by SARIMA.
features:
  # Autoregressive lag features — list of integer lag periods (in time steps).
  lag_periods: [1, 2, 3, 6, 12]

  # Rolling-window statistics (mean, std) — list of integer window sizes.
  rolling_windows: [3, 6, 12]

  # Add month-of-year, quarter, and day-of-week dummies.
  include_calendar: true

  # Add a linear trend feature (time index normalised to [0, 1]).
  include_trend: true

  # Add expanding-window mean/std features (computationally heavier).
  include_expanding: false