based_api/based_trainer.py at master · jameshgrn/based_api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from data_format import generate_data

# Raise Optuna's log threshold to WARNING so lower-severity messages are
# suppressed while the study runs.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Predictor columns selected from the feature frame (created by
# add_log_features) and handed to the model.
FEATURES = [
    "log_Q",
    "log_w",
    "log_S",
]

# Output locations: the saved booster and the validation scatter plot.
MODEL_PATH = "based_model.ubj"
VALIDATION_IMG = "img/BASED_validation.png"

# PyPI xgboost wheels don't ship the Metal backend; hist+cpu uses all cores natively
DEVICE = "cpu"


def add_log_features(df):
    """Return a copy of ``df`` with base-10 logarithm columns added.

    The input frame is not modified. Four columns are written on the copy:
    ``log_Q`` (from ``discharge``), ``log_w`` (from ``width``),
    ``log_S`` (from ``slope``) and ``log_h`` (from ``depth``).

    Args:
        df: DataFrame containing ``discharge``, ``width``, ``slope`` and
            ``depth`` columns.

    Returns:
        A new DataFrame with the four log10 columns appended (or overwritten
        if they already exist).
    """
    # Maps each derived column to the raw column it is computed from;
    # insertion order fixes the order in which the new columns are appended.
    log_sources = {
        'log_Q': 'discharge',
        'log_w': 'width',
        'log_S': 'slope',
        'log_h': 'depth',
    }

    enriched = df.copy()
    for log_name, raw_name in log_sources.items():
        enriched[log_name] = np.log10(enriched[raw_name])
    return enriched


def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error as a fraction.

    Each element contributes ``2 * |pred - true| / (|pred| + |true|)``, which
    lies in ``[0, 2]``; the result is the mean over all elements.

    A pair where both the true and predicted value are exactly zero has a
    zero denominator. Such a pair is a perfect prediction, so it contributes
    0 instead of the NaN that a raw 0/0 division would produce (a single NaN
    would otherwise turn the whole mean into NaN). This matters here because
    the targets are log10 depths, which are exactly 0 at a depth of 1.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        The mean SMAPE term as a NumPy float.
    """
    # Coerce to float arrays so lists/tuples work and the comparison is
    # positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_pred) + np.abs(y_true)

    # Divide only where the denominator is non-zero; elsewhere keep the
    # pre-filled 0 (both values are zero there, i.e. no error).
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms)


def optuna_objective(trial, X_train, y_train):
    """Score one Optuna trial by 5-fold cross-validated SMAPE.

    Hyperparameters are drawn from ``trial``; for every fold a booster is
    trained on the fitting part (at most 1500 rounds, early stopping after
    30 rounds evaluated on the held-out part) and its held-out predictions
    are scored with ``smape``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Feature DataFrame (indexed positionally via ``.iloc``).
        y_train: Target Series aligned with ``X_train``.

    Returns:
        Mean SMAPE across the five folds (lower is better).
    """
    # Settings that are the same for every trial.
    fixed_settings = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'device': DEVICE,
    }
    # Search space; the suggest calls run in this order.
    sampled_settings = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 0.85),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.85),
    }
    params = {**fixed_settings, **sampled_settings}

    def fold_smape(fit_idx, holdout_idx):
        # Train on the fitting rows and score on the held-out rows of one fold.
        fit_matrix = xgb.DMatrix(
            X_train.iloc[fit_idx], label=y_train.iloc[fit_idx]
        )
        holdout_matrix = xgb.DMatrix(
            X_train.iloc[holdout_idx], label=y_train.iloc[holdout_idx]
        )
        booster = xgb.train(
            params,
            fit_matrix,
            num_boost_round=1500,
            evals=[(holdout_matrix, 'val')],
            early_stopping_rounds=30,
            verbose_eval=False,
        )
        observed = y_train.iloc[holdout_idx].values
        return smape(observed, booster.predict(holdout_matrix))

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = [
        fold_smape(fit_idx, holdout_idx)
        for fit_idx, holdout_idx in splitter.split(X_train)
    ]
    return np.mean(fold_scores)


def _load_dataset():
    """Load the cleaned input CSV, drop Trampush rows and add log features."""
    # NOTE(review): generate_data() presumably (re)writes the CSV read below;
    # confirm against data_format.
    generate_data()

    df = pd.read_csv('data/based_input_data_clean.csv')
    # Exclude every row whose source mentions "Trampush" (case-insensitive);
    # rows with a missing source are kept.
    df = df[~df['source'].str.contains('Trampush', case=False, na=False)]
    return add_log_features(df)


def _tune_hyperparameters(X_train, y_train):
    """Run the Optuna search and return the full XGBoost parameter dict."""
    # Seed the sampler so the search is reproducible, like the other random
    # components of this script (all seeded with 42). TPESampler is the
    # sampler create_study uses by default, so only the seeding changes.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        lambda trial: optuna_objective(trial, X_train, y_train),
        n_trials=100,
        show_progress_bar=True,
    )
    return {
        **study.best_params,
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'device': DEVICE,
    }


def _fit_final_model(best_params, dtrain):
    """Pick the round count by CV, train on all of ``dtrain`` and save."""
    cv_results = xgb.cv(
        best_params, dtrain,
        num_boost_round=2000,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=30,
        seed=42,
    )
    # Row index is zero-based, so the round count is the best index plus one.
    n_best = int(cv_results['test-mae-mean'].idxmin()) + 1
    print(f"Optimal rounds: {n_best}")

    final_model = xgb.train(best_params, dtrain, num_boost_round=n_best)
    final_model.save_model(MODEL_PATH)
    print(f"Saved model → {MODEL_PATH}")
    return final_model


def _report_metrics(true, pred):
    """Print MAE, RMSE, R2 and MAPE for values in original (non-log) space."""
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)
    mape = np.mean(np.abs((true - pred) / true)) * 100

    print("\nTest set metrics (original space):")
    print(f"  MAE:  {mae:.3f} m")
    print(f"  RMSE: {rmse:.3f} m")
    print(f"  R2:   {r2:.4f}")
    print(f"  MAPE: {mape:.1f}%")


def _save_validation_plot(true, pred):
    """Write the measured-vs-predicted log-log scatter to VALIDATION_IMG."""
    from pathlib import Path

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(true, pred, color='#FFCCBC', edgecolor='k', s=60, alpha=0.7)
    # Pad the 1:1 line slightly beyond the data range on both ends.
    lim = [min(true.min(), pred.min()) * 0.8, max(true.max(), pred.max()) * 1.2]
    ax.plot(lim, lim, 'k--', lw=2, label='1:1')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Measured Channel Depth (m)')
    ax.set_ylabel('Predicted Channel Depth (m)')
    ax.set_title(f'BASED Validation | n = {len(true)}')
    ax.legend()
    plt.tight_layout()

    # Create the output directory first: savefig raises FileNotFoundError if
    # it is missing, which would otherwise surface only after the long
    # hyperparameter search has already finished.
    Path(VALIDATION_IMG).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(VALIDATION_IMG, dpi=250)
    # Release the figure so it does not linger in pyplot's figure registry.
    plt.close(fig)
    print(f"Saved validation plot → {VALIDATION_IMG}")


def main():
    """Train, save and evaluate the depth model end to end.

    Steps: load the data, split 80/20 stratified by depth quantile, search
    hyperparameters with Optuna, choose the number of boosting rounds by
    cross-validation, train and save the final booster, then print test-set
    metrics and save a validation plot, both in original (non-log) space.
    """
    df = _load_dataset()

    X, y = df[FEATURES], df['log_h']

    # Stratify split by depth quantile so all depth ranges are represented
    depth_bins = pd.qcut(df['depth'], q=5, labels=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=depth_bins
    )
    print(f"Device: {DEVICE}  |  Train: {len(X_train)}  Test: {len(X_test)}")

    # Hyperparameter search
    best_params = _tune_hyperparameters(X_train, y_train)
    print(f"Best params: {best_params}")

    # Find optimal number of rounds via CV, then train on the full training set
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    final_model = _fit_final_model(best_params, dtrain)

    # Evaluate in original (non-log) space: undo the log10 transform
    pred = 10 ** final_model.predict(dtest)
    true = 10 ** y_test.values

    _report_metrics(true, pred)
    _save_validation_plot(true, pred)


# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()