model_training.py
import pandas as pd
import joblib
import optuna
import shap
import time
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
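# Load the calorie-expenditure dataset and engineer features: BMI from
# height/weight, a duration x heart-rate interaction, and a binary sex encoding.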
print("Loading data (100k rows)...")
df = pd.read_csv('assets/train.csv')
df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
df['Intensity_Factor'] = df['Duration'] * df['Heart_Rate']
df['Sex_encoded'] = df['Sex'].map({'male': 1, 'female': 0})
features = ['Sex_encoded', 'Age', 'Height', 'Weight', 'BMI', 'Duration', 'Heart_Rate', 'Body_Temp', 'Intensity_Factor']
X = df[features]
y = df['Calories']
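# Hold out 20% for evaluation; tune on a 50k-row subsample so Optuna trials stay fast.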
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_shard = X_train.sample(50000, random_state=42)
y_shard = y_train.loc[X_shard.index]
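# Optuna objective: 2-fold CV mean absolute error of XGBoost on the tuning
# subsample (cross_val_score returns negated MAE, so flip the sign to minimize).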
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.1),
        'tree_method': 'hist',
        'device': 'cpu'
    }
    model = XGBRegressor(**param, random_state=42)
    return -cross_val_score(model, X_shard, y_shard, cv=2, scoring='neg_mean_absolute_error').mean()
print("Tuning XGBoost...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
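# Rebuild XGBoost with the tuned hyperparameters (study.best_params holds only the
# suggested parameters, not the fixed tree_method/device) and average its predictions
# with LightGBM and CatBoost via an unweighted VotingRegressor.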
print("Training Final Ensemble of XGBoost, CatBoost and LightGBM on full rows (This will take time)...")
best_xgb = XGBRegressor(**study.best_params, random_state=42)
lgbm = LGBMRegressor(n_estimators=300, learning_rate=0.07, n_jobs=-1, random_state=42)
cat = CatBoostRegressor(n_estimators=300, learning_rate=0.07, verbose=0, random_state=42)
ensemble_model = VotingRegressor(estimators=[('xgb', best_xgb), ('lgbm', lgbm), ('cat', cat)])
start = time.time()
ensemble_model.fit(X_train, y_train)
print(f"Training Complete in {time.time()-start:.2f}s")
best_xgb.fit(X_train, y_train)
explainer = shap.TreeExplainer(best_xgb)
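# Persist the ensemble, the SHAP explainer, and the feature order for inference.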
joblib.dump(ensemble_model, 'calories_model.pkl')
joblib.dump(explainer, 'shap_explainer.pkl')
joblib.dump(features, 'feature_names.pkl')
print("All models saved.")