ForgeOne/train_gridsearch.py at main · neuralforgeone/ForgeOne · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
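Exhaustive hyperparameter grid search for the logistic-regression baseline.

Trains one model per combination in an extensive parameter grid, scores
each with cross-validation, and selects the best one. The full run is
expected to take about 10 hours.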
"""

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from algo_trader.common.config import settings
from algo_trader.common.logger import setup_logger, logger
from algo_trader.models.sklearn_baseline import SklearnBaseline
from algo_trader.research.dataset import build_ml_table, load_features_labels
from algo_trader.research.registry import save_run
from algo_trader.research.splits import make_walk_forward_splits
from algo_trader.research.train import train_single_split

# Configure logging at INFO level with JSON output disabled.
setup_logger(level="INFO", json=False)

# Start-up banner, emitted in one call; the trailing empty string yields the
# blank separator line that follows the banner.
RULE = "=" * 60
print(
    RULE,
    "🔍 10-HOUR HYPERPARAMETER GRID SEARCH",
    RULE,
    f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    RULE,
    "",
    sep="\n",
)

# Load features and labels for the "1h" timeframe and assemble the modelling
# table for the "direction_4" target. `meta` is returned alongside X and y but
# is not referenced in the visible part of this script.
features, labels = load_features_labels(timeframe="1h")
X, y, meta = build_ml_table(features, labels, target="direction_4")

n_samples = len(X)
n_features = len(X.columns)
print(f"Dataset: {n_samples:,} samples, {n_features} features", "", sep="\n")

# Search space for the grid search. Keys use the ``<step>__<param>`` form so
# each entry is routed to the pipeline step named "classifier".
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 50.0, 100.0],
    'classifier__class_weight': ['balanced', None],
    'classifier__max_iter': [5000, 10000, 20000, 50000],
    'classifier__solver': ['lbfgs', 'saga'],
}

# Derive the search size from the grid itself so the reported totals cannot
# go stale when a value list above is edited.
n_combinations = 1
for candidates in param_grid.values():
    n_combinations *= len(candidates)

# Fits per combination; keep in sync with the ``cv`` setting passed to
# GridSearchCV further down in this script.
cv_folds = 5
n_fits = n_combinations * cv_folds

print("Parameter grid:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")
print()
print(f"Total combinations: {n_combinations} models")
print(f"With {cv_folds}-fold CV: {n_fits} fits")
print()

# Two-step pipeline: standardise the features, then fit logistic regression.
# Keeping the scaler inside the pipeline means it is re-fitted on the training
# portion of every CV split rather than on the full dataset.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42)),
])

# Forward-chaining splits: every validation fold comes strictly after the rows
# it was trained on. A plain integer ``cv=5`` would use unshuffled stratified
# K-fold, which trains on rows that follow the validation fold and therefore
# leaks future information on time-ordered data.
# NOTE(review): assumes the rows of X are in chronological order — confirm
# against build_ml_table. If the label looks several bars ahead, consider
# TimeSeriesSplit's ``gap`` argument to keep overlapping labels out of training.
cv_splitter = TimeSeriesSplit(n_splits=5)

# Exhaustive search over param_grid, ranked by ROC AUC.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='roc_auc',
    n_jobs=-1,  # use all CPU cores
    verbose=3,  # per-fit progress output
    return_train_score=True,
)

print(
    "🚀 Starting grid search...",
    "This will take approximately 10 hours...",
    "",
    sep="\n",
)

# Time the whole search with wall-clock timestamps taken around fit().
start_time = datetime.now()

# The search is fitted on the underlying arrays (``.values``) of X and y.
grid_search.fit(X.values, y.values)

end_time = datetime.now()
elapsed = end_time - start_time
duration = elapsed.total_seconds() / 3600  # seconds -> hours

# Summary: duration, best mean CV score, and the winning parameter values.
divider = "=" * 60
print(
    "",
    divider,
    "✅ GRID SEARCH COMPLETED!",
    divider,
    f"Duration: {duration:.2f} hours",
    f"Best score (AUC): {grid_search.best_score_:.4f}",
    "Best parameters:",
    *(
        f"  {name}: {setting}"
        for name, setting in grid_search.best_params_.items()
    ),
    "",
    sep="\n",
)

# Persist the complete cross-validation results table as CSV.
results_df = pd.DataFrame(grid_search.cv_results_)
results_path = settings.artifacts_dir / "gridsearch_results.csv"
# Make sure the target directory exists: to_csv does not create it, and a
# failure here would throw away the results of the whole multi-hour search.
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)
print(f"Results saved to: {results_path}")

# Rebuild the project's baseline model with the winning hyperparameters, fit
# it on the full dataset, and register it together with the search metadata.
# SklearnBaseline and save_run are imported at the top of the file so that an
# import problem surfaces before the multi-hour search instead of after it.
best_params = grid_search.best_params_
model = SklearnBaseline(
    balanced=(best_params['classifier__class_weight'] == 'balanced'),
    C=best_params['classifier__C'],
    max_iter=best_params['classifier__max_iter'],
    random_state=42,
)
# NOTE(review): best_params['classifier__solver'] is not forwarded, so the
# saved model may use a different solver than the configuration that achieved
# best_cv_auc — confirm whether SklearnBaseline accepts a solver argument.
model.fit(X, y)

model_path = save_run(
    model=model,
    calibrator=None,
    metrics={'best_cv_auc': grid_search.best_score_},
    config={
        'timeframe': '1h',
        'target': 'direction_4',
        'search_type': 'gridsearch',
        'best_params': best_params,
    },
    feature_names=list(X.columns),
)

print(f"Best model saved to: {model_path}")
print("=" * 60)