-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_test_data.py
More file actions
63 lines (51 loc) · 2.07 KB
/
create_test_data.py
File metadata and controls
63 lines (51 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
# Ordered list of model input columns (kept in step with the notebook).
# The order matters: it fixes the column order of the feature matrix and of
# the saved feature-name file. Every entry must match a dataset column name
# exactly, so spellings such as 'forcast' are intentionally left unchanged.
INPUT_FEATURES = [
    'sst_media_weekly',
    'tavg',
    'tmin',
    'tmax',
    'prcp',
    'wdir',
    'wspd',
    'pres',
    # Calendar parts; also used to build a date when the data has none.
    'year',
    'month',
    'day',
    # lag_* columns
    'lag_1',
    'lag_2',
    'lag_3',
    'lag_4',
    'lag_5',
    'forcast',
    # soil_* columns and their lagged variants
    'soil_temperature_0_to_7cm_mean',
    'soil_temperature_7_to_28cm_mean',
    'soil_moisture_0_to_7cm_mean',
    'soil_moisture_7_to_28cm_mean',
    'soil_temp_7_28_lag1',
    'soil_temp_0_7_lag1',
    'soil_temp_0_7_lag2',
    'soil_moisture_0_to_7cm_lag1',
    'soil_moisture_7_to_28cm_mean_lag1',
    # sst_anomaly / soil_pressure_interaction columns and their lags
    'sst_anomaly',
    'soil_pressure_interaction',
    'sst_anomaly_lag1',
    'soil_pressure_interaction_lag1',
    'soil_pressure_interaction_lag2',
    'temperature_8am',
]
# Read the source dataset (path is relative to the working directory).
data = pd.read_csv('Data/Datasets/LA_dataset.csv')

# Discard the "Unnamed: 0" column when the CSV has one, then remove every row
# that contains a missing value in any column.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.dropna(inplace=True)

# Work on a copy so the cleaned `data` frame itself is left unmodified.
df = data.copy()

# Build a 'date' column from the year/month/day parts unless the dataset
# already provides one (an existing column is used as-is, without parsing).
if 'date' not in df.columns:
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# Feature matrix (columns in INPUT_FEATURES order) and target vector.
X = df[INPUT_FEATURES].values
y = df['target'].values

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the split below,
# so the held-out rows also contribute to the fitted statistics. Confirm this
# matches how the models were trained before changing it.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Unshuffled 80/20 split: row order is preserved and the final 20% of the
# rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Because the split keeps row order, the trailing rows of `df` are exactly
# the rows that ended up in X_test.
n_test = len(X_test)
test_dates = df['date'].iloc[-n_test:].values

# Assemble the test frame: scaled features first, then date and target.
test_data = pd.DataFrame(X_test, columns=INPUT_FEATURES).assign(
    date=test_dates,
    target=y_test,
)

# Write the test set and the ordered feature names to disk.
# NOTE: the fitted scaler itself is not persisted by this script.
os.makedirs("Models/Models", exist_ok=True)
test_data.to_csv("Models/Models/test_data.csv", index=False)
feature_names = pd.DataFrame({'feature': INPUT_FEATURES})
feature_names.to_csv("Models/Models/feature_names.csv", index=False)

# Short run summary on stdout.
summary_lines = (
    "Test data saved successfully!",
    f"Number of test samples: {len(test_data)}",
    f"Date range: from {test_data['date'].min()} to {test_data['date'].max()}",
    f"Features: {', '.join(INPUT_FEATURES)}",
)
for summary_line in summary_lines:
    print(summary_line)