Skip to content

Commit 160f3cd

Browse files
committed
adding baires dev interview problem
1 parent ecdfb89 commit 160f3cd

File tree

6 files changed

+11243
-0
lines changed

6 files changed

+11243
-0
lines changed
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Reproducibility seed shared by SMOTE and the CV splitter.
seed = 1234

# Load the training dataset.
train_data = pd.read_csv("train.csv")

# Drop the row identifier -- it carries no predictive signal.
train_data = train_data.drop(columns=['ID'])

# Inspect class balance. The target is heavily imbalanced
# (observed ~6420 negatives vs ~316 positives), which is why SMOTE
# oversampling and F1 scoring on the minority class are used below.
print(train_data["Target"].value_counts())

# Separate features and target BEFORE imputing, so the label column is
# never mean-imputed (the original script imputed the whole frame,
# which would corrupt labels and cast them to float).
X = train_data.drop(columns=["Target"])
y = train_data["Target"].astype(int)

# Impute missing feature values with the per-column mean.
# Keep the fitted imputer so the SAME statistics are applied to test.csv.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Define the XGBoost classifier.
# (`use_label_encoder` is deprecated and removed in recent xgboost
# releases, so it is no longer passed.)
model = xgb.XGBClassifier(eval_metric="logloss")

# Hyperparameter grid; keys are prefixed with the pipeline step name 'model'.
param_grid = {
    'model__subsample': [0.6, 0.8],
    'model__colsample_bytree': [0.6, 0.8],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2]
}

# Score with F1 on the positive (minority) class.
f1_scorer = make_scorer(f1_score, pos_label=1)

# imblearn Pipeline: inside cross-validation SMOTE is applied only to the
# training folds, never to the validation folds, avoiding leakage.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=seed)),
    ('model', model)
])

# 5-fold stratified cross-validated grid search.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=skf,
    verbose=1,
    n_jobs=-1
)

# Fit the grid search on the full training data.
grid_search.fit(X, y)

# Report the best configuration and its cross-validated F1 score.
print("Best parameters: {}".format(grid_search.best_params_))
print("Best F1 score: {}".format(grid_search.best_score_))

# This is the reported solution, since test.csv has no Target column
# to score against.
score = 100 * grid_search.best_score_
print('Solution: {}'.format(score))

# GridSearchCV(refit=True, the default) has already refit best_estimator_
# on the full training set, so the extra fit() call the original script
# made was redundant and only doubled the training time.
best_model = grid_search.best_estimator_

# Load the test dataset.
test_data = pd.read_csv("test.csv")

# Save the IDs for the output file, then drop the column to match the
# training feature set.
test_ids = test_data['ID']
test_data = test_data.drop(columns=['ID'])

# Apply the SAME mean-imputation learned on the training features
# (bug fix: the original script never imputed the test set, so train and
# test saw inconsistent preprocessing).
test_features = pd.DataFrame(
    imputer.transform(test_data),
    columns=test_data.columns,
    index=test_data.index,
)

# Predict with the refit best pipeline.
predictions = best_model.predict(test_features)

# Assemble predictions keyed by the saved IDs.
output = pd.DataFrame({
    'ID': test_ids,
    'Target': predictions
})

# Save the predictions to a CSV file.
output.to_csv("predictions.csv", index=False)

print("Predictions saved to predictions.csv")

0 commit comments

Comments
 (0)