| 1 | +import pandas as pd |
| 2 | +import xgboost as xgb |
| 3 | +from sklearn.model_selection import StratifiedKFold, GridSearchCV |
| 4 | +from sklearn.metrics import f1_score, make_scorer |
| 5 | +from sklearn.impute import SimpleImputer |
| 6 | +from imblearn.over_sampling import SMOTE |
| 7 | +from imblearn.pipeline import Pipeline |
| 8 | + |
# Seed shared by SMOTE, the CV splitter and the model for reproducibility
seed = 1234

# Load the training dataset
train_data = pd.read_csv("train.csv")

# Drop the row identifier -- it carries no predictive signal
train_data = train_data.drop(columns=['ID'])

# Inspect the class balance; the dataset is heavily imbalanced (~20:1):
#   0    6420
#   1     316
# which motivates the SMOTE oversampling and F1 scoring used below.
# (The original computed value_counts() but discarded the result.)
print(train_data["Target"].value_counts())

# Impute missing values with the column mean, on the FEATURE columns only.
# The original imputed the whole frame, which silently cast the integer
# Target labels to float and would have filled the target itself.
feature_cols = train_data.columns.drop("Target")
imputer = SimpleImputer(strategy='mean')
train_data[feature_cols] = imputer.fit_transform(train_data[feature_cols])

# Separate features and target
X = train_data.drop(columns=["Target"])
y = train_data["Target"]
| 35 | + |
# Define the XGBoost model.
# random_state=seed makes the stochastic subsample/colsample_bytree draws
# reproducible -- the original set a global seed but never passed it here.
# use_label_encoder was deprecated and later removed in xgboost, so it is
# no longer passed.
model = xgb.XGBClassifier(eval_metric="logloss", random_state=seed)

# Hyperparameter grid; keys are prefixed with the pipeline step name 'model'
param_grid = {
    'model__subsample': [0.6, 0.8],
    'model__colsample_bytree': [0.6, 0.8],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2]
}

# F1 on the minority (positive) class -- accuracy is meaningless at ~20:1
f1_scorer = make_scorer(f1_score, pos_label=1)

# imblearn's Pipeline applies SMOTE only to the training folds inside CV,
# so the validation folds are scored on real (non-synthetic) samples
pipeline = Pipeline([
    ('smote', SMOTE(random_state=seed)),
    ('model', model)
])

# 5-fold stratified CV preserves the class ratio in every fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=skf,
    verbose=1,
    n_jobs=-1
)
| 67 | + |
# Run the grid search. With the default refit=True, GridSearchCV re-fits
# the best pipeline on the FULL training set after cross-validation.
grid_search.fit(X, y)

# Report the winning configuration and its cross-validated F1
print("Best parameters: {}".format(grid_search.best_params_))
print("Best F1 score: {}".format(grid_search.best_score_))

score = 100 * grid_search.best_score_
# The CV score is the reported solution, since test.csv has no Target column
print('Solution: {}'.format(score))

# best_estimator_ is already fitted on the full training set (refit=True),
# so the original extra best_model.fit(X, y) -- which re-ran SMOTE and
# retrained the model a second time -- was redundant and has been removed.
best_model = grid_search.best_estimator_
| 85 | + |
# Load the held-out test set
test_data = pd.read_csv("test.csv")

# Keep the IDs for the submission file, then drop them from the features
test_ids = test_data['ID']
test_data = test_data.drop(columns=['ID'])

# Align test columns with the training features and impute missing values
# the same way the training data was imputed -- the original predicted on
# raw test data, so any NaNs there were never filled. Mean-imputation
# preserves column means, so fitting on the already-imputed X reproduces
# the training means.
test_features = test_data[X.columns]
test_imputer = SimpleImputer(strategy='mean').fit(X)
test_features = pd.DataFrame(
    test_imputer.transform(test_features),
    columns=X.columns,
    index=test_features.index,
)

# Predict, casting to int so the submission holds 0/1 rather than 0.0/1.0
predictions = best_model.predict(test_features).astype(int)

# Build and save the submission file
output = pd.DataFrame({
    'ID': test_ids,
    'Target': predictions
})
output.to_csv("predictions.csv", index=False)

print("Predictions saved to predictions.csv")
| 106 | + |