Customer_Churn_Prediction/train_model.py at main · ANAGHAKTP/Customer_Churn_Prediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import joblib

def train():
    print("Loading data...")
    try:
        df = pd.read_excel('Telco_customer_churn.xlsx')
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Preprocessing
    # 'Total Charges' is object type, coerce to numeric
    df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

    # Target
    target = 'Churn Value'

    # Features
    # We will exclude IDs, location details (too many categories), and churn-related output columns
    drop_cols = ['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Lat Long',
                 'Latitude', 'Longitude', 'Churn Label', 'Churn Value', 'Churn Score', 'CLTV', 'Churn Reason']

    # Identify numerical and categorical columns
    feature_cols = [c for c in df.columns if c not in drop_cols]

    print(f"Features selected: {feature_cols}")

    X = df[feature_cols]
    y = df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define transformers
    numeric_features = ['Tenure Months', 'Monthly Charges', 'Total Charges']
    categorical_features = [c for c in feature_cols if c not in numeric_features]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Model - Using Logistic Regression for smaller model size
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', LogisticRegression(random_state=42, max_iter=1000))])

    print("Training model...")
    clf.fit(X_train, y_train)

    print("Model score: %.3f" % clf.score(X_test, y_test))

    print("Saving model...")
    joblib.dump(clf, 'model.pkl')

    schema = {
        'numeric': numeric_features,
        'categorical': categorical_features
    }
    joblib.dump(schema, 'schema.pkl')

    print("Done.")

if __name__ == '__main__':
    train()