-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: predict_churn_data.py
More file actions
81 lines (60 loc) · 2.84 KB
/
predict_churn_data.py
File metadata and controls
81 lines (60 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
from pycaret.classification import predict_model, load_model
# Load our pre-trained logistic regression model that was saved during training.
# 'lr' is the base filename pycaret saved ('lr.pkl'); load_model resolves it.
# This model was chosen as the best performing model during our model selection phase.
# NOTE: this runs at import time and will fail fast if the model file is missing.
model = load_model('lr')
def load_data(filepath):
    """
    Read customer churn records from a CSV file.

    Args:
        filepath (str): Path to the CSV file containing customer data

    Returns:
        pd.DataFrame: Customer feature rows indexed by the customerID column
    """
    # Promote customerID to the index so downstream joins/lookups key on it.
    return pd.read_csv(filepath, index_col='customerID')
def make_predictions(df, threshold=0.75):
    """
    Makes churn predictions for customers using our trained model.

    Args:
        df (pd.DataFrame): DataFrame containing customer features
        threshold (float): Probability threshold for churn classification (default: 0.75)

    Returns:
        pd.DataFrame: Single-column DataFrame holding only 'Churn_prediction'
            labels ('0' or '1') for each customer.
    """
    # Get raw predictions and probability scores from the model
    predictions = predict_model(model, data=df)
    # Threshold the probability score and map the boolean straight to the
    # string label in one assignment. Note the mapping is intentionally
    # inverted from the usual convention ('0' = churn, '1' = no churn) to
    # match the test data's label scheme.
    # Using .map instead of Series.replace(..., inplace=True) avoids the
    # chained-assignment pattern that is deprecated (FutureWarning) in
    # pandas >= 2.2 and removed in pandas 3.
    predictions['Churn_prediction'] = (
        predictions['prediction_score'] >= threshold
    ).map({True: '0', False: '1'})
    # Keep only the prediction column and drop all other columns
    return predictions[['Churn_prediction']]
def calculate_accuracy(predicted_labels, true_labels):
    """
    Calculates the accuracy of our churn predictions against true labels.

    Args:
        predicted_labels (list): Predicted labels as '0'/'1' strings or 0/1 ints
        true_labels (list): True labels as '0'/'1' strings or 0/1 ints

    Returns:
        float: Accuracy score between 0 and 1

    Raises:
        ValueError: If either label list is empty.
    """
    # Guard against an empty input, which would otherwise divide by zero.
    if not predicted_labels or not true_labels:
        raise ValueError("Label lists must be non-empty to compute accuracy.")
    # Normalize both sides to int before comparing: the predictions arrive as
    # strings ('0'/'1') while true labels are ints, so a raw == never matched.
    # Score 1 for a match, 0 for a mismatch — the original scored it the other
    # way round, so it computed error rate (and, with the type mismatch,
    # always reported 1.0).
    correct_predictions = [1 if int(pred) == int(true) else 0
                           for pred, true in zip(predicted_labels, true_labels)]
    return sum(correct_predictions) / len(correct_predictions)
if __name__ == "__main__":
    # Score the incoming customer data with the saved model.
    customer_df = load_data(r'new_churn_data.csv')
    churn_preds = make_predictions(customer_df)
    print('predictions:')
    print(churn_preds)

    # Ground-truth labels for the five customers in the test file.
    true_values = [1, 0, 0, 1, 0]

    # Compare the predicted labels against the known answers and report.
    predicted = churn_preds['Churn_prediction'].tolist()
    accuracy = calculate_accuracy(predicted, true_values)
    print(f"Prediction Accuracy: {accuracy:.2f}")