# CallDropPrediction/explore_page.py (Neelrayal/CallDropPrediction, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Pre-computed artefacts, loaded once when this module is imported:
#   df - frame used for the correlation heatmap
#   X  - feature matrix used for feature scoring / importance
#   y  - target vector used for feature scoring / importance
df, X, y = (
    pd.read_pickle(artefact)
    for artefact in ("Models/saved_df", "Models/X.pkl", "Models/y.pkl")
)


def feature_selection(k: int = 5) -> None:
    """Render the "Univariate Selection" section of the explore page.

    Scores every column of the module-level ``X`` against ``y`` with the
    chi-squared test and displays the ``k`` highest-scoring attributes.

    Args:
        k: Number of top-scoring attributes to show. Defaults to 5, the
            value that used to be hard-coded.
    """
    st.subheader("1. Univariate Selection")
    st.write("Importance of parameter based on their score. "
             "Statistical tests can be used to select those features that have the strongest relationship with the output variable.")

    # Only the per-feature scores are used, never the selector's transform,
    # so keep every feature ("all"); a numeric k larger than the number of
    # columns in X would otherwise be rejected by SelectKBest.
    selector = SelectKBest(score_func=chi2, k="all").fit(X, y)

    # One row per attribute, paired with its chi-squared score.
    feature_scores = pd.DataFrame({
        'Attribute Name': X.columns,
        'Score': selector.scores_,
    })
    st.write(feature_scores.nlargest(k, 'Score'))  # show the k best features

    # NOTE(review): this sentence is hard-coded rather than derived from the
    # scores above - confirm it still matches the data.
    st.write("Here, rating has the strongest relationship with the output variable")


def feature_importance() -> None:
    """Render the "Feature Importance" section of the explore page.

    Fits an ``ExtraTreesClassifier`` on the module-level ``X`` and ``y`` and
    draws a horizontal bar chart of the (at most) ten most important
    features, most important at the top.
    """
    st.subheader("2. Feature Importance")
    st.markdown("""
  You can get the feature importance of each feature of your dataset by using the feature importance property of the model.
  Feature importance gives you a score for each feature of your data, the higher the score more important or relevant is the feature towards your output variable.
  Feature importance is an inbuilt class that comes with Tree Based Classifiers, we will be using Extra Tree Classifier for extracting the top 10 features for the dataset.
  """)

    from sklearn.ensemble import ExtraTreesClassifier

    # Streamlit re-executes the page on every interaction; a fixed seed keeps
    # the randomized trees (and therefore the chart) stable between reruns.
    model = ExtraTreesClassifier(random_state=0)
    model.fit(X, y)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)

    # Keep the ten largest scores, as the text above promises. Ascending order
    # puts the largest bar at the top of a horizontal plotly bar chart.
    top_features = feat_importances.nlargest(10).sort_values()

    fig = px.bar(top_features, orientation='h')
    st.write(fig)
    st.write("Visual representation of how strongly is output variable dependent on different input variables")


def correlation_matrix():
    """Render the correlation-heatmap section of the explore page.

    Computes the pairwise correlations of the module-level ``df`` and draws
    them as a seaborn heatmap on a fresh matplotlib figure.
    """
    st.subheader("3.Correlation Matrix with Heatmap")
    st.write("Correlation states how the features are related to each other or the target variable.")

    # Build the figure first so the heatmap is drawn on an explicit axes.
    figure, axes = plt.subplots()
    correlations = df.corr()
    sns.heatmap(correlations, ax=axes)
    st.write(figure)


def _count_table(frame, by, label='Calls Dropped'):
    """Return one row per group in ``by`` with the group's row count in ``label``."""
    # NOTE(review): the default label says 'Calls Dropped' although the value
    # is the number of rows in each group - confirm the intended caption.
    return frame.groupby(by).size().reset_index(name=label)


def _count_matching(frame, column, value):
    """Return how many rows of ``frame`` hold ``value`` in ``column`` (0 if none)."""
    return int((frame[column] == value).sum())


def show_explore_page():
    """Render the whole explore page.

    Always shows the three feature-analysis sections. When the key ``flag``
    is present in ``st.session_state`` it additionally reads the call data
    CSV and shows call counts for the ``state``, ``operator``, ``network``
    and ``inout`` values stored in the session state, followed by
    dataset-wide summaries. Otherwise a hint asking the user to select
    inputs on the predict page is shown.
    """
    st.title("""  Explore page """)
    feature_selection()
    feature_importance()
    correlation_matrix()

    st.subheader("4. Information for selected attributes")

    # "flag" is only present once inputs were chosen on the predict page.
    if "flag" not in st.session_state:
        st.markdown("""
        #### <span style="color:#faa"> Please select inputs in predict page for further insights </span>
        """, unsafe_allow_html=True)
    else:
        # Read the data set and normalise the column names used below.
        df = pd.read_csv("MyCall_Data_September_2019_cleaning.csv")
        df = df.drop(columns=['Latitude', 'Longitude']).rename(columns={
            'In Out Travelling': 'InOut',
            'Network Type': 'NetworkType',
            'Call Drop Category': 'CallDropCategory',
            'State Name': 'StateName',
        })

        # Attributes the user selected on the predict page.
        operator = st.session_state.operator
        inout = st.session_state.inout
        networktype = st.session_state.network
        statename = st.session_state.state

        # Per-attribute call counts; a value absent from the data counts as 0
        # instead of rendering an empty-Series placeholder.
        state_calls = _count_matching(df, 'StateName', statename)
        operator_calls = _count_matching(df, 'Operator', operator)
        network_calls = _count_matching(df, 'NetworkType', networktype)

        # Calls matching all four selected attributes at once.
        combined_calls = int((
            (df['StateName'] == statename) &
            (df['Operator'] == operator) &
            (df['NetworkType'] == networktype) &
            (df['InOut'] == inout)
        ).sum())

        # The operator bullet now reports the operator count; it previously
        # repeated the state count.
        st.markdown(f"""
      - In your state <span style="color:#8ef">{statename}</span> {state_calls} calls were made
      - For your operator <span style="color:#8ef">({operator}) </span> , {operator_calls} many calls were made
      - For your network type <span style="color:#8ef">({networktype}) </span>  {network_calls} calls were done
      - In your state <span style="color:#fea">({statename}) </span>  with the operator <span style="color:#fea">({operator}) </span> ,while the connection being <span style="color:#fea">({inout}) </span> , there are a total of <span style="color:#8ef">{combined_calls} </span>  calls.
        """, unsafe_allow_html=True)

        per_category = _count_table(df, ['Operator', 'CallDropCategory'])
        st.write("Category wise calls for your operator")
        st.write(per_category[per_category['Operator'] == operator])

        st.subheader("5. Information about dataset")

        st.write("Calls made from each state")
        st.write(_count_table(df, ['StateName']))

        st.write("Calls from each operator")
        st.write(_count_table(df, ['Operator']))

        st.write("Calls from each network type")
        st.write(_count_table(df, ['NetworkType']))

        st.write("Statewise calls made")
        st.bar_chart(df.groupby(['StateName']).size())

        st.write("Average Rating by Network Type, Separated by Operator")

        # Draw on an explicit axes so the plot lands on the figure handed to
        # Streamlit.
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.pointplot(x='NetworkType', y='Rating', data=df, hue='Operator', ax=ax)
        st.pyplot(fig)