Click-Analytics/CTGD.py at main · Mattral/Click-Analytics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from utils import new_line

def _render_before_after(st, title, split, before_width, after_values, after_width):
    """Render one titled "before / after" pair of tables for an encoding demo.

    Args:
        st: Object exposing the Streamlit API used here (``markdown``,
            ``columns``, ``write``, ``dataframe``).
        title: Heading text shown above the pair of tables.
        split: Column spec forwarded unchanged to ``st.columns``.
        before_width: Pixel width of the "before" table.
        after_values: Nested list / list with the encoded sample values.
        after_width: Pixel width of the "after" table.
    """
    st.markdown(f"<h6 align='center'>{title}</h6>", unsafe_allow_html=True)
    cola, colb = st.columns(split)
    with cola:
        st.write("Before Encoding")
        st.dataframe(pd.DataFrame(np.array(['a', 'b', 'c', 'b', 'a'])), width=before_width, height=200)
    with colb:
        st.write("After Encoding")
        st.dataframe(pd.DataFrame(np.array(after_values)), width=after_width, height=200)


def _render_encoding_examples(st):
    """Render the illustrative examples for the three offered encodings."""
    col1, col2 = st.columns([0.8, 1])
    with col1:
        _render_before_after(st, "Ordinal Encoding", 2, 120, [0, 1, 2, 1, 0], 120)

    with col2:
        _render_before_after(
            st,
            "One Hot Encoding",
            [0.7, 1],
            150,
            [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]],
            200,
        )

    col1, col2, col3 = st.columns([0.5, 1, 0.5])
    with col2:
        new_line()
        _render_before_after(
            st,
            "Count Frequency Encoding",
            [0.8, 1],
            150,
            [0.4, 0.4, 0.2, 0.4, 0.4],
            200,
        )

    new_line()


def _render_categorical_features(st, cat_df):
    """Show the object-dtype columns next to a table of their unique values."""
    col1, col2 = st.columns(2)
    col1.dataframe(cat_df, height=250, use_container_width=True)
    if len(cat_df.columns) > 0:
        # Build the Series by hand instead of ``cat_df.apply(lambda x: x.unique())``:
        # ``DataFrame.apply`` returns a DataFrame (not a Series) whenever every
        # column yields the same number of unique values, and the subsequent
        # ``.to_frame()`` then raises AttributeError.
        uniques = pd.Series(
            [values.unique() for _, values in cat_df.items()],
            index=cat_df.columns,
            dtype=object,
        )
        col2.dataframe(uniques.to_frame('Unique Values'), height=250, use_container_width=True)


def _render_unique_value_analysis(st, cat_df):
    """Show the per-column unique-value counts as a table and as a bar chart."""
    # Computed once and shared by the table and the plot.
    unique_values = cat_df.nunique().to_frame('# Unique Values')
    unique_values = unique_values.sort_values(by='# Unique Values', ascending=False)

    col1, col2 = st.columns([0.5, 1])
    with col1:
        new_line()
        st.markdown("<h6 align='left'> Number of Unique Values</h6>", unsafe_allow_html=True)
        st.dataframe(unique_values, width=200, height=300)

    with col2:
        new_line()
        st.markdown("<h6 align='center'> Plot for the Count of Unique Values </h6>", unsafe_allow_html=True)
        # The plot needs the feature names as a regular column; the table does not.
        plot_data = unique_values.assign(Feature=unique_values.index)
        fig = px.bar(plot_data, x='Feature', y='# Unique Values', color='# Unique Values', height=350)
        st.plotly_chart(fig, use_container_width=True)


def handle_categorical_data(st, df):
    """Render the "Handling Categorical Data" section of the app.

    The section offers, inside a "Show Encoding" expander: an optional
    explanation of the encodings, an optional view of the object-dtype
    columns and their unique values, an optional unique-count analysis,
    the feature / method selectors with an "Apply" button, and a
    "Show DataFrame" button.

    Args:
        st: Object exposing the Streamlit API used below (presumably the
            ``streamlit`` module itself; it shadows the module-level import).
        df: DataFrame being edited. Columns with ``object`` dtype are treated
            as the categorical features.

    Returns:
        None. Encoding is delegated to ``apply_encoding``; its return value
        is not captured, so any change must happen on ``df`` itself.
    """
    # Snapshot of the categorical columns; every widget below that lists or
    # displays them is rendered before the "Apply" button can change ``df``.
    cat_df = df.select_dtypes(include=[object])
    cat_cols = cat_df.columns.tolist()

    # Encoding
    new_line()
    st.markdown("### 🔠 Handling Categorical Data", unsafe_allow_html=True)
    new_line()
    with st.expander("Show Encoding"):
        new_line()

        # Explain Encoding
        if st.checkbox("Explain Encoding", value=False, key='exp_enc'):
            _render_encoding_examples(st)

        # Show Categorical Features
        if st.checkbox("Show Categorical Features", value=False, key='show_cat'):
            _render_categorical_features(st, cat_df)

        # Further Analysis
        if st.checkbox("Further Analysis", value=False, key='further_analysis'):
            _render_unique_value_analysis(st, cat_df)

        # Input for encoding
        col1, col2 = st.columns(2)
        with col1:
            enc_feat = st.multiselect("Select Features", cat_cols, key='encoding_feat', help="Select the categorical features to encode.")
        with col2:
            encoding = st.selectbox("Select Encoding", ["Select", "Ordinal Encoding", "One Hot Encoding", "Count Frequency Encoding"], key='encoding', help="Select the encoding method.")

        if enc_feat and encoding != "Select":
            new_line()
            col1, col2, col3 = st.columns([1, 0.5, 1])
            if col2.button("Apply", key='encoding_apply', use_container_width=True, help="Click to apply encoding."):
                # Perform encoding based on the selected method
                apply_encoding(df, enc_feat, encoding, st)

        # Show DataFrame Button
        col1, col2, col3 = st.columns([0.15, 1, 0.15])
        col2.divider()
        col1, col2, col3 = st.columns([1, 0.7, 1])
        with col2:
            show_df = st.button("Show DataFrame", key="cat_show_df", help="Click to show the DataFrame.")
        if show_df:
            st.dataframe(df, use_container_width=True)
def apply_encoding(df, features, method, st):
    """Encode the selected columns of ``df`` in place and report success.

    Args:
        df: DataFrame to modify. All three encodings mutate this object
            directly so the caller sees the result without a return value.
        features: List of column names in ``df`` to encode.
        method: One of ``"Ordinal Encoding"``, ``"One Hot Encoding"`` or
            ``"Count Frequency Encoding"``. Any other value is ignored and
            ``df`` is left untouched.
        st: Object exposing ``success(message)`` (presumably the
            ``streamlit`` module) used to display the confirmation.

    Returns:
        None.
    """
    if method == "Ordinal Encoding":
        # Imported lazily so scikit-learn is only needed for this branch.
        from sklearn.preprocessing import OrdinalEncoder
        encoder = OrdinalEncoder()
        df[features] = encoder.fit_transform(df[features])
        st.success(f"The Categories of the features **`{features}`** have been encoded using Ordinal Encoding.")

    elif method == "One Hot Encoding":
        # ``pd.get_dummies`` returns a new frame; rebinding the local name
        # would leave the caller's DataFrame unchanged. Instead, swap the
        # columns on ``df`` itself: drop the originals, then append the
        # indicator columns (same column order get_dummies would produce).
        dummies = pd.get_dummies(df[features], columns=features)
        df.drop(columns=features, inplace=True)
        for column in dummies.columns:
            df[column] = dummies[column]
        st.success(f"The Categories of the features **`{features}`** have been encoded using One Hot Encoding.")

    elif method == "Count Frequency Encoding":
        total_rows = len(df)
        for feature in features:
            # Relative frequency of each category over all rows of df.
            freq = df[feature].value_counts() / total_rows
            df[feature] = df[feature].map(freq)
        st.success(f"The Categories of the features **`{features}`** have been encoded using Count Frequency Encoding.")