-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheda_analysis.py
More file actions
137 lines (120 loc) · 4.7 KB
/
eda_analysis.py
File metadata and controls
137 lines (120 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
import numpy as np
import pandas as pd
import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from dataloading_process import get_file_content_as_string, load_metadata, file_selector
def eda():
    """Render the exploratory-data-analysis page of the Streamlit app.

    The user picks a file through ``file_selector``; it is loaded with
    ``load_metadata`` and used as a pandas DataFrame.  A sidebar select box
    then chooses between two views:

    * 'By Data Type' - per-column plots for categorical / numeric columns
      (boolean and datetime views only report when no such column exists).
    * 'By Misssing Values' - missingno matrix plus a table of NaN counts,
      selected either by feature position or by missing percentage.

    Returns:
        None. Everything is emitted through Streamlit calls.
    """
    st.title('EDA')
    st.header('Data Loading')
    filename = file_selector()
    st.write('You selected `%s`' % filename)
    df = load_metadata(filename)

    # Per-column NaN count and NaN percentage, most incomplete columns first.
    nan_counts = df.isnull().sum()
    nans = pd.concat(
        [nan_counts, (nan_counts / df.shape[0]) * 100],
        axis=1,
        keys=['Num_NaN', 'NaN_Percent'],
    )
    nans = nans.sort_values(by=['NaN_Percent'], ascending=False)

    # NOTE(review): ``st.cache`` is kept as in the original; newer Streamlit
    # releases deprecate it in favour of ``st.cache_data`` - confirm against
    # the deployed version before changing it.

    def catd():
        """Show one checkbox per object column; ticked columns get a
        histogram and, when they have at most 20 distinct values, a pie chart.
        """
        @st.cache
        def select_data():
            return df.select_dtypes(include=['object'])

        df_cat = select_data()
        if df_cat.empty:
            st.error('No Categorical Data')
            return

        columns = df_cat.columns.tolist()
        bar = st.progress(0)
        for i, c in enumerate(columns):
            check_plot = st.checkbox(c)
            bar.progress((i + 1) / len(columns))
            if not check_plot:
                continue
            st.plotly_chart(px.histogram(df_cat, x=c))
            if len(df_cat[c].unique()) <= 20:
                # Read counts straight from the Series: wrapping it in a
                # DataFrame and indexing by the column name breaks on
                # pandas >= 2.0, where the counts column is named 'count'.
                counts = df_cat[c].value_counts()
                pie = px.pie(
                    values=counts.values,
                    names=counts.index.tolist(),
                    title=c,
                )
                st.plotly_chart(pie)

    def boold():
        """Report when the data has no boolean columns (no plots yet)."""
        @st.cache
        def select_data():
            return df.select_dtypes(include='bool')

        df_bool = select_data()
        if df_bool.empty:
            st.error('No Boolean Data')

    def numd():
        """Show one checkbox per numeric column; ticked columns get a
        histogram with a box-plot margin and a density plot.
        """
        @st.cache
        def select_data():
            return df.select_dtypes(include='number')

        df_num = select_data()
        if df_num.empty:
            st.error('No Numerical Data')
            return

        columns = df_num.columns.tolist()
        bar = st.progress(0)
        for i, c in enumerate(columns):
            check_plot = st.checkbox(c)
            bar.progress((i + 1) / len(columns))
            if not check_plot:
                continue
            st.plotly_chart(px.histogram(df_num, x=c, marginal='box'))
            st.warning('Missing values are droped for density plot')
            values = df_num[c].dropna()
            if len(values.unique()) < 2:
                # A density estimate cannot be fitted to an empty or
                # constant column, so skip the plot instead of crashing.
                st.warning('Not enough distinct values for a density plot')
                continue
            st.plotly_chart(ff.create_distplot([values], [c], show_hist=False))

    def dated():
        """Report when the data has no datetime columns (no plots yet)."""
        @st.cache
        def select_data():
            return df.select_dtypes(include='datetime')

        df_date = select_data()
        if df_date.empty:
            st.error('No Date Time Data')

    def selectDataType():
        """Let the user pick a data type and render the matching view."""
        switcher = {
            'Categorical': catd,
            'Numeric': numd,
            'Boolean': boold,
            'Data Time': dated,
        }
        # Options come from the dict so labels and handlers cannot drift apart.
        selected = st.selectbox('Select Data Types', list(switcher))
        func = switcher.get(selected, lambda: "Invalid Method")
        func()

    def show_missing(columns):
        """Draw the missingno matrix and the NaN table for ``columns``."""
        if not columns:
            # Nothing matched the slider range; avoid handing an empty frame
            # to msno.matrix.
            st.warning('No features in the selected range')
            return
        msno.matrix(df=df[columns], figsize=(30, 15), color=(0.24, 0.77, 0.77))
        # NOTE(review): st.pyplot() without a figure relies on matplotlib's
        # global figure; newer Streamlit versions discourage this - confirm.
        st.pyplot()
        st.table(nans.loc[columns])

    def ms_over():
        """Select features by their position (in percent of all features)
        in the NaN-percentage ordering.
        """
        ranges = st.sidebar.slider('Range of Features to View', 0.0, 100.0, (0.0, 25.0))
        ordered_cols = nans.index.tolist()
        total = len(ordered_cols)
        first = int(total * ranges[0] / 100)
        last = int(total * ranges[1] / 100)
        show_missing(ordered_cols[first:last])

    def only_ms():
        """Select features whose NaN percentage lies inside the slider range
        (both bounds inclusive).
        """
        ranges = st.sidebar.slider('Range of Missing% to View', 0.0, 100.0, (50.0, 70.0))
        percent = nans['NaN_Percent']
        in_range = (percent >= ranges[0]) & (percent <= ranges[1])
        show_missing(percent[in_range].index.tolist())

    def selectMissingValues():
        """Let the user pick a missing-value view and render it."""
        switcher = {
            'By Feature': ms_over,
            'By Missing%': only_ms,
        }
        selected = st.selectbox('Missing Value Visualizations', list(switcher))
        func = switcher.get(selected, lambda: "Invalid Method")
        func()

    def selectMethod(arg):
        """Render the top-level EDA view named by ``arg``."""
        func = methods.get(arg, lambda: "Invalid Method")
        func()

    methods = {
        'By Data Type': selectDataType,
        'By Misssing Values': selectMissingValues,
    }
    method = st.sidebar.selectbox('Select EDA View', list(methods))
    selectMethod(method)