-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataExploration.py
More file actions
90 lines (76 loc) · 3.22 KB
/
DataExploration.py
File metadata and controls
90 lines (76 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import CreateDataframe as data
import plotly.graph_objects as go
import plotly.express as px
def plot_histogram(title, df):
    """
    Plots a label histogram with 5 unit-wide bins (bin edges 0 through 5).

    :param title: text inserted into the figure title,
        "Label Histogram (<title>)"
    :param df: values to plot; passed unchanged to ``Axes.hist`` as ``x``
        (callers in this file pass the "Primary Code" column)
    """
    _, ax = plt.subplots()
    ax.hist(x=df, histtype="bar", bins=[0, 1, 2, 3, 4, 5], width=0.5)
    # One tick per bin, shifted right of the bin's left edge so the label
    # sits under the narrowed (width=0.5) bar.
    ax.set_xticks([0.3, 1.3, 2.3, 3.3, 4.3])
    # The module is imported as ``data`` (``import CreateDataframe as data``);
    # the bare name ``CreateDataframe`` is never bound, so referencing it
    # raised NameError.
    # NOTE(review): assumes crimes_dict holds one label per tick (5) - confirm.
    ax.set_xticklabels(list(data.crimes_dict.values()), rotation=10)
    plt.title("Label Histogram (" + title + ")")
    plt.ylabel("Number of samples")
    plt.show()
def create_label_histograms(original_full, train, validation, test):
    """
    Shows one label histogram per data split, in this order: full dataset,
    train, validation, test. Each histogram is drawn from the split's
    "Primary Code" column.
    """
    captioned_splits = (
        ("Original Full Dataset", original_full),
        ("Train Dataset", train),
        ("Validation Dataset", validation),
        ("Test Dataset", test),
    )
    for caption, split in captioned_splits:
        plot_histogram(caption, split["Primary Code"])
def plot_matrix_cor(df):
    """
    Displays the correlation matrix of ``df`` as a Plotly image.

    :param df: object exposing ``corr()``; the result is handed to
        ``px.imshow`` and the resulting figure is shown
    """
    # Compute the matrix once and reuse it; it used to be calculated twice,
    # with the first result stored in a local that was never read.
    corr = df.corr()
    px.imshow(corr).show()
def plot_correlation_plot(df, df_name, feature1, feature2, style, with_errors):
    """
    Shows a single scatter graph of ``feature2`` (y) against ``feature1`` (x).

    :param df: data source indexed by feature name
    :param df_name: dataset name used as the prefix of the graph title
    :param feature1: column plotted on the x axis (also the x axis title)
    :param feature2: column plotted on the y axis (also the y axis title)
    :param style: 'lines+markers' or 'markers'
    :param with_errors: True for error bars, False for no error bars
    """
    heading = df_name + " - Correlation between " + feature1 + " and " + feature2

    # Both variants share the same trace; error bars are the only difference.
    trace_options = {"x": df[feature1], "y": df[feature2], "mode": style}
    if with_errors:
        trace_options["error_y"] = dict(type='percent', value=50, visible=True)

    trace = go.Scatter(**trace_options)
    layout = go.Layout(title=heading, xaxis_title=feature1,
                       yaxis_title=feature2)
    go.Figure([trace], layout=layout).show()
def create_all_features_correlations(df, df_name):
    """
    Shows the selected 2-feature correlation graphs for ``df``.
    To add a graph, add another plot_correlation_plot call with the two
    desired features, a style (lines or dots) and the error mode
    (True for error bars, False for no error bars).
    """
    # styles to choose from
    lines = 'lines+markers'
    dots = 'markers'
    plot_correlation_plot(df, df_name, feature1="District", feature2="Ward",
                          style=dots, with_errors=False)
def plot_histogram_2(title, df):
    """
    Plots one 5-bin label histogram for each "Hour Code" value 0, 1 and 2.

    For every hour code the rows of ``df`` with that code are selected and
    their "Primary Code" column is plotted in a separate figure.

    :param title: text inserted into each figure title,
        "Label Histogram (<title>)"
    :param df: dataframe with "Hour Code" and "Primary Code" columns
    """
    # NOTE(review): the loop extent was reconstructed from a source whose
    # indentation was lost; one figure per hour code is assumed - confirm.
    for hour_code in range(3):
        subset = df[df["Hour Code"] == hour_code]
        _, ax = plt.subplots()
        ax.hist(x=subset["Primary Code"], histtype="bar",
                bins=[0, 1, 2, 3, 4, 5], width=0.5)
        ax.set_xticks([0.3, 1.3, 2.3, 3.3, 4.3])
        # The module is imported as ``data``; the bare name
        # ``CreateDataframe`` is never bound and raised NameError.
        # NOTE(review): assumes crimes_dict holds one label per tick (5).
        ax.set_xticklabels(list(data.crimes_dict.values()), rotation=10)
        plt.title("Label Histogram (" + title + ")")
        plt.ylabel("Number of samples")
        plt.show()
if __name__ == "__main__":
    # Label distribution of every split, then the feature correlations of
    # the training split.
    label_splits = (
        data.original_full_p,
        data.train_p,
        data.validation_p,
        data.test_p,
    )
    create_label_histograms(*label_splits)
    create_all_features_correlations(data.train_p, "Train dataset")