-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
198 lines (160 loc) · 8.75 KB
/
app.py
File metadata and controls
198 lines (160 loc) · 8.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import html
import io
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from flask import Flask, request, render_template, url_for
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler
# Flask application object; the route decorators in this module register
# their handlers on it.
app = Flask(__name__)
# Create static/charts (relative to the current working directory) at import
# time. os.makedirs also creates the parent static/ directory, which is where
# the upload handler writes its CSV files.
os.makedirs('static/charts', exist_ok=True)
@app.route('/')
def index():
    """Render the ``index.html`` template for the site root."""
    page = render_template('index.html')
    return page
# Upload constraints enforced by upload_file().
_MAX_UPLOAD_BYTES = 10 * 1024 * 1024
_ALLOWED_EXTENSIONS = ('.csv', '.xlsx')


def _preprocess_dataframe(df, preprocess_options):
    """Return a cleaned copy of ``df`` built from the selected options.

    Args:
        df: The DataFrame loaded from the uploaded file. It is not modified.
        preprocess_options: Collection of option names taken from the form.
            Recognised values are ``'remove_na'``, ``'remove_duplicates'``,
            ``'remove_outliers'``, ``'normalize'`` and
            ``'encode_categoricals'``. When it is empty, rows with missing
            values are dropped by default.

    Returns:
        A new DataFrame with the selected steps applied in the order listed
        above.
    """
    df_cleaned = df.copy()

    # Dropping rows with missing values is the default when nothing is ticked.
    if 'remove_na' in preprocess_options or not preprocess_options:
        df_cleaned = df_cleaned.dropna()

    if 'remove_duplicates' in preprocess_options:
        df_cleaned = df_cleaned.drop_duplicates()

    # IQR rule: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for every
    # numeric column. Each column's quartiles are computed on the rows that
    # survived the previous columns' filters.
    # NOTE: comparisons against NaN are False, so rows with a missing value
    # in a numeric column are removed by this step as well.
    if 'remove_outliers' in preprocess_options:
        for col in df_cleaned.select_dtypes(include=['number']).columns:
            q1 = df_cleaned[col].quantile(0.25)
            q3 = df_cleaned[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            in_range = (df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)
            df_cleaned = df_cleaned[in_range]

    # Min-max scale numeric columns. MinMaxScaler rejects input with zero
    # rows, so skip scaling when the earlier steps removed every row.
    if 'normalize' in preprocess_options:
        numerical_cols = df_cleaned.select_dtypes(include=['number']).columns
        if len(numerical_cols) > 0 and not df_cleaned.empty:
            scaler = MinMaxScaler()
            df_cleaned[numerical_cols] = scaler.fit_transform(df_cleaned[numerical_cols])

    # One-hot encode object-dtype columns; drop_first removes one level per
    # column. The encoded source column itself is replaced by its dummies.
    if 'encode_categoricals' in preprocess_options:
        categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            df_cleaned = pd.get_dummies(df_cleaned, columns=[col], prefix=col, drop_first=True)

    return df_cleaned


@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle a dataset upload: validate, clean, summarise and render results.

    Reads the ``file`` part of the request plus the ``preprocess`` and
    ``charts`` form lists. The original and cleaned datasets are written to
    ``static/original_dataset.csv`` and ``static/cleaned_dataset.csv``.

    Returns:
        The rendered ``result.html`` page on success; otherwise a
        ``(message, status)`` tuple: 400 for a missing, unnamed, oversized,
        unsupported or empty file, and 500 when loading or processing raises.
    """
    if 'file' not in request.files:
        return "No file uploaded", 400

    file = request.files['file']
    # Covers the empty string as well as a missing (None) filename, which
    # would otherwise fail on .lower() below.
    if not file.filename:
        return "No file selected", 400

    # File size limit: 10MB. Measure by seeking to the end, then rewind so
    # the parser reads from the start.
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)
    if file_size > _MAX_UPLOAD_BYTES:
        return "File too large. Maximum size is 10MB.", 400

    # Check file extension
    filename = file.filename.lower()
    if not filename.endswith(_ALLOWED_EXTENSIONS):
        return "Unsupported file format. Please upload CSV or Excel files.", 400

    try:
        # Load the dataset
        if filename.endswith('.csv'):
            df = pd.read_csv(file)
        else:
            df = pd.read_excel(file)

        if df.empty:
            return "Uploaded file is empty.", 400

        # Get preprocessing options from form
        preprocess_options = request.form.getlist('preprocess')
        selected_charts = request.form.getlist('charts')

        # NOTE: both output paths are fixed, so every upload overwrites the
        # files written by the previous one.
        original_file_path = 'static/original_dataset.csv'
        df.to_csv(original_file_path, index=False)

        df_cleaned = _preprocess_dataframe(df, preprocess_options)

        cleaned_file_path = 'static/cleaned_dataset.csv'
        df_cleaned.to_csv(cleaned_file_path, index=False)

        # Generate summaries
        original_summary = df.describe(include='all').to_html(classes='table table-striped')
        cleaned_summary = df_cleaned.describe(include='all').to_html(classes='table table-striped')

        # Render the results page
        return render_template(
            'result.html',
            original_file=original_file_path,
            cleaned_file=cleaned_file_path,
            original_summary=original_summary,
            cleaned_summary=cleaned_summary,
            charts_selected=bool(selected_charts),
            charts=generate_comparison_charts(df, df_cleaned, selected_charts),
        )
    except Exception as e:
        # Request boundary: report the failure instead of propagating it.
        # The message can contain text taken from the uploaded file and the
        # response is served as HTML, so escape it before returning.
        return f"Error processing file: {html.escape(str(e))}", 500
def generate_comparison_charts(df_original, df_cleaned, selected_charts):
    """Build "Original" and "Cleaned" Plotly charts for the selected types.

    The plotted columns are chosen from ``df_original``: the first numeric
    column (the first two for the scatter plot) and the first object-dtype
    column for the pie chart. A chart type is skipped when ``df_original``
    has no suitable column.

    Args:
        df_original: DataFrame as uploaded.
        df_cleaned: DataFrame after preprocessing. It may lack a column that
            exists in ``df_original`` (for example after one-hot encoding).
        selected_charts: Collection of chart type names. Recognised values
            are ``'histogram'``, ``'boxplot'``, ``'scatter'``, ``'line'``,
            ``'bar'`` and ``'pie'``.

    Returns:
        A list of ``{'name': str, 'html': str}`` dicts, one "Original" entry
        followed by one "Cleaned" entry per generated chart type, in the
        order listed above. When a frame does not contain the required
        column(s), the entry's ``html`` is a short placeholder message
        instead of a figure.
    """
    numerical_cols = df_original.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df_original.select_dtypes(include=['object']).columns.tolist()

    # Used in place of a figure when a frame no longer has the column;
    # plotting a missing column would raise and abort the whole request.
    missing_markup = (
        '<p>This chart is unavailable because the column is not present '
        'in this version of the data.</p>'
    )
    charts = []

    def add_pair(kind, columns, build, name_suffix, title_suffix=None):
        """Append the Original and Cleaned entries for one chart type."""
        if title_suffix is None:
            title_suffix = name_suffix
        for label, frame in (('Original', df_original), ('Cleaned', df_cleaned)):
            if all(column in frame.columns for column in columns):
                fig = build(frame, f'{label} Data - {kind}{title_suffix}')
                markup = fig.to_html(full_html=False, include_plotlyjs='cdn')
            else:
                markup = missing_markup
            charts.append({'name': f'{label} {kind}{name_suffix}', 'html': markup})

    # The lambdas below are invoked immediately inside add_pair, so closing
    # over the local column variables is safe.

    # Generate Histogram
    if 'histogram' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            'Histogram', [col],
            lambda frame, title: px.histogram(frame, x=col, title=title, marginal='rug'),
            f' ({col})',
        )

    # Generate Box Plot
    if 'boxplot' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            'Box Plot', [col],
            lambda frame, title: px.box(frame, y=col, title=title),
            f' ({col})',
        )

    # Generate Scatter Plot
    if 'scatter' in selected_charts and len(numerical_cols) >= 2:
        x_col, y_col = numerical_cols[:2]
        add_pair(
            'Scatter Plot', [x_col, y_col],
            lambda frame, title: px.scatter(frame, x=x_col, y=y_col, title=title),
            f' ({x_col} vs {y_col})',
        )

    # Generate Line Chart (values plotted against the row index)
    if 'line' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            'Line Chart', [col],
            lambda frame, title: px.line(frame, x=frame.index, y=col, title=title),
            f' ({col})',
        )

    # Generate Bar Chart (first 10 rows only)
    if 'bar' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            'Bar Chart', [col],
            lambda frame, title: px.bar(frame.head(10), x=frame.index[:10], y=col, title=title),
            '',
            title_suffix=' (First 10 rows)',
        )

    # Generate Pie Chart
    if 'pie' in selected_charts and categorical_cols:
        col = categorical_cols[0]
        add_pair(
            'Pie Chart', [col],
            lambda frame, title: px.pie(frame, names=col, title=title),
            f' ({col})',
        )

    return charts
if __name__ == '__main__':
    # Runs Flask's built-in server when the file is executed directly.
    # debug=True enables the interactive debugger and auto-reloader, which
    # are intended for local development only.
    app.run(debug=True)