-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleaning_engine.py
More file actions
191 lines (148 loc) · 5.63 KB
/
cleaning_engine.py
File metadata and controls
191 lines (148 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# cleaning_engine.py
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
@dataclass
class CleaningSummary:
    """Container describing what happened during a single cleaning run."""

    rows_before: int          # row count before any cleaning step
    rows_after: int           # row count after all steps
    duplicates_removed: int   # rows dropped by de-duplication
    missing_values_before: int  # total missing cells before cleaning
    missing_values_after: int   # total missing cells after cleaning
    column_notes: Dict[str, str] = field(default_factory=dict)  # per-column notes

    def as_dict(self) -> Dict[str, object]:
        """
        Convert the summary to a plain dict (useful later for reports).
        """
        field_names = (
            "rows_before",
            "rows_after",
            "duplicates_removed",
            "missing_values_before",
            "missing_values_after",
            "column_notes",
        )
        # Same keys, same insertion order as listing them out by hand.
        return {name: getattr(self, name) for name in field_names}
def _count_missing(df: pd.DataFrame) -> int:
"""
Count total missing values in the DataFrame.
"""
return int(df.isna().sum().sum())
def _standardize_dates(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
"""
Try to standardize date columns.
Strategy:
- Any column whose name contains 'date' (case-insensitive) is a candidate.
- We call pandas.to_datetime on those columns with errors='ignore'.
Returns:
(updated_df, notes)
"""
df = df.copy()
notes: Dict[str, str] = {}
for col in df.columns:
if "date" in col.lower():
# Try to convert to datetime; if it fails, pandas leaves it unchanged.
before_dtype = str(df[col].dtype)
df[col] = pd.to_datetime(df[col], errors="ignore")
after_dtype = str(df[col].dtype)
if before_dtype != after_dtype:
notes[col] = f"Converted to datetime (from {before_dtype} to {after_dtype})."
else:
notes[col] = "Left as-is (conversion not necessary or not possible)."
return df, notes
def _normalize_numeric_types(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
"""
Try to turn object columns that look like numbers into numeric dtype.
Strategy:
- For each 'object' column, attempt pd.to_numeric with errors='ignore'.
- If dtype changes, we record a note.
"""
df = df.copy()
notes: Dict[str, str] = {}
for col in df.columns:
if df[col].dtype == "object":
before_dtype = str(df[col].dtype)
converted = pd.to_numeric(df[col], errors="ignore")
after_dtype = str(converted.dtype)
if before_dtype != after_dtype:
df[col] = converted
notes[col] = f"Converted to numeric (from {before_dtype} to {after_dtype})."
return df, notes
def _fill_missing_values(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
"""
Fill missing values using a simple, transparent strategy:
- Numeric columns: fill with median.
- Boolean columns: fill with False.
- Datetime columns: leave missing values as-is.
- Object/string columns: leave missing values as-is.
Returns:
(updated_df, notes_per_column)
"""
df = df.copy()
notes: Dict[str, str] = {}
for col in df.columns:
series = df[col]
dtype = series.dtype
# Numeric
if np.issubdtype(dtype, np.number):
median_value = series.median()
df[col] = series.fillna(median_value)
notes[col] = f"Filled numeric missing values with median ({median_value})."
# Boolean
elif dtype == bool:
df[col] = series.fillna(False)
notes[col] = "Filled boolean missing values with False."
# Datetime => leave missing
elif np.issubdtype(dtype, np.datetime64):
notes[col] = "Left datetime missing values as-is."
# Object/string => leave missing
else:
notes[col] = "Left missing values as-is (object/string)."
return df, notes
def clean_dataset(df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningSummary]:
    """
    High-level function to clean a dataset.

    Pipeline:
      1. Record initial stats (row count, missing-cell count).
      2. Remove duplicate rows.
      3. Standardize date columns.
      4. Normalize numeric types where possible.
      5. Fill missing values for numeric and boolean columns.
      6. Build and return a CleaningSummary.

    Returns:
        (cleaned_df, summary)
    """
    rows_before = len(df)
    missing_before = _count_missing(df)

    # Step 1: drop exact duplicate rows.
    deduped = df.drop_duplicates()
    duplicates_removed = rows_before - len(deduped)

    # Steps 2-4: thread the frame through each per-column transformation,
    # keeping the notes each helper produces.
    cleaned, date_notes = _standardize_dates(deduped)
    cleaned, numeric_notes = _normalize_numeric_types(cleaned)
    cleaned, fill_notes = _fill_missing_values(cleaned)

    # Merge the per-step notes; a column touched by several steps gets its
    # notes joined with " | " in pipeline order.
    column_notes: Dict[str, str] = {}
    for step_notes in (date_notes, numeric_notes, fill_notes):
        for col, note in step_notes.items():
            if col in column_notes:
                column_notes[col] = column_notes[col] + " | " + note
            else:
                column_notes[col] = note

    summary = CleaningSummary(
        rows_before=rows_before,
        rows_after=len(cleaned),
        duplicates_removed=duplicates_removed,
        missing_values_before=missing_before,
        missing_values_after=_count_missing(cleaned),
        column_notes=column_notes,
    )
    return cleaned, summary