-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_helper.py
More file actions
127 lines (110 loc) · 3.66 KB
/
process_helper.py
File metadata and controls
127 lines (110 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 21 22:22:14 2020
Helper functions for processing input data
@author: al2357
"""
import pandas as pd
import numpy as np
def cols_to_numeric(data, cols=None):
    """Strip unwanted text from string columns and convert them to numeric.

    Example: a column holding ``"12km"`` becomes ``12``.

    Args:
        data: pandas DataFrame. It is modified in place and also returned.
        cols: dict ``{"col_name": "remove_from_string", ...}``. The value is
            either ``"non_num"`` (remove every character that is not a digit
            0-9) or the literal substring to remove. Columns that are not
            present in ``data`` are skipped. ``None`` (the default) or an
            empty dict leaves ``data`` untouched.

    Returns:
        The same DataFrame, with each listed column passed through
        ``pd.to_numeric``.

    Note:
        ``"non_num"`` also removes decimal points and minus signs, so
        ``"12.5km"`` becomes ``125``.
    """
    if not cols:
        return data
    for col, replace in cols.items():
        if col not in data:
            continue
        series = data[col]
        # Columns that are already numeric have no .str accessor and nothing
        # to strip; they only go through pd.to_numeric below.
        if not pd.api.types.is_numeric_dtype(series):
            # regex= is passed explicitly because the default of
            # Series.str.replace differs between pandas versions; without it
            # "[^0-9]" can be taken as a literal string and strip nothing.
            if replace == "non_num":
                series = series.str.replace("[^0-9]", "", regex=True)
            else:
                series = series.str.replace(replace, "", regex=False)
        data[col] = pd.to_numeric(series)
    return data
#end def
def cat_to_int_replace(data, cols=None):
    """Convert categories to ints using a per-column value -> code mapping.

    For every listed column the unique values are taken with ``np.unique``
    (which returns them sorted) and numbered 1, 2, 3, ... in that order.

    Args:
        data: pandas DataFrame. It is modified in place and also returned.
        cols: list of column names to convert from str to int. ``None``
            (the default) or an empty list leaves ``data`` untouched.

    Returns:
        Tuple ``(data, replace_map)`` where ``replace_map`` is
        ``{col: {original_value: int_code, ...}, ...}``.
    """
    replace_map = {}
    for col in cols or ():
        if col in replace_map:
            # Column listed twice: it is already encoded, keep the first map.
            continue
        unique_vals = np.unique(data[col].values)
        col_map = {val: code for code, val in enumerate(unique_vals, start=1)}
        replace_map[col] = col_map
        # Series.map builds the integer column directly, so the result does
        # not depend on DataFrame.replace downcasting an object column.
        data[col] = data[col].map(col_map)
    return data, replace_map
#end cat_to_int_replace
def cat_to_int_label_enc(data, cols=None):
    """Convert categories to ints with scikit-learn's LabelEncoder.

    Used mainly for labels.

    Args:
        data: pandas DataFrame. It is modified in place and also returned.
        cols: list of column names to convert from str to int. ``None``
            (the default) or an empty list leaves ``data`` untouched.

    Returns:
        Tuple ``(data, le_classes)`` where ``le_classes`` maps each encoded
        column name to the encoder's ``classes_`` array for that column.
    """
    le_classes = {}
    if not cols:
        return data, le_classes

    # Imported only when there is something to encode, so the empty case
    # does not require scikit-learn.
    from sklearn.preprocessing import LabelEncoder

    for col in cols:
        le = LabelEncoder()
        # fit_transform learns the classes and encodes in a single pass.
        data[col] = le.fit_transform(data[col])
        le_classes[col] = le.classes_
    return data, le_classes
#end cat_to_int_label_enc
def cat_to_int_ord_enc(data, cols=None):
    """Convert categories to ordinal codes with scikit-learn's OrdinalEncoder.

    Each listed column is encoded independently.

    Args:
        data: pandas DataFrame. It is modified in place and also returned.
        cols: list of column names to convert from str to ordinal codes.
            ``None`` (the default) or an empty list leaves ``data``
            untouched.

    Returns:
        The same DataFrame with the listed columns replaced by their codes.
        The encoder is created with its default settings, so the codes have
        whatever dtype ``OrdinalEncoder`` emits by default.
    """
    if not cols:
        return data

    # Imported only when there is something to encode, so the empty case
    # does not require scikit-learn.
    from sklearn.preprocessing import OrdinalEncoder

    oe = OrdinalEncoder()
    for col in cols:
        # OrdinalEncoder only accepts 2-D input, so the column is passed as a
        # one-column frame (data[[col]]) and the (n, 1) result is flattened
        # back to 1-D before assignment.
        encoded = oe.fit_transform(data[[col]])
        data[col] = np.asarray(encoded).ravel()
    return data
#end cat_to_int_ord_enc
def cat_to_oh_enc(data, cols_to_encode=None, cats_to_drop=None):
    """One-hot encode categorical columns with scikit-learn's OneHotEncoder.

    One category per encoded column is dropped, so a column with k
    categories yields k-1 indicator columns named ``<column>_<category>``.

    Args:
        data: pandas DataFrame. It is not modified; a new frame is returned.
        cols_to_encode: list of column names to one-hot encode. ``None``
            (the default) or an empty list means nothing is encoded.
        cats_to_drop: dict ``{col_name: category_to_drop}``. For columns not
            listed here, the category found in the first row is dropped.

    Returns:
        Tuple ``(data, drop_lst, data_cat_oh_col_names)``:
        a new DataFrame with the indicator columns first, followed by the
        remaining original columns, on a fresh 0..n-1 index; the dropped
        category per encoded column, in ``cols_to_encode`` order; and the
        names of the generated indicator columns.
    """
    cols_to_encode = list(cols_to_encode) if cols_to_encode else []
    cats_to_drop = cats_to_drop if cats_to_drop else {}
    if not cols_to_encode:
        # Nothing to encode: OneHotEncoder cannot be fitted on zero columns.
        return data.reset_index(drop=True), [], []

    # Imported only when there is something to encode, so the empty case
    # does not require scikit-learn.
    from sklearn.preprocessing import OneHotEncoder

    # Build the drop list for every column, whether or not it appears in
    # cats_to_drop. .iloc[0] takes the first row by position, so it also
    # works when the index has no label 0.
    drop_lst = []
    for col in cols_to_encode:
        if col in cats_to_drop:
            drop_lst.append(cats_to_drop[col])
        else:
            drop_lst.append(data[col].iloc[0])

    data_copy_nc = data.drop(columns=cols_to_encode)
    data_categorical = data[cols_to_encode]

    # encode; the dense-output keyword is `sparse_output` in newer
    # scikit-learn releases and `sparse` in older ones.
    try:
        ohe = OneHotEncoder(sparse_output=False, drop=drop_lst)
    except TypeError:
        ohe = OneHotEncoder(sparse=False, drop=drop_lst)
    data_cat_oh = ohe.fit_transform(data_categorical)

    # rename encoded columns, skipping the dropped category of each column
    data_cat_oh_col_names = []
    for col, dropped, cat_arr in zip(cols_to_encode, drop_lst, ohe.categories_):
        for cat in cat_arr:
            if cat != dropped:
                # format() rather than "+" so non-string categories work too
                data_cat_oh_col_names.append("{}_{}".format(col, cat))

    data_cat_oh_df = pd.DataFrame(data_cat_oh,
                                  columns=data_cat_oh_col_names)
    data = pd.concat(
        [data_cat_oh_df.reset_index(drop=True),
         data_copy_nc.reset_index(drop=True)],
        axis=1,
    )
    return data, drop_lst, data_cat_oh_col_names
#end cat_to_oh_enc