forked from YuliaZamriy/W207_Group_Project
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathOBDB_processing.py
More file actions
254 lines (204 loc) · 7.24 KB
/
OBDB_processing.py
File metadata and controls
254 lines (204 loc) · 7.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import pandas as pd
import numpy as np
# Load the OpenBarbell data dump: one JSON object per line.
data_df = pd.read_json('/home/yulia/Documents/SNS/Data/obdatadump.json', lines = True)

# Exploratory peek at the column names (value is discarded).
list(data_df)

# Drop video-related columns that are not used anywhere in this analysis.
for _unused_col in ('videoType', 'videoFileURL'):
    del data_df[_unused_col]

# Sanity check on dimensions after pruning.
data_df.shape
# (127833, 16)
"""Selecting big 3 lifts"""
# How many distinct raw exercise labels exist in the dump?
exercise = data_df['exercise'].value_counts()
exercise.shape
# (6251,)
# Lower-casing collapses labels that differ only by case.
exercise_names = [ex.lower() for ex in list(exercise.keys())]
len(exercise_names)
# 6251
len(set(exercise_names))
# 5727
# Canonical names/abbreviations for the three powerlifting lifts.
# NOTE: 'bp' is a very loose token and will also substring-match unrelated
# labels in the "dirty" pass below.
valid_exercise_list = ['bench', 'bench press', 'bp', 'competition bench', 'comp bench',
                       'squat', 'back squat', 'competition squat', 'comp squat',
                       "deadlift", "sumo_deadlift", "conventional deadlift", "comp deadlift",
                       "competition deadlift"]
# Exact (case/whitespace-insensitive) matches against the canonical names.
data_df['exercise_valid'] = data_df['exercise'].str.lower().str.strip().isin(valid_exercise_list)
np.sum(data_df['exercise_valid'])
# 52842
# "Dirty" matches: any label that merely CONTAINS one of the canonical names.
# BUG FIX: this list used to be built from the raw labels while the isin()
# below compares against lower-cased/stripped labels, so a dirty label with
# upper-case characters or surrounding whitespace could never match.  Both
# sides are now normalized identically.  A set gives O(1) de-duplication
# instead of the original O(n) list-membership test per row.
dirty_exercise_set = set()
for ex in data_df['exercise']:
    if isinstance(ex, str):
        ex_norm = ex.lower().strip()
        if any(word in ex_norm for word in valid_exercise_list):
            dirty_exercise_set.add(ex_norm)
dirty_exercise_list = sorted(dirty_exercise_set)
len(dirty_exercise_list)
# 1328 on the original (un-normalized) run; may grow slightly after the fix
data_df['exercise_dirty'] = data_df['exercise'].str.lower().str.strip().isin(dirty_exercise_list)
np.sum(data_df['exercise_dirty'])
# 70560 on the original run; may grow slightly after the fix
# Keep only the rows that matched the broader (dirty) filter.
data_df_study = data_df.loc[data_df['exercise_dirty'] == 1]
data_df_study.shape
# (70560, 18) on the original run
"""Removing invalid work sets"""
# Drop sets flagged as removed.
not_removed = data_df_study['removed'] == 0
data_df_study = data_df_study.loc[not_removed]
data_df_study.shape
# (68593, 18)
# 'deleted' takes three values per the recorded counts: NaN, 0.0, 1.0.
data_df_study['deleted'].value_counts(dropna=False)
#NaN 41710
# 0.0 26447
# 1.0 436
# Keep rows where 'deleted' is either missing or explicitly 0.
keep = data_df_study['deleted'].isnull() | (data_df_study['deleted'] == 0)
data_df_study = data_df_study.loc[keep]
data_df_study.shape
# (68157, 18)
# write out sets data to another csv without rep data
set_df = data_df_study.drop(columns='reps')
set_df.shape
# (68157, 17)
set_df.to_csv('/home/yulia/Documents/SNS/Data/set_data_w207.csv')
"""Reps: New Data Format"""
# Sets in the newer format carry a non-null 'initialStartTime'.
# BUG FIX: the mask used to be computed on data_df rather than data_df_study;
# indexing data_df_study with a boolean Series built from the larger frame is
# unalignable in recent pandas (and fragile in older versions).  The sibling
# "Old Data Format" section already used data_df_study.
data_df_new = data_df_study.loc[pd.notna(data_df_study['initialStartTime'])]
data_df_new.shape
# (62439, 18)
reps_list = data_df_new['reps'].tolist()
sets_list = data_df_new['setID'].tolist()
# setting names for the first 22 metrics in reps['data']
colnames = ['StartMessg', 'RepN', 'AvgVel', 'ROM', 'PeakVel',
            'PeakVelLoc', 'PeakAccel', 'RepDur', 'TimeBWReps', 'TimeRepComp',
            'TimeRepWait', 'SlowAllow', 'Backlight','MinAllow', 'ComprEnable',
            'FiltrEnable', 'CodeV', 'UnitN', 'LED', 'Bright',
            'LowPow', 'BulkStart']
# Build a nested dict keyed by setID then rep index; each value is the rep's
# fields plus the first 13 named 'data' metrics.  Reps with fewer than 13
# data values are dropped.
set_dict = {}
for set_id, reps in zip(sets_list, reps_list):
    set_dict[set_id] = {}
    for n, rep in enumerate(reps):
        data = rep.get('data')
        # Guard against a missing 'data' key (None) as well as short arrays;
        # the original would raise TypeError on None.
        if data is not None and len(data) > 12:
            flat = rep.copy()
            for col_index in range(13):
                flat[colnames[col_index]] = data[col_index]
            del flat['data']
            set_dict[set_id][n] = flat
len(set_dict)
# 62439 number of unique sets
# Flatten to rows of [setID, rep index, metric values...].
set_dict_list = []
for setid, reps_for_set in set_dict.items():
    for repnum, metrics in reps_for_set.items():
        set_dict_list.append([setid, repnum] + list(metrics.values()))
len(set_dict_list)
# 255269 number of unique reps
# Column names follow the metric insertion order of the first set's first rep
# (dict order is stable in Python 3.7+).
some_set_id = list(set_dict.keys())[0]
colnames_full = ['setID', 'RepCount'] + list(set_dict[some_set_id][0].keys())
len(colnames_full)
# 22
# converting list of lists to a dataframe
rep_df = pd.DataFrame(set_dict_list, columns=colnames_full)
rep_df.shape
# (255269, 22)
rep_df['isValid'].value_counts(dropna=False)
#True 255108
#False 161
rep_df['removed'].value_counts(dropna=False)
# False 241838
# True 13431
# Keep only non-removed, valid reps.
rep_df = rep_df.loc[rep_df['removed'] == 0]
rep_df.shape
# (241838, 22)
rep_df = rep_df.loc[rep_df['isValid'] == 1]
rep_df.shape
# (241761, 22)
# writing out reps data to a csv
rep_df.to_csv('/home/yulia/Documents/SNS/Data/rep_data_w207_new.csv')
"""Reps: Old Data Format"""
# Sets recorded before 'initialStartTime' existed.
data_df_old = data_df_study.loc[pd.isna(data_df_study['initialStartTime'])]
data_df_old.shape
# (5718, 18)
reps_list = data_df_old['reps'].tolist()
sets_list = data_df_old['setID'].tolist()
set_dict = {}
# ROBUSTNESS: the original hard-coded a skip of index 0 because the first
# record's reps were NaN; instead skip any record whose reps is not a list,
# so a stray NaN anywhere in the column cannot crash the loop.
for set_id, reps in zip(sets_list, reps_list):
    if not isinstance(reps, list):
        continue
    set_dict[set_id] = {}
    for n, rep in enumerate(reps):
        data = rep.get('data')
        # Guard against a missing 'data' key (None) as well as short arrays.
        if data is not None and len(data) > 12:
            flat = rep.copy()
            for col_index in range(13):
                flat[colnames[col_index]] = data[col_index]
            del flat['data']
            set_dict[set_id][n] = flat
len(set_dict)
# 5717
# Old-format reps come in two row widths (17 or 19 fields); split them so
# each width gets its own column set.
set_dict_list17, set_dict_list19 = [], []
for setid, reps_for_set in set_dict.items():
    for repnum, metrics in reps_for_set.items():
        row = [setid, repnum] + list(metrics.values())
        if len(row) == 17:
            set_dict_list17.append(row)
        elif len(row) == 19:
            set_dict_list19.append(row)
len(set_dict_list17)
# 8519
len(set_dict_list19)
# 17493
# creating column names for the output dataframe
# BUG FIX: the names used to be read from rep index 0 of the sampled set,
# which is not necessarily the rep that produced the sampled row (rep 0 may
# have been filtered out, or may have the other width).  Use the sampled
# row's own (setID, rep index) pair instead.
first17 = set_dict_list17[0]
colnames_full17 = ['setID', 'RepCount'] + list(set_dict[first17[0]][first17[1]].keys())
len(colnames_full17)
# 17
first19 = set_dict_list19[0]
colnames_full19 = ['setID', 'RepCount'] + list(set_dict[first19[0]][first19[1]].keys())
len(colnames_full19)
# 19
rep_df17 = pd.DataFrame(set_dict_list17, columns=colnames_full17)
rep_df17.shape
# (8519, 17)
rep_df19 = pd.DataFrame(set_dict_list19, columns=colnames_full19)
rep_df19.shape
# (17493, 19)
rep_df17['isValid'].value_counts(dropna=False)
#True 8516
#False 3
rep_df17['removed'].value_counts(dropna=False)
# False 8309
# True 210
rep_df17 = rep_df17.loc[rep_df17['removed'] == 0]
rep_df17.shape
# (8309, 17)
rep_df17 = rep_df17.loc[rep_df17['isValid'] == 1]
rep_df17.shape
# (8306, 17)  -- the original comment said 22 columns; this frame has 17
rep_df19['isValid'].value_counts(dropna=False)
#True 17478
#False 15
rep_df19['removed'].value_counts(dropna=False)
# False 16120
# True 1373
rep_df19 = rep_df19.loc[rep_df19['removed'] == 0]
rep_df19.shape
# (16120, 19)
rep_df19 = rep_df19.loc[rep_df19['isValid'] == 1]
rep_df19.shape
# (16113, 19)
# writing out reps data to a csv
rep_df17.to_csv('/home/yulia/Documents/SNS/Data/rep_data_w207_old17.csv')
rep_df19.to_csv('/home/yulia/Documents/SNS/Data/rep_data_w207_old19.csv')