-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRealData.py
More file actions
158 lines (122 loc) · 6.62 KB
/
RealData.py
File metadata and controls
158 lines (122 loc) · 6.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# --- File locations ---------------------------------------------------------
input_path = r"C:\Users\jsntg\OneDrive\Desktop\3YP\code\data\rubbish.csv"
output_path01 = r"C:\Users\jsntg\OneDrive\Desktop\3YP\code\data\TrainTest.csv"
output_path02 = r"C:\Users\jsntg\OneDrive\Desktop\3YP\code\data\Validation.csv"

# Vital-sign columns that are forward-filled per admission and must be
# present on every row that is kept.
fill_cols = [
    'respiratory_rate', 'oxygen_saturation', 'temperature',
    'systolic_bp', 'diastolic_bp', 'heart_rate', 'consciousness',
]

# Load the raw table and discard columns that are not used further on;
# errors='ignore' tolerates files in which some of them are absent.
df = pd.read_csv(input_path).drop(
    columns=['time_hour', 'gcs_eye_norm', 'gcs_verbal_norm', 'gcs_motor_norm'],
    errors='ignore',
)

# Order rows chronologically inside each admission so that the forward fill
# below propagates earlier readings to later rows.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['hospital_admission_id', 'timestamp']).reset_index(drop=True)

# Carry the last observed vital forward within an admission, then drop the
# rows that still have a missing vital (nothing earlier to fill from).
df[fill_cols] = df.groupby('hospital_admission_id')[fill_cols].ffill()
df = df.dropna(subset=fill_cols).reset_index(drop=True)

# Keep only rows whose gender is 'F'; the column is constant afterwards and
# is therefore removed.
df = df[df['gender'].eq('F')].drop(columns=['gender']).reset_index(drop=True)
def _score_bands(values, bands):
    """Score each reading with the first band it falls into.

    Parameters
    ----------
    values : pandas.Series
        Readings of a single vital sign.
    bands : list of (boolean Series, score) pairs
        Abnormal bands ordered from most to least severe; for each reading
        the first band whose mask is True supplies the score.

    Returns
    -------
    numpy.ndarray
        Float array holding the matched band score, 0 for any other
        non-missing reading, and NaN where the reading itself is missing.
    """
    conditions = [mask for mask, _ in bands]
    scores = [score for _, score in bands]
    # Catch-all: a present reading that reaches no abnormal threshold is
    # scored as normal instead of falling through to NaN.
    conditions.append(values.notna())
    scores.append(0)
    return np.select(conditions, scores, default=np.nan)


def assign_risk_level(df):
    """Compute a per-row risk level (0-3) from seven vital-sign columns.

    Each vital is scored 0-3 from fixed thresholds, the scores are summed,
    and the total is mapped to a level: 0 -> 0, 1-4 -> 1, 5-6 -> 2, >=7 -> 3.

    Thresholds are applied one-sidedly, from the most severe band down, so a
    reading that lies between two nominal band edges (e.g. temperature 36.0,
    respiratory rate 20.5, heart rate 100.5) receives the score of the less
    severe neighbouring band. Previously such readings matched no band and
    turned the whole row's risk level into NaN. Readings inside the nominal
    bands score exactly as before.
    NOTE(review): which neighbour a between-band reading belongs to is a
    clinical choice - confirm against the scoring chart this was taken from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'respiratory_rate', 'oxygen_saturation', 'temperature',
        'systolic_bp', 'diastolic_bp', 'heart_rate' and 'consciousness'.

    Returns
    -------
    numpy.ndarray
        Float array aligned with the rows of ``df``; NaN where any of the
        six banded vitals is missing.
    """
    rr = df['respiratory_rate']
    rr_score = _score_bands(rr, [
        ((rr < 12) | (rr > 25), 3),
        (rr >= 21, 2),
    ])

    o2 = df['oxygen_saturation']
    o2_score = _score_bands(o2, [
        (o2 < 92, 3),
        (o2 <= 95, 2),
    ])

    temp = df['temperature']
    temp_score = _score_bands(temp, [
        ((temp < 36) | (temp > 37.7), 3),
        (temp >= 37.3, 2),
    ])

    sbp = df['systolic_bp']
    sbp_score = _score_bands(sbp, [
        ((sbp < 90) | (sbp > 160), 3),
        (sbp >= 151, 2),
        (sbp >= 141, 1),
    ])

    dbp = df['diastolic_bp']
    dbp_score = _score_bands(dbp, [
        ((dbp < 60) | (dbp > 110), 3),
        (dbp >= 101, 2),
        (dbp >= 91, 1),
    ])

    hr = df['heart_rate']
    hr_score = _score_bands(hr, [
        ((hr < 50) | (hr > 120), 3),
        ((hr <= 60) | (hr >= 111), 2),
        (hr >= 101, 1),
    ])

    # A consciousness value of 0 scores 3; every other value scores 0.
    consc_score = np.where(df['consciousness'] == 0, 3, 0)

    total_score = (
        rr_score + o2_score + temp_score + sbp_score
        + dbp_score + hr_score + consc_score
    )

    # A NaN total matches none of the conditions and stays NaN.
    risk_cond = [
        total_score == 0,
        (total_score >= 1) & (total_score <= 4),
        (total_score >= 5) & (total_score <= 6),
        total_score >= 7,
    ]
    return np.select(risk_cond, [0, 1, 2, 3], default=np.nan)
# Attach the rule-based risk level to every row.
df['risk_level'] = assign_risk_level(df)

# One label per patient: the maximum of the 'sepsis' column over all of the
# patient's rows. Splitting on patients (not rows) keeps each patient's data
# entirely on one side of the split.
patient_labels = df.groupby('patient_id')['sepsis'].max()
patients, labels = patient_labels.index.values, patient_labels.values

# 80/20 patient-level split, stratified on the per-patient label and seeded
# so that the partition is reproducible.
train_test_patients, val_patients = train_test_split(
    patients,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)
print(f"Train + Test patients: {len(train_test_patients)}, Validation patients: {len(val_patients)}")
# Rows of the train/test patients; .copy() detaches the frame from `df`.
df_train_test = df[df['patient_id'].isin(train_test_patients)].copy()

# Admissions in which 'sepsis' reaches 1 on at least one row.
sepsis_admissions = df_train_test.groupby('hospital_admission_id')['sepsis'].max()
sepsis_admissions = sepsis_admissions[sepsis_admissions.eq(1)].index.tolist()

keep_indices = []
dropped_admissions = []
for adm_id, group in df_train_test.groupby('hospital_admission_id'):
    group = group.sort_values('timestamp')
    # The admission's label is read from its earliest row.
    original_sepsis = group['sepsis'].iloc[0]
    high_risk_rows = group[group['risk_level'] >= 2]
    has_high_risk = not high_risk_rows.empty

    labelled_sepsis = original_sepsis == 1
    labelled_clear = original_sepsis == 0

    # Label and risk score disagree (sepsis without any high-risk row, or
    # high-risk rows without sepsis) -> discard the whole admission.
    if (labelled_sepsis and not has_high_risk) or (labelled_clear and has_high_risk):
        dropped_admissions.append(adm_id)
        continue

    if labelled_sepsis:
        # Sepsis with high-risk rows: keep rows up to and including the
        # high-risk row with the smallest index label.
        onset_idx = high_risk_rows.index.min()
        keep_indices.extend(group.loc[:onset_idx].index)
    elif labelled_clear:
        # No sepsis and never high risk: keep the admission untouched.
        keep_indices.extend(group.index)
    # Any other label value: the admission is neither kept nor recorded
    # as dropped.

df_clean = df_train_test.loc[keep_indices].reset_index(drop=True)
# Flag, for every admission that has a sepsis == 1 row, the first row whose
# risk level is >= 2. Done in one vectorized pass instead of re-filtering the
# whole frame once per admission, which scaled quadratically with the number
# of admissions; the flagged rows are the same.
df_clean['is_onset'] = 0
sepsis_admission_ids = df_clean.loc[
    df_clean['sepsis'] == 1, 'hospital_admission_id'
].unique()
onset_candidates = df_clean[
    df_clean['hospital_admission_id'].isin(sepsis_admission_ids)
    & (df_clean['risk_level'] >= 2)
]
# keep='first' retains the earliest candidate row (in frame order) of each
# admission.
first_onset_rows = onset_candidates.drop_duplicates(
    subset='hospital_admission_id', keep='first'
).index
df_clean.loc[first_onset_rows, 'is_onset'] = 1

print(f"Dropped {len(dropped_admissions)} admissions due to inconsistency (sepsis label vs risk level).")
print(f"Original rows: {len(df)}, Cleaned rows: {len(df_clean)}")

# risk_level was only needed to derive is_onset; it is not written out.
df_clean = df_clean.drop(columns=['risk_level'])
df_clean.to_csv(output_path01, index=False)

print(f"Cleaned data saved to {output_path01}")
print(f"Original rows: {len(df)}, Cleaned rows: {len(df_clean)}")
print(f"Number of sepsis admissions: {len(sepsis_admissions)}")
print(f"Number of admissions with onset found and truncated: {df_clean['is_onset'].sum()}")
# Validation rows: every row of the held-out patients, with no admissions
# dropped and no truncation applied.
df_val = df[df['patient_id'].isin(val_patients)].copy()
df_val['is_onset'] = 0

# Within each admission, flag the first row whose risk level is >= 2.
# NOTE(review): unlike the train/test branch, this flags onset regardless of
# the admission's 'sepsis' value - confirm that this is intended.
high_risk_val = df_val[df_val['risk_level'] >= 2]
first_high_risk_rows = [
    rows.index[0]
    for _, rows in high_risk_val.groupby('hospital_admission_id')
]
df_val.loc[first_high_risk_rows, 'is_onset'] = 1

# risk_level is an intermediate column and is not written out.
df_val = df_val.drop(columns=['risk_level'])
df_val.to_csv(output_path02, index=False)

print(f"Validation set saved to {output_path02}")
print(f"Number of time points in validation set: {len(df_val)}")
print(f"Number of positive cases in validation set: {df_val['is_onset'].sum()}")