-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
173 lines (144 loc) · 7.23 KB
/
data_preprocessing.py
File metadata and controls
173 lines (144 loc) · 7.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import math
import numpy as np
from collections import defaultdict
from scapy.all import rdpcap, IP, TCP, UDP, Raw
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# ---------------------------------------------------------------------
# Flow key utility
# ---------------------------------------------------------------------
def get_flow_key(pkt):
    """Build the 5-tuple used to identify the flow that *pkt* belongs to.

    Returns:
        ``(src_ip, dst_ip, src_port, dst_port, ip_proto)`` when the packet
        carries an IP layer together with a TCP or UDP layer; ``None`` for
        every other packet.
    """
    # Guard clauses: anything that is not IP + (TCP or UDP) has no flow key.
    if IP not in pkt:
        return None
    if TCP not in pkt and UDP not in pkt:
        return None

    ip_layer = pkt[IP]
    # Ports are read through the packet itself, exactly as the key was
    # originally built, rather than through an explicit TCP/UDP layer lookup.
    return (ip_layer.src, ip_layer.dst, pkt.sport, pkt.dport, ip_layer.proto)
# ---------------------------------------------------------------------
# Extract flow features from packets
# ---------------------------------------------------------------------
def extract_flow_features(packets):
    """Group packets into bidirectional flows and build one feature row per flow.

    Packets for which ``get_flow_key`` returns a falsy value are skipped.
    The first packet seen for a 5-tuple defines that flow's forward ("fwd")
    direction; a later packet whose reversed 5-tuple matches an existing flow
    is added to that flow as a backward ("bwd") packet.

    Args:
        packets: Iterable of packets supporting ``.time``, ``len()`` and the
            TCP/UDP layer access used below.
            NOTE(review): presumably the output of scapy's ``rdpcap`` - confirm.

    Returns:
        pandas.DataFrame with one row per flow. Column order is the order in
        which the row keys are assigned below. An empty input yields an empty
        DataFrame.
    """
    # (column prefix, TCP flag bit) pairs, in the column order of the output.
    tcp_flag_masks = (
        ("FIN", 0x01),
        ("SYN", 0x02),
        ("RST", 0x04),
        ("PSH", 0x08),
        ("ACK", 0x10),
        ("URG", 0x20),
        ("ECE", 0x40),
    )

    flows = defaultdict(lambda: defaultdict(list))
    for pkt in packets:
        key = get_flow_key(pkt)
        if not key:
            continue
        direction = "fwd"  # forward = as defined in tuple order
        rev_key = (key[1], key[0], key[3], key[2], key[4])
        if rev_key in flows:
            # The opposite direction was seen first: file this packet under
            # the existing flow and mark it as backward traffic.
            key = rev_key
            direction = "bwd"
        flow = flows[key]
        # NOTE(review): timestamps are stored in whatever numeric type
        # ``pkt.time`` has; all time-based features inherit that type.
        flow["times"].append(pkt.time)
        flow["sizes"].append(len(pkt))
        flow["dirs"].append(direction)
        if TCP in pkt:
            flow["tcp_flags"].append(pkt[TCP].flags)
            # dataofs is multiplied by 4 to obtain the header length.
            flow["header_len"].append(pkt[TCP].dataofs * 4)
        elif UDP in pkt:
            flow["header_len"].append(8)
        else:
            # Defensive fallback so header_len stays aligned with dirs.
            flow["header_len"].append(0)

    feature_rows = []
    for key, flow in flows.items():
        times = flow["times"]
        sizes = flow["sizes"]
        dirs = flow["dirs"]

        row = {}
        row["Destination Port"] = key[3]
        row["Flow Duration"] = _time_span(times)

        # Split fwd/bwd packets
        fwd_sizes = [s for s, d in zip(sizes, dirs) if d == "fwd"]
        bwd_sizes = [s for s, d in zip(sizes, dirs) if d == "bwd"]
        fwd_times = [t for t, d in zip(times, dirs) if d == "fwd"]
        bwd_times = [t for t, d in zip(times, dirs) if d == "bwd"]

        # Counts
        row["Total Fwd Packets"] = len(fwd_sizes)
        row["Total Backward Packets"] = len(bwd_sizes)
        row["Total Length of Fwd Packets"] = sum(fwd_sizes)
        row["Total Length of Bwd Packets"] = sum(bwd_sizes)

        # Fwd size stats
        row["Fwd Packet Length Max"] = np.max(fwd_sizes) if fwd_sizes else 0
        row["Fwd Packet Length Min"] = np.min(fwd_sizes) if fwd_sizes else 0
        row["Fwd Packet Length Mean"] = np.mean(fwd_sizes) if fwd_sizes else 0
        row["Fwd Packet Length Std"] = np.std(fwd_sizes) if fwd_sizes else 0

        # Bwd size stats
        row["Bwd Packet Length Max"] = np.max(bwd_sizes) if bwd_sizes else 0
        row["Bwd Packet Length Min"] = np.min(bwd_sizes) if bwd_sizes else 0
        row["Bwd Packet Length Mean"] = np.mean(bwd_sizes) if bwd_sizes else 0
        row["Bwd Packet Length Std"] = np.std(bwd_sizes) if bwd_sizes else 0

        # Flow throughput; a tiny floor avoids dividing by a zero duration.
        duration = row["Flow Duration"] if row["Flow Duration"] > 0 else 1e-6
        row["Flow Bytes/s"] = sum(sizes) / duration
        row["Flow Packets/s"] = len(sizes) / duration
        row["Fwd Packets/s"] = row["Total Fwd Packets"] / duration
        row["Bwd Packets/s"] = row["Total Backward Packets"] / duration

        # Inter-arrival times
        (row["Flow IAT Mean"], row["Flow IAT Std"],
         row["Flow IAT Max"], row["Flow IAT Min"]) = _iat_stats(times)
        # Totals use latest-minus-earliest rather than last-minus-first so an
        # out-of-order capture cannot produce a negative total, matching how
        # Flow Duration and the sorted IAT statistics are computed.
        row["Fwd IAT Total"] = _time_span(fwd_times)
        (row["Fwd IAT Mean"], row["Fwd IAT Std"],
         row["Fwd IAT Max"], row["Fwd IAT Min"]) = _iat_stats(fwd_times)
        row["Bwd IAT Total"] = _time_span(bwd_times)
        (row["Bwd IAT Mean"], row["Bwd IAT Std"],
         row["Bwd IAT Max"], row["Bwd IAT Min"]) = _iat_stats(bwd_times)

        # TCP Flags (counted over both directions; empty for UDP flows)
        flags = flow["tcp_flags"]
        for flag_name, mask in tcp_flag_masks:
            row[f"{flag_name} Flag Count"] = sum((f & mask) != 0 for f in flags)

        # Header lengths
        header_lens = flow["header_len"]
        row["Fwd Header Length"] = sum(h for h, d in zip(header_lens, dirs) if d == "fwd")
        row["Bwd Header Length"] = sum(h for h, d in zip(header_lens, dirs) if d == "bwd")
        row["Fwd Header Length.1"] = row["Fwd Header Length"]

        # Packet length stats
        row["Min Packet Length"] = np.min(sizes) if sizes else 0
        row["Max Packet Length"] = np.max(sizes) if sizes else 0
        row["Packet Length Mean"] = np.mean(sizes) if sizes else 0
        row["Packet Length Std"] = np.std(sizes) if sizes else 0
        row["Packet Length Variance"] = np.var(sizes) if sizes else 0

        # Ratios & averages
        row["Down/Up Ratio"] = (len(bwd_sizes) / len(fwd_sizes)) if fwd_sizes else 0
        row["Average Packet Size"] = row["Packet Length Mean"]
        row["Avg Fwd Segment Size"] = row["Fwd Packet Length Mean"]
        row["Avg Bwd Segment Size"] = row["Bwd Packet Length Mean"]

        # Subflow counts (each flow is treated as a single subflow)
        row["Subflow Fwd Packets"] = len(fwd_sizes)
        row["Subflow Fwd Bytes"] = sum(fwd_sizes)
        row["Subflow Bwd Packets"] = len(bwd_sizes)
        row["Subflow Bwd Bytes"] = sum(bwd_sizes)

        # Init window bytes (placeholder: not derived from the packets)
        row["Init_Win_bytes_forward"] = 0
        row["Init_Win_bytes_backward"] = 0

        # Others (placeholders, requires TCP analysis of payload)
        row["Fwd PSH Flags"] = 0
        row["act_data_pkt_fwd"] = len(fwd_sizes)
        row["min_seg_size_forward"] = min(fwd_sizes) if fwd_sizes else 0

        # Active/Idle times placeholders
        row["Active Mean"] = row["Active Std"] = row["Active Max"] = row["Active Min"] = 0
        row["Idle Mean"] = row["Idle Std"] = row["Idle Max"] = row["Idle Min"] = 0

        feature_rows.append(row)
    return pd.DataFrame(feature_rows)


def _time_span(times):
    """Return latest minus earliest of *times*, or 0 with fewer than two values."""
    return max(times) - min(times) if len(times) > 1 else 0


def _iat_stats(times):
    """Return ``(mean, std, max, min)`` of the gaps between sorted *times*.

    Fewer than two timestamps yield ``(0, 0, 0, 0)``.
    """
    if len(times) < 2:
        return (0, 0, 0, 0)
    iats = np.diff(sorted(times))
    return (np.mean(iats), np.std(iats), np.max(iats), np.min(iats))
# ---------------------------------------------------------------------
# Preprocess for ML
# ---------------------------------------------------------------------
def preprocess_data(df):
    """Mean-impute and standard-scale *df*, returning a new DataFrame.

    A ``SimpleImputer(strategy="mean")`` and a ``StandardScaler`` are both
    fitted on *df* itself. The result reuses ``df.columns`` as its column
    labels.

    Any exception raised along the way is reported with ``print`` and an
    empty DataFrame is returned instead of propagating the error.
    """
    try:
        # Everything, including reading df.columns, stays inside the try so
        # that invalid input falls through to the empty-frame fallback.
        filled = SimpleImputer(strategy="mean").fit_transform(df)
        standardized = StandardScaler().fit_transform(filled)
        result = pd.DataFrame(standardized, columns=df.columns)
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        result = pd.DataFrame()
    return result
def load_model(model_path):
    """Load and return the object stored at *model_path* using ``joblib.load``.

    NOTE(review): ``joblib.load`` unpickles the file, so only model files
    from a trusted source should be passed here.
    """
    # Function-scope import: joblib is only needed once a model is loaded.
    import joblib

    return joblib.load(model_path)