-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain-XSS(linear2).py
More file actions
100 lines (81 loc) · 3.35 KB
/
main-XSS(linear2).py
File metadata and controls
100 lines (81 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import CountVectorizer
# ---------------------------------------------------------------------------
# Data loading & preprocessing
# ---------------------------------------------------------------------------
# Load the XSS/SQL-injection sentence dataset, dropping rows with a NaN
# 'Sentence' so the vectorizer never sees missing text.
ds = pd.read_csv("/home/g_sml/Challenge/datasets-cyber/XSS_SQL.csv")
ds = ds.dropna(subset=['Sentence'])

# Bag-of-words vectorization of the sentences; labels come straight from
# the 'Label' column.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(ds['Sentence'])
y = ds['Label'].values

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build tensor datasets and loaders. The sparse matrices are densified
# into float32 feature tensors with int64 labels.
# NOTE(review): .toarray() materializes the full dense matrix — may be
# memory-hungry for large vocabularies.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = TensorDataset(
    torch.tensor(X_train.toarray()).float(),
    torch.tensor(y_train).long(),
)
test_dataset = TensorDataset(
    torch.tensor(X_test.toarray()).float(),
    torch.tensor(y_test).long(),
)

train_loader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, pin_memory=True
)
test_loader = DataLoader(
    test_dataset, batch_size=64, shuffle=False, pin_memory=True
)
# Model definition
class TinyModel(nn.Module):
    """Feed-forward classifier over bag-of-words sentence vectors.

    Maps an ``input_dim``-sized feature vector through a stack of linear
    layers with ReLU activations and two dropout layers to 2 output
    logits (binary classification). No softmax is applied here: the
    raw logits feed ``nn.CrossEntropyLoss`` during training.

    Fix vs. original: ``self.relu`` was assigned ``nn.ReLU()`` twice
    (a redundant duplicate attribute assignment); the duplicate is
    removed. ReLU is stateless, so sharing one instance is safe and
    the forward trace is unchanged.
    """

    def __init__(self, input_dim):
        """``input_dim``: number of features from the vectorizer."""
        super(TinyModel, self).__init__()
        self.linear1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()  # single shared activation (stateless)
        self.linear2 = nn.Linear(128, 128)
        self.dropout = nn.Dropout(0.22)
        self.linear3 = nn.Linear(128, 128)
        self.linear4 = nn.Linear(128, 64)
        self.linear6 = nn.Linear(64, 64)
        self.dropout1 = nn.Dropout(0.15)
        # 2 output classes for the binary classification problem.
        self.linear5 = nn.Linear(64, 2)

    def forward(self, x):
        """Return the (batch, 2) logits for input ``x`` of shape (batch, input_dim)."""
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.dropout(x)   # dropout before activation, as in original
        x = self.relu(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.linear6(x)   # no activation between linear4 and linear6
        x = self.relu(x)
        x = self.dropout1(x)
        x = self.linear5(x)
        return x
# Number of features produced by the vectorizer.
input_dim = X_train.shape[1]

# Model initialization
model = TinyModel(input_dim).to(device)

# Loss and optimizer definition
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0000002)

# Train for a fixed number of epochs; after each epoch, measure
# accuracy on the held-out test set.
epochs = 250
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_X), batch_y)
        loss.backward()
        optimizer.step()

    # --- evaluation pass ---
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            output = model(batch_X)
            predicted = output.data.max(1)[1]
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    accuracy = correct / total
    # NOTE: the printed loss is that of the LAST training batch of this epoch.
    print(f"Epoch [{epoch+1}/{epochs}], Test Accuracy: {accuracy:.4f}, Loss {loss:.3}")

# Export the trained model to TorchScript and save it to disk.
model_scripted = torch.jit.script(model)  # Export to TorchScript
model_scripted.save('/home/g_sml/Challenge/Modelli/model4-ultimo.pt')  # Save