-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
99 lines (68 loc) · 3.12 KB
/
preprocessing.py
File metadata and controls
99 lines (68 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from nltk import ngrams
import os
from math import log
def opcode_file_to_ngrams(file_name, n=3):
    """Read one opcode per line from *file_name* and build its n-grams.

    Args:
        file_name: Path of the text file to read; each line is treated as
            one element of the sequence (trailing whitespace removed).
        n: Size of the n-grams handed to ``ngrams`` (defaults to 3).

    Returns:
        Whatever ``nltk.ngrams`` produces for the opcode sequence.
    """
    with open(file_name) as handle:
        # Strip only the right-hand side so the newline is dropped.
        opcodes = [raw_line.rstrip() for raw_line in handle]
    return ngrams(opcodes, n)
def add_ngram_ocurrence(ngram, is_malicious, ngrams_ocurrences):
    """Count one more occurrence of *ngram* in the given class.

    Each record in *ngrams_ocurrences* is a four-element list; index 0 is
    the benign counter and index 1 the malicious counter. Indexes 2 and 3
    are initialised to 0 here and left untouched.

    Args:
        ngram: Key identifying the n-gram.
        is_malicious: Truthy to bump the malicious counter, falsy for benign.
        ngrams_ocurrences: Dict of records, updated in place.
    """
    counter_slot = 1 if is_malicious else 0
    # A missing n-gram starts from an all-zero record before being counted.
    ngrams_ocurrences.setdefault(ngram, [0, 0, 0, 0])[counter_slot] += 1
def get_directory_opcodes(directory, is_malicious, ngrams_ocurrences):
    """Register the n-grams of every opcode file found in *directory*.

    Entries whose name ends in ``.txt`` or starts with ``.`` are skipped,
    as are entries that are not regular files (e.g. subdirectories, which
    would otherwise make ``open`` fail). Each remaining file contributes
    every distinct n-gram once, so the counters in *ngrams_ocurrences*
    count files, not raw occurrences.

    Args:
        directory: Directory to scan (not recursive).
        is_malicious: Class flag forwarded to ``add_ngram_ocurrence``.
        ngrams_ocurrences: Dict of n-gram records, updated in place.

    Returns:
        The number of files that were processed.
    """
    cardinality = 0
    for file in os.listdir(directory):
        if file.endswith(".txt") or file.startswith("."):
            continue
        path = os.path.join(directory, file)
        if not os.path.isfile(path):
            # Subdirectories and other non-file entries cannot be read.
            continue
        cardinality += 1
        # De-duplicate so a file counts at most once per n-gram.
        for gram in set(opcode_file_to_ngrams(path)):
            add_ngram_ocurrence(gram, is_malicious, ngrams_ocurrences)
    return cardinality
def dataset_document_frequency(class_cardinality, ngram_cardinality):
    """Return ``log(ngram_cardinality) / log(class_cardinality)``.

    The ratio is 1.0 when the two arguments are equal and 0.0 when
    *ngram_cardinality* is 1 (for a class larger than one).

    Args:
        class_cardinality: Number of files in the class; must be positive.
        ngram_cardinality: Number of files containing the n-gram; must be
            positive.

    Returns:
        The logarithmic ratio as a float.

    Raises:
        ValueError: If either argument is not positive.
    """
    if class_cardinality <= 0 or ngram_cardinality <= 0:
        raise ValueError("cardinalities must be positive")
    if class_cardinality == 1:
        # log(1) == 0 would divide by zero; use the value the ratio takes
        # whenever both cardinalities are equal.
        return 1.0
    return log(ngram_cardinality) / log(class_cardinality)
def document_frequency(benign_cardinality, malicious_cardinality, ngrams_dataset):
    """Fill in the per-class document-frequency slots of every record.

    For each record, a non-zero benign count (index 0) stores its score at
    index 2, and a non-zero malicious count (index 1) stores its score at
    index 3. Slots whose count is zero are left as they are.

    Args:
        benign_cardinality: Number of benign files.
        malicious_cardinality: Number of malicious files.
        ngrams_dataset: Dict of n-gram records, updated in place.

    Returns:
        The same *ngrams_dataset* object.
    """
    # (count index, score index, class size) for benign, then malicious.
    layout = (
        (0, 2, benign_cardinality),
        (1, 3, malicious_cardinality),
    )
    for record in ngrams_dataset.values():
        for count_idx, score_idx, class_size in layout:
            if record[count_idx] != 0:
                record[score_idx] = dataset_document_frequency(class_size, record[count_idx])
    return ngrams_dataset
def document_frequency_feature_extraction(ngram_dataset, L):
    """Select up to *L* n-grams with the highest document-frequency scores.

    The n-grams are ranked twice, by the benign score (record index 2) and
    by the malicious score (record index 3), both descending. The two
    rankings are walked in lockstep, taking the benign candidate and then
    the malicious one at each rank, skipping n-grams already chosen.

    The result is in selection order, so it is deterministic for a given
    dataset. If the dataset holds fewer than *L* distinct n-grams, all of
    them are returned instead of ``None``.

    Args:
        ngram_dataset: Dict mapping n-gram to its record.
        L: Maximum number of features to return.

    Returns:
        A list of at most *L* n-gram keys (empty if *L* is not positive or
        the dataset is empty).
    """
    by_benign = sorted(ngram_dataset, key=lambda g: ngram_dataset[g][2], reverse=True)
    by_malicious = sorted(ngram_dataset, key=lambda g: ngram_dataset[g][3], reverse=True)

    selected_features = []
    seen = set()
    for candidates in zip(by_benign, by_malicious):
        for gram in candidates:
            if len(selected_features) >= L:
                return selected_features
            if gram not in seen:
                seen.add(gram)
                selected_features.append(gram)
    # Rankings exhausted before reaching L: return what is available.
    return selected_features
def file_feature_vector_extraction(file, features):
    """Build a presence vector of *features* for one opcode file.

    Args:
        file: Path of the opcode file, passed to ``opcode_file_to_ngrams``.
        features: Iterable of n-grams to look for; they must be hashable.

    Returns:
        A list of booleans, one per feature and in the same order, telling
        whether that n-gram occurs in the file.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole n-gram list once per feature.
    file_ngrams = set(opcode_file_to_ngrams(file))
    return [feature in file_ngrams for feature in features]
def test_number_of_ngrams():
    """Print every n-gram record followed by the per-class count totals.

    Reads the module-level ``ngrams_ocurrences`` dict. The benign total is
    the sum of index 0 over all records, the malicious total the sum of
    index 1.
    """
    for gram, record in ngrams_ocurrences.items():
        print(f'{gram}: {record}')

    benign_total = sum(record[0] for record in ngrams_ocurrences.values())
    malicious_total = sum(record[1] for record in ngrams_ocurrences.values())

    print(f'benign ngrams: {benign_total}')
    print(f'malicious ngrams: {malicious_total}')
# Directories holding the opcode dumps of each class.
benign_path = '/Users/andreugirones/Documents/tfm/virustotal_samples/ELF/benign_opcodes'
malicious_path = '/Users/andreugirones/Documents/tfm/virustotal_samples/ELF/malicious_opcodes'

# One shared n-gram table, filled in place by both directory scans.
ngrams_ocurrences = {}
benign_cardinality = get_directory_opcodes(benign_path, False, ngrams_ocurrences)
malicious_cardinality = get_directory_opcodes(malicious_path, True, ngrams_ocurrences)
ngrams_ocurrences = document_frequency(benign_cardinality, malicious_cardinality, ngrams_ocurrences)

# Keep 25 n-grams as features.
selected_features = document_frequency_feature_extraction(ngrams_ocurrences, 25)

# Feature vector of the benign 'ls' sample, with booleans converted to 0/1.
file_feature_vector = [
    int(flag)
    for flag in file_feature_vector_extraction('/'.join((benign_path, 'ls')), selected_features)
]