malware_detector/preprocessing.py at main · agirones/malware_detector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from nltk import ngrams
import os
from math import log


def opcode_file_to_ngrams(file_name, n=3):
  """Build the n-grams of the opcode sequence stored in *file_name*.

  The file is read line by line; trailing whitespace (including the
  newline) is stripped from each line and the resulting strings form the
  sequence passed to ``nltk.ngrams``.

  Args:
    file_name: path of the file to read.
    n: size of each n-gram (defaults to 3).

  Returns:
    Whatever ``ngrams(sequence, n)`` returns for the stripped lines.
  """
  with open(file_name) as opcode_file:
    opcodes = [raw_line.rstrip() for raw_line in opcode_file]

  return ngrams(opcodes, n)


def add_ngram_ocurrence(ngram, is_malicious, ngrams_ocurrences):
  """Increment the benign or malicious counter of *ngram* in place.

  Every record in *ngrams_ocurrences* is a four-element list. Index 0 is
  the benign counter and index 1 the malicious counter; indices 2 and 3
  start at 0 and are not touched here.

  Args:
    ngram: key identifying the n-gram.
    is_malicious: truthy to bump the malicious counter, falsy for benign.
    ngrams_ocurrences: dict mapping n-grams to their records; mutated.
  """
  # Index 1 holds the malicious count, index 0 the benign one.
  slot = 1 if is_malicious else 0

  if ngram not in ngrams_ocurrences:
    ngrams_ocurrences[ngram] = [0, 0, 0, 0]

  ngrams_ocurrences[ngram][slot] += 1


def get_directory_opcodes(directory, is_malicious, ngrams_ocurrences):
  """Record the n-grams of every eligible file in *directory*.

  Entries whose name ends with ".txt" or starts with "." are ignored.
  Each remaining entry is parsed with ``opcode_file_to_ngrams`` and every
  distinct n-gram it contains is counted once for that file via
  ``add_ngram_ocurrence``.

  Args:
    directory: path of the directory to scan.
    is_malicious: class flag forwarded to ``add_ngram_ocurrence``.
    ngrams_ocurrences: dict of n-gram records; updated in place.

  Returns:
    The number of files that were processed.
  """
  processed_files = 0

  for entry in os.listdir(directory):
    if entry.endswith(".txt") or entry.startswith("."):
      continue

    # A set makes each n-gram count at most once per file.
    distinct_grams = set(opcode_file_to_ngrams(directory + '/' + entry))
    processed_files += 1
    for gram in distinct_grams:
      add_ngram_ocurrence(gram, is_malicious, ngrams_ocurrences)

  return processed_files


def dataset_document_frequency(class_cardinality, ngram_cardinality):
  """Return log(ngram_cardinality) / log(class_cardinality).

  The ratio is 0.0 when the n-gram appears in a single file and 1.0 when
  it appears in every file of the class.

  Args:
    class_cardinality: number of files in the class.
    ngram_cardinality: number of those files that contain the n-gram.

  Returns:
    The log-scaled frequency as a float.

  Raises:
    ValueError: if either argument is not positive (from ``math.log``).
    ZeroDivisionError: if class_cardinality is 1 and ngram_cardinality
      is not 1.
  """
  # A class with a single file gives log(1)/log(1) = 0/0. The n-gram is
  # present in every file of that class, so use the same value (1.0) that
  # any larger class yields when the two cardinalities are equal.
  if class_cardinality == 1 and ngram_cardinality == 1:
    return 1.0

  return log(ngram_cardinality)/log(class_cardinality)


def document_frequency(benign_cardinality, malicious_cardinality, ngrams_dataset):
  """Fill in the per-class frequency slots of every n-gram record.

  For each record, index 2 receives
  ``dataset_document_frequency(benign_cardinality, record[0])`` when the
  benign count (index 0) is non-zero, and index 3 receives the malicious
  equivalent when the malicious count (index 1) is non-zero. Slots whose
  count is zero are left as they are.

  Args:
    benign_cardinality: number of benign files.
    malicious_cardinality: number of malicious files.
    ngrams_dataset: dict mapping n-grams to their records; mutated.

  Returns:
    The same *ngrams_dataset* object, updated in place.
  """
  for record in ngrams_dataset.values():
    benign_count = record[0]
    malicious_count = record[1]

    if benign_count != 0:
      record[2] = dataset_document_frequency(benign_cardinality, benign_count)
    if malicious_count != 0:
      record[3] = dataset_document_frequency(malicious_cardinality, malicious_count)

  return ngrams_dataset


def document_frequency_feature_extraction(ngram_dataset, L):
  """Select up to *L* n-grams with the highest per-class frequencies.

  The records are ranked twice, by index 2 (benign frequency) and by
  index 3 (malicious frequency), both in descending order. The two
  rankings are then walked in lockstep, taking the benign candidate
  before the malicious one at each rank and skipping n-grams that were
  already selected.

  Args:
    ngram_dataset: dict mapping n-grams to four-element records.
    L: maximum number of features to return.

  Returns:
    A list of at most *L* distinct n-grams in selection order. The list
    is shorter than *L* when the dataset holds fewer distinct n-grams,
    and empty when *L* is not positive or the dataset is empty.
  """
  if L <= 0:
    return []

  sorted_benign = sorted(ngram_dataset.items(), key=lambda x: x[1][2], reverse=True)
  sorted_malicious = sorted(ngram_dataset.items(), key=lambda x: x[1][3], reverse=True)

  # A list keeps the selection order; truncating a set would drop an
  # arbitrary element and make the feature order differ between runs.
  selected_features = []
  seen = set()
  for b, m in zip(sorted_benign, sorted_malicious):
    for gram in (b[0], m[0]):
      if gram in seen:
        continue
      seen.add(gram)
      selected_features.append(gram)
      if len(selected_features) == L:
        return selected_features

  # Fewer than L distinct n-grams exist: return what was found rather
  # than falling off the end and returning None.
  return selected_features


def file_feature_vector_extraction(file, features):
  """Return the presence vector of *features* for the given file.

  Args:
    file: path of the file, parsed with ``opcode_file_to_ngrams``.
    features: iterable of n-grams to look for.

  Returns:
    A list of booleans, one per entry of *features* and in the same
    order: True when that n-gram occurs in the file.
  """
  # Build the set once so each membership test is a hash lookup instead
  # of a linear scan over every n-gram of the file.
  file_ngrams = set(opcode_file_to_ngrams(file))
  return [f in file_ngrams for f in features]


def test_number_of_ngrams():
  """Print every record of the module-level ``ngrams_ocurrences`` dict.

  After the per-n-gram lines, prints the sum of the benign counters
  (index 0) and the sum of the malicious counters (index 1) over all
  records.
  """
  # totals[0] accumulates benign counts, totals[1] malicious counts.
  totals = [0, 0]

  for gram, record in ngrams_ocurrences.items():
    print(f'{gram}: {record}')
    totals[0] += record[0]
    totals[1] += record[1]

  benign_total, malicious_total = totals
  print(f'benign ngrams: {benign_total}')
  print(f'malicious ngrams: {malicious_total}')


# Directories scanned by get_directory_opcodes, one per class.
# NOTE(review): these are absolute, machine-specific paths and this whole
# section runs at import time.
benign_path = '/Users/andreugirones/Documents/tfm/virustotal_samples/ELF/benign_opcodes'
malicious_path = '/Users/andreugirones/Documents/tfm/virustotal_samples/ELF/malicious_opcodes'

# Shared n-gram -> [benign count, malicious count, benign df, malicious df]
# table; also read by test_number_of_ngrams.
ngrams_ocurrences = {}

# Count, per class, how many files contain each n-gram. The return values
# are the number of files processed in each directory.
benign_cardinality = get_directory_opcodes(benign_path, False, ngrams_ocurrences)
malicious_cardinality = get_directory_opcodes(malicious_path, True, ngrams_ocurrences)

# Fill in the frequency slots (indices 2 and 3) of every record.
ngrams_ocurrences = document_frequency(benign_cardinality, malicious_cardinality, ngrams_ocurrences)

# Keep 25 n-grams chosen from the top of the benign and malicious rankings.
selected_features = document_frequency_feature_extraction(ngrams_ocurrences, 25)

# Build the feature vector of the 'ls' file in the benign directory, then
# convert its booleans to 0/1 integers.
file_feature_vector = file_feature_vector_extraction(benign_path + '/' + 'ls', selected_features)
file_feature_vector = [int(b) for b in file_feature_vector]