-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBloomFilter.py
More file actions
158 lines (122 loc) · 4.77 KB
/
BloomFilter.py
File metadata and controls
158 lines (122 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Project #2: Bloom Filter with multiple hashes.
Author: Diego Martinez Garcia
Pip installs: mmh3 & bitarray
"""
import mmh3, math, sys, csv
from bitarray import bitarray
class BloomFilter(object):
'''
Class for Bloom filter, using murmur3 hash function
'''
def __init__(self, num_items:int, false_pos:int):
'''
items_count : int
Number of expected items to be stored in bloom filter
false_pos : float
False Positive probability in decimal
'''
# Size for bit array
self.size = BloomFilterSize(num_items, false_pos)
# Number of hash functions to use
self.hash_count = HashAmount(self.size, num_items)
# Creation of bitarray with all values equal to cero
self.bit_array = bitarray(self.size)
self.bit_array.setall(0)
def add_to_filter(self, item:str):
'''
Add an item to the filter
item : str
Item to be added to the filter
'''
# Saving bits that will be turned on the bitarray.
bits_pos = []
for i in range(self.hash_count):
#Creating hash fuction using 'i' as the seed value and saving hash value
hash_value = mmh3.hash(item, i) % self.size
bits_pos.append(hash_value)
# Turning bit on using hash value
self.bit_array[hash_value] = True
def in_filter(self, item:str):
'''
Check if item is in filter
item : int
item to be searhed
'''
for i in range(self.hash_count):
hash_value = mmh3.hash(item, i) % self.size
if self.bit_array[hash_value] == False:
# If condition is true, the item is not in filter
return False
#Else it may be in the filter
return True
def BloomFilterSize(num_items:int, false_pos:int):
'''
Return the size of bit array to used using
this formula:
size = ceil((num_items * log(false_pos)) / log(1 / pow(2, log(2))))
num_items : int
number of items expected to be stored in filter
false_pos : float
False Positive probability in decimal
'''
size = math.ceil((num_items * math.log(false_pos)) / math.log(1 / pow(2, math.log(2))))
return int(size)
def HashAmount(num_items: int, size: int):
'''
Return the hash function to be used using
this formula:
k = (size/num_items) * lg(2)
size : int
size of bit array
num_items : int
number of items expected to be stored in filter
'''
r = num_items / size
k = math.log(2) * r
return int(k)
if __name__ == "__main__":
#False positivity value:
false_pos = 0.0000001
num_items = 0
#Checking if arguments where passed
if len(sys.argv) > 1:
#Saving file that was passed through the commandline in a variable
input_db = sys.argv[1]
check_db = sys.argv[2]
# Test case made by me:
input_db_test = "test1.csv"
check_db_test = "test2.csv"
#Variable that will store the emails found on the csv
emails = []
with open(input_db, 'r') as csvfile:
#Creation of file reader
csvreader = csv.reader(csvfile)
for row in csvreader:
#Saving email
email = row[0]
#Checking if the row has the heathers/titles
if email == "Email":
pass
#Saving email on array
else:
emails.append(email)
#Ceating the BloomFilter Object
bloom_filter = BloomFilter(len(emails), false_pos)
#Adding emails to filter:
for email in emails:
bloom_filter.add_to_filter(email)
with open(check_db, 'r') as csvfile2:
#Creation of second file reader
csvreader2 = csv.reader(csvfile2)
for row in csvreader2:
#Saving email
email_check = row[0]
#Checking if the row has the heathers/titles
if email_check == "Email": pass
#Printing if email is in the filter
else:
if bloom_filter.in_filter(email_check) == True:
print(f"{email_check},Probably in the DB")
else:
print(f"{email_check},Not in the DB")