-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdist_model_1.py
More file actions
44 lines (30 loc) · 1.21 KB
/
dist_model_1.py
File metadata and controls
44 lines (30 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import math
import re
from collections import Counter
# Tokenising pattern: \w matches any alphanumeric character or the
# underscore, and + extends the match to one or more such characters,
# so each match is one maximal run of word characters.
WORD = re.compile(r'\w+')


def textToVector(text: str) -> Counter:
    """Return a bag-of-words count vector for *text*.

    The text is split into tokens with the module-level ``WORD`` pattern;
    punctuation and whitespace act only as separators. Tokens are counted
    exactly as they appear, so matching is case-sensitive ("The" and "the"
    are different keys).

    Args:
        text: The string to tokenise.

    Returns:
        A ``Counter`` mapping each distinct token to its number of
        occurrences. Empty input yields an empty ``Counter``.
    """
    # Counter stores the tokens as dict keys and their counts as dict values.
    return Counter(WORD.findall(text))
def cosDistance(vector1, vector2) -> float:
    """Return the cosine of the angle between two sparse count vectors.

    NOTE: despite the name, the value returned is the cosine *similarity*
    (dot product divided by the product of the Euclidean norms), not
    ``1 - similarity``. The name is kept so existing callers keep working.

    Args:
        vector1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vector2: Mapping from term to numeric weight (e.g. a ``Counter``).

    Returns:
        ``dot(vector1, vector2) / (|vector1| * |vector2|)`` as a float, or
        ``0.0`` when either vector has zero length (including empty input),
        which avoids a division by zero.
    """
    # Only terms present in both vectors contribute to the dot product.
    shared = set(vector1) & set(vector2)
    numerator = sum(vector1[term] * vector2[term] for term in shared)

    # Squared Euclidean norm of each vector.
    sum1 = sum(weight ** 2 for weight in vector1.values())
    sum2 = sum(weight ** 2 for weight in vector2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return numerator / denominator
def readFile(fileName: str) -> str:
    """Return the full text of a file in the sibling ``../data/`` directory.

    The path is resolved relative to the current working directory, and the
    file is opened in text mode with the platform's default encoding.

    Args:
        fileName: Name of the file inside ``../data/``.

    Returns:
        The entire file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read (for example
            ``FileNotFoundError`` when it does not exist).
    """
    # The context manager closes the handle even if read() raises; the
    # bare open(...).read() form left the file open until garbage collection.
    with open("../data/" + fileName, 'r') as handle:
        return handle.read()
# Driver: load the two sample documents, convert each to a word-count
# vector, and print the cosine measure between the two vectors.
text1, text2 = (
    readFile(sample) for sample in ("nowIsTheTime.txt", "quickBrownFox.txt")
)
vector1, vector2 = map(textToVector, (text1, text2))
cosine = cosDistance(vector1, vector2)
print("Cosine Distance:\t", cosine)