forked from capjamesg/build-a-search-index
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtfidf.py
More file actions
72 lines (54 loc) · 1.74 KB
/
tfidf.py
File metadata and controls
72 lines (54 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import math
import string
from collections import defaultdict
import time
# Toy corpus of three lyrics. The three-element list is repeated 20000 times,
# giving 60000 entries; every repeat references the same three dict objects.
documents = [
    {"title": title, "lyric": lyric}
    for title, lyric in (
        ("tolerate it", "I made you my temple, my mural, my sky"),
        (
            "my tears ricochet",
            "And I still talk to you when I'm screaming at the sky",
        ),
        ("The Bolter", "Started with a kiss"),
    )
] * 20000
# Corpus size (repeated entries included); the N in log(N / df) below.
document_count = len(documents)
# word -> number of documents that contain the word (document frequency).
word_counts = defaultdict(int)
# title -> {word: occurrences of the word / total words in that lyric}.
# Documents that share a title overwrite each other's entry.
document_term_frequencies = {}
# word -> log(document_count / document frequency of the word).
inverse_document_frequencies = {}

# The translation table never changes, so build it once instead of per document.
_strip_punctuation = str.maketrans("", "", string.punctuation)

for doc in documents:
    # Normalise the lyric: lowercase, drop punctuation, split on whitespace.
    lyric = doc["lyric"].lower().translate(_strip_punctuation)
    tokens = lyric.split()
    number_of_words = len(tokens)

    term_counts = defaultdict(int)
    for word in tokens:
        term_counts[word] += 1

    # Count each distinct word once per document. Counting every occurrence
    # (as before) inflates the denominator for words repeated within a single
    # lyric and can push the IDF to zero or below.
    for word in term_counts:
        word_counts[word] += 1

    document_term_frequencies[doc["title"]] = {
        word: count / number_of_words for word, count in term_counts.items()
    }

for word, count in word_counts.items():
    inverse_document_frequencies[word] = math.log(document_count / count)
def tfidf(query, documents=documents):
    """Rank documents against a query by their summed TF-IDF score.

    The query is lowercased and stripped of punctuation before it is split on
    whitespace, mirroring how lyrics are normalised when the index is built,
    so that a query such as "Sky!" matches the indexed word "sky".

    Args:
        query: Free-text search string.
        documents: Iterable of dicts that each have a "title" key. Defaults
            to the module-level ``documents`` list.

    Returns:
        A list of ``(title, score)`` tuples, one per distinct title, sorted by
        score from highest to lowest. Titles with equal scores keep the order
        in which they were first seen. A query word missing from a document
        or from the IDF table contributes 0 to the score.

    Raises:
        KeyError: If a document's title is not present in
            ``document_term_frequencies``.
    """
    normalised = query.lower().translate(str.maketrans("", "", string.punctuation))
    words = normalised.split()
    results = {}

    for doc in documents:
        title = doc["title"]
        # Scores are keyed by title and computed only from the per-title
        # index, so a repeated title would just recompute the same value.
        if title in results:
            continue
        term_frequencies = document_term_frequencies[title]
        results[title] = sum(
            term_frequencies.get(word, 0) * inverse_document_frequencies.get(word, 0)
            for word in words
        )

    results = sorted(results.items(), key=lambda x: x[1], reverse=True)
    return results
# Benchmark: run the same query ten times and print every ranked result.
# perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring intervals; time.time() can jump if the system clock
# is adjusted. Note the measured span also includes the time spent printing.
start = time.perf_counter()
for _ in range(10):
    for result in tfidf("my sky started with a kiss"):
        print(result)
end = time.perf_counter()
print("Time taken for tfidf:", end - start)