-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_semantic_map_for_notes.rb
More file actions
69 lines (57 loc) · 1.59 KB
/
create_semantic_map_for_notes.rb
File metadata and controls
69 lines (57 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
require 'sqlite3'
require 'date'
require 'stash'
class Array
  # Compatibility shim: only interpreters that do not already provide
  # Array#count get this fallback. Defining it unconditionally would replace
  # the built-in and break its other call forms (`count` with no argument and
  # `count { |x| ... }`) for every Array in the process.
  unless method_defined?(:count)
    # Returns how many elements of the array match +t+, computed as the
    # number of elements removed by the array difference `self - [t]`.
    def count(t)
      length - (self - [t]).length
    end
  end
end
# Read every row of the notes table, then release the database handle: the
# rest of the script works only on the rows already held in memory.
db = SQLite3::Database.open("notes.db")
begin
  notes = db.execute("select * from notes;")
ensure
  db.close
end

# Reduce each row to a [post ID, cleaned text] pair.
#   * post ID: the last "/"-separated segment of the row's final column
#     (the permalink), converted with to_i -- a non-numeric segment becomes 0.
#   * text: the row's third column, downcased, with HTML tags removed and then
#     every character other than letters, digits, "-" and whitespace removed.
#     Tags are deleted without inserting a space, so words separated only by
#     a tag end up joined.
notes = notes.map do |row|
  post_id = row.last.split("/").last.to_i
  text = row[2]
    .downcase
    .gsub(/<\/?[^>]*>/, "")
    .gsub(/[^a-zA-Z0-9\-\s]/, "")
  [post_id, text]
end
index = {}         # Maps each post ID to the list of words in that post (duplicates kept).
reverse_index = {} # Maps each word to the list of post IDs containing it (each ID once).

# Walk the [post ID, text] pairs and fill both indexes.
notes.each do |id, text|
  # Argument-less split breaks on runs of whitespace and ignores leading
  # whitespace, so text that starts with a space or newline does not produce
  # an empty-string "word".
  words = text.split
  index[id] = words

  # Record the post once per distinct word. Appending once per occurrence
  # would make the length of a reverse_index entry the total number of
  # occurrences rather than the number of posts containing the word.
  words.uniq.each do |term|
    (reverse_index[term] ||= []) << id
  end
end
# Normalized term frequency within a document.
#
# term  - the word to measure
# doc   - key identifying the document in +index+
# index - Hash mapping each document key to its Array of words
#
# Returns a Float: the number of occurrences of +term+ in the document
# divided by the document's total word count. An empty document yields 0.0
# instead of the NaN that 0 / 0.0 would produce.
def tf(term, doc, index)
  words = index[doc]
  return 0.0 if words.empty?

  words.count(term) / words.length.to_f
end
# Inverse document frequency of a term.
#
# term          - the word to measure
# index         - Hash with one entry per document; only its size is used
# reverse_index - Hash mapping each word to an Array of document keys
#
# Returns a Float: log(N / (n + 1)), where N is the number of entries in
# +index+ and n is the length of the list stored for +term+ in
# +reverse_index+. A term with no entry is treated as n = 0; the "+ 1"
# keeps the division defined in that case. The result is negative whenever
# n + 1 exceeds N.
def idf(term, index, reverse_index)
  doc_total = index.length
  doc_freq = (reverse_index[term] || []).length
  Math.log(doc_total / (doc_freq + 1).to_f)
end
# Weight of +term+ in +doc+: its term frequency in that document multiplied
# by its inverse document frequency across the indexed documents.
def tf_idf(term, doc, index, reverse_index)
  term_frequency = tf(term, doc, index)
  inverse_doc_frequency = idf(term, index, reverse_index)
  term_frequency * inverse_doc_frequency
end
# Maps each word to a list of [tf-idf weight of the word in a doc, doc ID] pairs.
semantics = {}

index.each do |doc, words|
  p doc # prints each document ID as it is processed
  words.uniq.each do |word|
    weight = tf_idf(word, doc, index, reverse_index)
    (semantics[word] ||= []) << [weight, doc]
  end
end

# Hand the finished map to ObjectStash for storage at ./semantics.stash.
ObjectStash.store semantics, './semantics.stash'