-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_unigram.py
More file actions
46 lines (37 loc) · 1.12 KB
/
get_unigram.py
File metadata and controls
46 lines (37 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import nltk
import os
import re
from nltk import bigrams
from nltk.probability import FreqDist
from pprint import pprint
def get_unigram(Classlabel,path,IterSet):
    """Count unigram (single-token) occurrences for one class label.

    For every ``idx`` in ``IterSet`` the directory
    ``<path>/<idx>/<Classlabel>`` is listed and each file in it is read,
    stripped of a fixed set of punctuation characters (backtick, double
    quote, parentheses, plus, period, comma, apostrophe) and tokenized with
    ``nltk.tokenize.word_tokenize``.

    Args:
        Classlabel: name of the class sub-directory to read (the script
            entry point of this file passes 'neg').
        path: root directory that contains one sub-directory per ``idx``.
        IterSet: iterable of sub-directory names; each item is converted
            with ``str()`` when the directory path is built.

    Returns:
        A plain ``dict`` mapping each token to its raw occurrence count,
        summed over every file that was read. The counts are NOT
        normalized into probabilities.

    Raises:
        OSError / IOError: propagated unchanged from ``os.listdir`` or
            ``open`` when a directory or file cannot be read.
    """
    Prob_dict = dict()
    for idx in IterSet:
        fpath = path + "/" + str(idx) + "/" + Classlabel
        # Parenthesized so the line parses as a print statement on Python 2
        # and as a function call on Python 3; the output is identical.
        print(fpath)
        for fil in os.listdir(fpath):
            # The context manager closes the handle even if reading fails.
            with open(fpath + "/" + fil, "r") as fh:
                text = fh.read()
            text = re.sub("[`\"()+.,\']", '', text)
            for word in nltk.tokenize.word_tokenize(text):
                # Accumulate across files. dict.update() would replace a
                # word's running total with the count from the latest file
                # only, which is not a corpus-wide frequency.
                Prob_dict[word] = Prob_dict.get(word, 0) + 1
    return Prob_dict
if __name__ == "__main__":
    # Script entry point: build the unigram counts for the 'neg' class from
    # sub-directories 1 through 4 of the dataset root. Pass 'pos' as the
    # class label instead to process the other class.
    temp = get_unigram(
        Classlabel='neg',
        path='/home/chintu/NLP/hw1/dataset',
        IterSet=set([1, 2, 3, 4]),
    )