"""TopicModelling.py

Builds a gensim dictionary and bag-of-words corpus from review documents
stored in MongoDB, then trains and saves an LDA topic model.
"""
from pymongo import MongoClient
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from gensim.corpora import BleiCorpus
# Connect to the local MongoDB server and select the collection that holds
# the review documents.
mongo_client = MongoClient("mongodb://localhost:27017/")
corpus_collection = mongo_client["Dataset_Challenge_Reviews"]["Corpus"]

# Two separate cursors over the same collection: the first is consumed right
# here to build the dictionary, the second is kept for the later pass that
# converts each review to a bag-of-words vector.
corpus_cursor = corpus_collection.find()
mycorpus_cursor = corpus_collection.find()

# Build the token <-> id mapping from the 'words' field of every review.
dictionary = corpora.Dictionary(doc['words'] for doc in corpus_cursor)

# Prune the vocabulary (keep_n caps it at 10000 tokens; gensim's other
# filter_extremes thresholds stay at their defaults), then compact the ids.
dictionary.filter_extremes(keep_n=10000)
dictionary.compactify()

# Persist the dictionary so other scripts can reload the same mapping.
dictionary.save("DataModels/dictionary.dict")
# Convert every review into a bag-of-words vector with the dictionary built
# earlier in this script, printing the running document index as a progress
# indicator.  print(...) with a single argument behaves the same on Python 2
# and Python 3, unlike the former `print i` statement.
ncorpus = []
for i, review in enumerate(mycorpus_cursor):
    print(i)
    ncorpus.append(dictionary.doc2bow(review["words"]))

# Write the vectors to disk through gensim's BleiCorpus writer, then reopen
# the file as a corpus object so later steps read it from disk.
# NOTE(review): the file is produced by BleiCorpus even though its name ends
# in ".mm"; the path is kept unchanged because other code may rely on it.
# NOTE(review): serialize() is called without id2word; consider passing
# id2word=dictionary so the vocabulary side file holds the real tokens --
# confirm against the gensim BleiCorpus documentation before changing.
corpora.BleiCorpus.serialize("DataModels/corpus.mm", ncorpus)
dcorpus = corpora.BleiCorpus("DataModels/corpus.mm")
# Train a 60-topic LDA model on the serialized corpus, using the dictionary
# to map token ids back to words, and save the trained model to disk.
lda = gensim.models.LdaModel(dcorpus, num_topics=60, id2word=dictionary)
lda.save("DataModels/lda_model_topics.lda")

# Print each topic prefixed with its position in the returned list.
# print(...) with a single argument works on both Python 2 and Python 3,
# unlike the former print statement; the output text is unchanged.
for i, topic in enumerate(lda.show_topics(num_topics=60)):
    print('#' + str(i) + ': ' + str(topic))