-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlearn.py
More file actions
91 lines (88 loc) · 2.67 KB
/
learn.py
File metadata and controls
91 lines (88 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pickle
import re
import xml.etree.ElementTree

import numpy
from bs4 import BeautifulSoup
import gensim
from gensim import corpora, models, similarities
from sklearn import linear_model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.feature_extraction import FeatureHasher
# One-time setup: the online classifier, the prebuilt gensim artifacts, and
# the parsed Posts.xml dump that every pass below iterates over.
clf = linear_model.Perceptron()  # was: Perceptron() -- name was never imported
dictionary = corpora.Dictionary.load('question.dict')
corpus = corpora.MmCorpus('question.mm')
e = xml.etree.ElementTree.parse('Posts.xml').getroot()
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
questions = {}
# Pickle files are binary; text mode 'r' breaks under Python 3.
with open('tm_out', 'rb') as doc:
    documents = pickle.load(doc)
# Pass 1: for every question row, build questions[Id] -> a tab-separated
# feature string of the question's tags plus any topic-model terms found in
# `documents` (the pickled tm_out mapping).
TAG_RE = re.compile(r'<([^>]*)>')  # Tags attr looks like "<python><regex>"
for row in e.findall('row'):
    if row.get('PostTypeId') != '1':  # '1' == question
        continue
    post_id = row.get('Id')
    # Start from any previously accumulated string (duplicate Id rows), else
    # the empty seed '' so the join reproduces the original leading '\t'.
    parts = [questions[post_id]] if post_id in questions else ['']
    raw_tags = row.get('Tags')
    # Guard: a row with no Tags attribute would crash re on None.
    if raw_tags:
        parts.extend(TAG_RE.findall(raw_tags))
    if post_id in documents:
        # presumably items[0] is a topic/term identifier -- TODO confirm
        # against whatever wrote tm_out.
        parts.extend(str(items[0]) for items in documents[post_id])
    questions[post_id] = '\t'.join(parts)
# Pass 2 setup: fresh label/feature accumulators and an LDA model (50 topics)
# over the question corpus. NOTE: question.dict / question.mm were already
# loaded at the top of the script; the reload is kept for parity but is
# redundant. The unused `d = {}` from the original was dropped.
y = []
X = []
dictionary = corpora.Dictionary.load('question.dict')
mm = corpora.MmCorpus('question.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=50)
# Pass 2: train a hashed-feature Perceptron on answer rows -- features are
# the parent question's tag string plus scaled Score/CommentCount, labels are
# the answer Ids -- and persist it as tags.model.
featureHasher = FeatureHasher(input_type='pair')  # each sample is a list of (name, value) pairs
y = []
X = []
for row in e.findall('row'):
    if row.get('PostTypeId') != '2':  # '2' == answer
        continue
    if row.get('OwnerUserId') is None:  # skip anonymous/deleted authors, as before
        continue
    feats = []
    parent_id = row.get('ParentId')
    if parent_id in questions:  # orphaned answers would KeyError otherwise
        feats.append((questions[parent_id], 1.0))
    s = row.get('Score')
    c = row.get('CommentCount')
    if s is not None:
        feats.append(('s', float(s) / 100.0))
    if c is not None:
        feats.append(('c', float(c) / 100.0))
    y.append(row.get('Id'))
    # The original appended the numeric features to an undefined `x` AFTER
    # X.append(tags), so they never reached the sample (NameError).
    X.append(feats)
X = featureHasher.transform(X)
y = numpy.array(y)
# partial_fit requires the full label set on its first call; the original
# referenced an undefined `allClasses`.
model = clf.partial_fit(X, y, classes=numpy.unique(y))
# Pickle output must be opened in binary mode.
with open('tags.model', 'wb') as fout:
    pickle.dump(model, fout)
# Pass 3: same features as pass 2 plus the LDA topic mixture of the answer's
# body text, persisted as postctopic.model.
featureHasher = FeatureHasher(input_type='pair')
# Fresh accumulators: the original kept appending to the X/y that pass 2 had
# already turned into a sparse matrix / ndarray, which cannot work.
y = []
X = []
for row in e.findall('row'):
    if row.get('PostTypeId') != '2':
        continue
    if row.get('OwnerUserId') is None:
        continue
    feats = []
    parent_id = row.get('ParentId')
    if parent_id in questions:
        feats.append((questions[parent_id], 1.0))
    s = row.get('Score')
    c = row.get('CommentCount')
    if s is not None:
        feats.append(('s', float(s) / 100.0))
    if c is not None:
        feats.append(('c', float(c) / 100.0))
    # Strip HTML from the body (explicit parser avoids bs4's "no parser
    # specified" ambiguity) and fold the LDA topic weights into the SAME
    # sample: the original X.append(lda[query_bow]) created a second,
    # label-less row per answer, desynchronizing X and y.
    text = BeautifulSoup(row.get('Body') or '', 'html.parser').get_text()
    query_bow = dictionary.doc2bow(text.lower().split())
    feats.extend(('topic_%d' % topic, float(weight)) for topic, weight in lda[query_bow])
    y.append(row.get('Id'))
    X.append(feats)
X = featureHasher.transform(X)
y = numpy.array(y)
# was: classes=allClassesy -- an undefined name (typo of allClasses, itself undefined)
model = clf.partial_fit(X, y, classes=numpy.unique(y))
with open('postctopic.model', 'wb') as fout:
    pickle.dump(model, fout)