-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlearn.py
More file actions
91 lines (88 loc) · 2.67 KB
/
learn.py
File metadata and controls
91 lines (88 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pickle
import re
import xml.etree.ElementTree

import numpy
from bs4 import BeautifulSoup
import gensim
from gensim import corpora, models, similarities
from sklearn import linear_model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.feature_extraction import FeatureHasher
# One-time setup: the online classifier, the prebuilt gensim artifacts, and
# the parsed Posts.xml dump that every pass below iterates over.
clf = linear_model.Perceptron()  # was: Perceptron() -- name was never imported
dictionary = corpora.Dictionary.load('question.dict')
corpus = corpora.MmCorpus('question.mm')
e = xml.etree.ElementTree.parse('Posts.xml').getroot()
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
questions = {}
# Pickle files are binary; text mode 'r' breaks under Python 3.
with open('tm_out', 'rb') as doc:
    documents = pickle.load(doc)
# Pass 1: for every question row, build questions[Id] -> a tab-separated
# feature string of the question's tags plus any topic-model terms found in
# `documents` (the pickled tm_out mapping).
TAG_RE = re.compile(r'<([^>]*)>')  # Tags attr looks like "<python><regex>"
for row in e.findall('row'):
    if row.get('PostTypeId') != '1':  # '1' == question
        continue
    post_id = row.get('Id')
    # Start from any previously accumulated string (duplicate Id rows), else
    # the empty seed '' so the join reproduces the original leading '\t'.
    parts = [questions[post_id]] if post_id in questions else ['']
    raw_tags = row.get('Tags')
    # Guard: a row with no Tags attribute would crash re on None.
    if raw_tags:
        parts.extend(TAG_RE.findall(raw_tags))
    if post_id in documents:
        # presumably items[0] is a topic/term identifier -- TODO confirm
        # against whatever wrote tm_out.
        parts.extend(str(items[0]) for items in documents[post_id])
    questions[post_id] = '\t'.join(parts)
# Pass 2 setup: fresh label/feature accumulators and an LDA model (50 topics)
# over the question corpus. NOTE: question.dict / question.mm were already
# loaded at the top of the script; the reload is kept for parity but is
# redundant. The unused `d = {}` from the original was dropped.
y = []
X = []
dictionary = corpora.Dictionary.load('question.dict')
mm = corpora.MmCorpus('question.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=50)
# Pass 2: train a hashed-feature Perceptron on answer rows -- features are
# the parent question's tag string plus scaled Score/CommentCount, labels are
# the answer Ids -- and persist it as tags.model.
featureHasher = FeatureHasher(input_type='pair')  # each sample is a list of (name, value) pairs
y = []
X = []
for row in e.findall('row'):
    if row.get('PostTypeId') != '2':  # '2' == answer
        continue
    if row.get('OwnerUserId') is None:  # skip anonymous/deleted authors, as before
        continue
    feats = []
    parent_id = row.get('ParentId')
    if parent_id in questions:  # orphaned answers would KeyError otherwise
        feats.append((questions[parent_id], 1.0))
    s = row.get('Score')
    c = row.get('CommentCount')
    if s is not None:
        feats.append(('s', float(s) / 100.0))
    if c is not None:
        feats.append(('c', float(c) / 100.0))
    y.append(row.get('Id'))
    # The original appended the numeric features to an undefined `x` AFTER
    # X.append(tags), so they never reached the sample (NameError).
    X.append(feats)
X = featureHasher.transform(X)
y = numpy.array(y)
# partial_fit requires the full label set on its first call; the original
# referenced an undefined `allClasses`.
model = clf.partial_fit(X, y, classes=numpy.unique(y))
# Pickle output must be opened in binary mode.
with open('tags.model', 'wb') as fout:
    pickle.dump(model, fout)
# Pass 3: same features as pass 2 plus the LDA topic mixture of the answer's
# body text, persisted as postctopic.model.
featureHasher = FeatureHasher(input_type='pair')
# Fresh accumulators: the original kept appending to the X/y that pass 2 had
# already turned into a sparse matrix / ndarray, which cannot work.
y = []
X = []
for row in e.findall('row'):
    if row.get('PostTypeId') != '2':
        continue
    if row.get('OwnerUserId') is None:
        continue
    feats = []
    parent_id = row.get('ParentId')
    if parent_id in questions:
        feats.append((questions[parent_id], 1.0))
    s = row.get('Score')
    c = row.get('CommentCount')
    if s is not None:
        feats.append(('s', float(s) / 100.0))
    if c is not None:
        feats.append(('c', float(c) / 100.0))
    # Strip HTML from the body (explicit parser avoids bs4's "no parser
    # specified" ambiguity) and fold the LDA topic weights into the SAME
    # sample: the original X.append(lda[query_bow]) created a second,
    # label-less row per answer, desynchronizing X and y.
    text = BeautifulSoup(row.get('Body') or '', 'html.parser').get_text()
    query_bow = dictionary.doc2bow(text.lower().split())
    feats.extend(('topic_%d' % topic, float(weight)) for topic, weight in lda[query_bow])
    y.append(row.get('Id'))
    X.append(feats)
X = featureHasher.transform(X)
y = numpy.array(y)
# was: classes=allClassesy -- an undefined name (typo of allClasses, itself undefined)
model = clf.partial_fit(X, y, classes=numpy.unique(y))
with open('postctopic.model', 'wb') as fout:
    pickle.dump(model, fout)