-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtopic.py
More file actions
87 lines (72 loc) · 2.95 KB
/
topic.py
File metadata and controls
87 lines (72 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
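Every paper touches on one or more topics. First determine the topic words,
then, based on the papers published by each 'team', extract the topics that
team works on most often.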
"""
import codecs
import re
import json
from utils import load_data, load_teams, pat
from gensim.corpora import Dictionary
from gensim.models import LdaModel
def load_stopwords(file_path='./data/topic/stop_words'):
    """Read the stop-word list, one word per line.

    Args:
        file_path: Path of a UTF-8 text file holding one stop word per line.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace (blank lines become empty strings).
    """
    # The context manager closes the handle even if decoding fails part-way.
    with codecs.open(file_path, 'r', encoding='utf8') as stop_file:
        return [line.strip() for line in stop_file.readlines()]
# Train the LDA topic model on paper titles.
def train(data, save_path='./data/topic/', num_topics=10):
    """Fit an LDA model on paper titles and save it with its dictionary.

    Args:
        data: Iterable of paper records; each record must provide a
            'title' string.
        save_path: Prefix that the output file names are appended to by
            plain string concatenation, so a directory must end with a
            path separator. NOTE(review): nothing here creates the
            'model/' sub-directory - presumably it must already exist.
        num_topics: Number of topics the LDA model is trained with.

    Returns:
        The trained LdaModel.
    """
    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(load_stopwords())
    # Normalise each title with the shared `pat` substitution, lowercase it,
    # split on whitespace and drop stop words.
    train_data = [
        [w for w in pat.sub('', item['title']).lower().split()
         if w not in stop_words]
        for item in data
    ]
    # Bag-of-words corpus for the LDA model: every document becomes a list
    # of (token id, frequency) pairs.
    dictionary = Dictionary(train_data)
    dictionary.save(save_path + 'titles.dict')
    corpus = [dictionary.doc2bow(text) for text in train_data]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    lda.save(save_path + 'model/lda.model')
    return lda
def load_team_works(data, team_path='./data/author_list/team'):
    """Group paper titles by the team that authored them.

    Args:
        data: Iterable of paper records; each record must provide 'author'
            (an iterable of author names) and 'title'.
        team_path: Location passed to load_teams() to obtain the teams.
            Each team is used as a set of author names (it is tested with
            issubset and its members are joined into a string).

    Returns:
        A dict mapping a team key - the team's members sorted and joined
        with ',' - to the list of titles credited to that team. Teams
        without any matching paper do not appear.
    """
    team_set = load_teams(team_path)
    team_papers = {}
    for item in data:
        # Build the author set once per paper instead of once per team.
        authors = set(item['author'])
        for team in team_set:
            if team.issubset(authors):
                key = ','.join(sorted(team))
                team_papers.setdefault(key, []).append(item['title'])
                # A paper is credited to the first matching team only.
                break
    return team_papers
def get_topics(lda, team_papers, dict_path='./data/topic/titles.dict'):
    """Assign the most probable LDA topic to every paper of every team.

    Args:
        lda: Trained LDA model; indexing it with a bag-of-words yields
            (topic id, probability) pairs.
        team_papers: Dict mapping a team key to a list of paper titles.
        dict_path: Path of the saved gensim Dictionary used to convert
            titles to bag-of-words.

    Returns:
        A dict mapping each team key to a list with one topic label per
        paper (see parse_topic), in the order of the team's titles. A
        title for which the model returns no topic contributes no label.
    """
    team_topics = {}
    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(load_stopwords())
    dictionary = Dictionary.load(dict_path)
    for team, papers in team_papers.items():
        labels = team_topics.setdefault(team, [])
        for title in papers:
            # Tokenise exactly as train() does (pattern substitution plus
            # lowercasing); otherwise capitalised or punctuated tokens
            # never match the dictionary built at training time.
            tokens = pat.sub('', title).lower().split()
            words = [w for w in tokens if w not in stop_words]
            title_bow = dictionary.doc2bow(words)
            topic_dist = lda[title_bow]
            if not topic_dist:
                # No topic reported for this title; max() would raise on
                # an empty sequence, so skip the title instead.
                continue
            topic_prob = max(topic_dist, key=lambda x: x[1])
            str_topic = lda.print_topic(topic_prob[0])
            labels.append(parse_topic(str_topic))
    return team_topics
# Use the leading keywords of a topic description as its label.
def parse_topic(str_topic, idx=3):
    """Build a short label from a printed topic description.

    Args:
        str_topic: Topic description in which each keyword is enclosed in
            double quotes.
        idx: How many of the leading keywords to keep.

    Returns:
        The first `idx` double-quoted keywords joined by single spaces;
        an empty string when no quoted keyword is present.
    """
    quoted_terms = re.compile(r'\"(.+?)\"').findall(str_topic)
    return ' '.join(quoted_terms[:idx])
if __name__ == '__main__':
    # Reuse the model already stored on disk; no training happens here.
    model = LdaModel.load('./data/topic/model/lda.model')
    # Print every topic of the loaded model for inspection.
    for learned_topic in model.print_topics():
        print(learned_topic)

    papers = load_data()
    papers_by_team = load_team_works(papers)
    # Write the per-team topic labels as a single JSON document.
    with codecs.open('./data/topic/team_topics.json', 'w', encoding='utf8') as out_file:
        topics_by_team = get_topics(model, papers_by_team)
        out_file.write(json.dumps(topics_by_team))