forked from ucalyptus/dirac-dev
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathKeyword_Extract.py
More file actions
31 lines (26 loc) · 972 Bytes
/
Keyword_Extract.py
File metadata and controls
31 lines (26 loc) · 972 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 4 13:25:43 2019
@author: Shatadru Majumdar
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def keyword_extraction(tfidf_vector, document_list, n=12):
    """Return the top-weighted TF-IDF terms for each document.

    Each document is transformed on its own with the already-fitted
    ``tfidf_vector``; the vocabulary is ordered by descending weight and
    the first ``n`` terms are kept.  Because the whole vocabulary is
    ranked, zero-weight terms fill the tail of a result when a document
    has fewer than ``n`` scored terms.

    Args:
        tfidf_vector: Fitted vectorizer exposing ``transform`` and either
            ``get_feature_names_out`` or the legacy ``get_feature_names``.
        document_list: Document strings to extract keywords from.
        n: Maximum number of keywords kept per document (default 12).

    Returns:
        A list with one NumPy array of terms per document, in input order.
    """
    documents = list(document_list)
    if not documents:
        # Nothing to rank; avoid touching the vectorizer at all.
        return []

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only for older releases.
    get_names = getattr(tfidf_vector, "get_feature_names_out", None)
    if get_names is None:
        get_names = tfidf_vector.get_feature_names
    # The vocabulary is the same for every document, so build it once.
    feature_array = np.asarray(get_names())

    keywords = []
    for doc in documents:
        scores = tfidf_vector.transform([doc]).toarray()
        # argsort is ascending; reverse it to rank highest weight first.
        ranking = np.argsort(scores).flatten()[::-1]
        keywords.append(feature_array[ranking][:n])
    return keywords
# Script body: split the report into paragraphs and print each one's keywords.
fileName = "report4.txt"
# The context manager closes the file handle even if reading fails.
with open(fileName, "r", encoding='utf8') as report_file:
    document = report_file.read()
# Each blank-line-separated chunk of the report is treated as one document.
document_list = document.split("\n\n")
tfidf_vector = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
# Fitting must happen before keyword_extraction calls transform().
response = tfidf_vector.fit_transform(document_list)
keywords = keyword_extraction(tfidf_vector, document_list)
keywords = np.asarray(keywords)
print(keywords)