-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
120 lines (99 loc) · 4.05 KB
/
app.py
File metadata and controls
120 lines (99 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from flask import Flask, render_template, request
app = Flask(__name__)
from urllib.request import urlopen as ur
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import logging
from nltk.corpus import stopwords
import nltk
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# English stop words from NLTK plus a handful of extra terms.
# NOTE(review): no function visible in this file reads `stop_words`; the
# extra terms look domain-specific -- confirm whether they should be applied
# during sentence cleaning.
stop_words = stopwords.words('english')
stop_words += [
    'mainframe',
    'tech',
    'lead',
    'work',
    'want',
    'developer',
    'program',
    'olf',
    'br',
]
import fasttext
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from scipy.cluster.hierarchy import dendrogram, linkage
# Values submitted through the /result form. This list is module-level state
# shared by every request; /scrape iterates it as the URLs to download.
corpus_list = list()
# Module-level placeholder for the cluster count.
# NOTE(review): no function visible here declares it `global`, so request
# handlers never update this value.
cluster = 0
@app.route('/')
def home():
    """Serve the landing page by rendering the ``form.html`` template."""
    landing_page = render_template('form.html')
    return landing_page
@app.route('/result',methods = ['POST', 'GET'])
def result():
    """Store submitted form values in the module-level ``corpus_list``.

    On POST, every form value is stripped of surrounding whitespace and, if
    non-blank, appended to ``corpus_list``. ``scrape`` later passes each entry
    to ``urllib.request.Request``, which rejects an empty string, so blank
    fields are skipped here instead of failing there.

    Returns:
        The rendered ``form.html`` template, for both POST and GET, so the
        view always produces a response.
    """
    if request.method == 'POST':
        # Only the values matter; the field names are not used anywhere.
        submitted = (value.strip() for value in request.form.values())
        corpus_list.extend(value for value in submitted if value)
    return render_template("form.html")
def _extract_text(html, tags=('p', 'li')):
    """Return the ASCII-only text of every ``tags`` element in ``html``.

    Results are grouped by tag in the order given (all ``p`` elements first,
    then all ``li`` elements).
    """
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    texts = []
    for tag in tags:
        for element in soup.find_all(tag):
            # Drop any character that cannot be represented in ASCII.
            texts.append(element.text.encode('ascii', 'ignore').decode('ascii'))
    return texts


def _write_corpus(texts, path='text_corpus.txt'):
    """Write the words of ``texts`` to ``path``, each followed by one space."""
    # The context manager guarantees the file is flushed and closed before
    # /predict hands it to fasttext.
    with open(path, 'w') as corpus_file:
        for text in texts:
            words = text.split()
            if words:
                corpus_file.write(' '.join(words) + ' ')


@app.route('/scrape',methods=['POST','GET'])
def scrape():
    """Download every URL in ``corpus_list`` and build ``text_corpus.txt``.

    On POST, each URL is fetched with a browser-like ``User-Agent`` header,
    the text of its ``<p>`` and ``<li>`` elements is collected with non-ASCII
    characters removed, empty strings are discarded, and the remaining words
    are written space-separated to ``text_corpus.txt``.

    Returns:
        The rendered ``input.html`` template, for both POST and GET.

    Raises:
        ValueError, urllib.error.URLError: propagated from ``urllib`` when an
            entry of ``corpus_list`` is not a fetchable URL.
    """
    if request.method != 'POST':
        return render_template("input.html")

    import os
    import ssl
    from urllib.request import Request, urlopen

    # SECURITY NOTE(review): unless PYTHONHTTPSVERIFY is set, this disables
    # TLS certificate verification for the whole process, not just these
    # downloads. Kept because it appears deliberate -- confirm it is wanted.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    corpus_text = []
    # SECURITY NOTE(review): these URLs come straight from the /result form,
    # so the server will fetch whatever address a user supplies.
    for url in corpus_list:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:
            page = response.read()
        corpus_text.extend(_extract_text(page))

    _write_corpus([text for text in corpus_text if text])
    return render_template("input.html")
def _clean_sentences(texts, stop_set):
    """Lower-case each text, keep only ASCII letters, and drop ``stop_set`` words."""
    cleaned = []
    for text in texts:
        # str() guards against non-string cells (numbers, NaN), which re.sub
        # would otherwise reject with a TypeError.
        words = re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()
        cleaned.append(' '.join(word for word in words if word not in stop_set))
    return cleaned


@app.route('/predict',methods=['POST','GET'])
def predict():
    """Cluster the sentences of an uploaded Excel sheet and save the labels.

    On POST the view:
      1. reads the uploaded ``dataset`` file and the ``clusters`` form field;
      2. trains a fasttext skipgram model on ``text_corpus.txt``;
      3. cleans the first column of the sheet (letters only, lower-case,
         NLTK English stop words removed);
      4. looks up one vector per cleaned sentence in the model;
      5. runs agglomerative clustering with the requested number of clusters;
      6. writes the original sentences plus a ``Cluster`` column to
         ``result.xlsx`` (sheet ``Sheet1``).

    Returns:
        The rendered ``input.html`` template. A ``label`` message is passed
        when no file was uploaded, when the cluster count is missing or not
        an integer, and after a successful run. A non-POST request renders
        the template without a label.
    """
    if request.method != 'POST':
        return render_template('input.html')

    upload = request.files.get('dataset')
    if not upload:
        return render_template('input.html', label="No file")

    # Validate the cluster count before the expensive training step; a missing
    # field previously surfaced as an unbound local variable.
    try:
        total_clusters = int(request.form.get('clusters', ''))
    except ValueError:
        return render_template('input.html', label="Invalid number of clusters")

    # Give the domain-specific corpus to fasttext for training word vectors.
    model = fasttext.skipgram('text_corpus.txt', 'model', lr=0.1, dim=300)

    # The sentences to cluster are taken from the first column of the sheet.
    new_data = pd.read_excel(upload)
    y = new_data.iloc[:, 0].values

    # Build the stop-word set once instead of once per word.
    # NOTE(review): the module-level `stop_words` list (with extra terms) is
    # not applied here, matching the original behavior -- confirm intent.
    english_stop_words = set(stopwords.words('english'))
    corpus = _clean_sentences(y, english_stop_words)

    # Each cleaned sentence is passed to the fasttext model as a single key.
    vector = [model[sentence] for sentence in corpus]

    agg_cluster = AgglomerativeClustering(n_clusters=total_clusters)
    assigned_clusters = agg_cluster.fit_predict(vector)

    df = pd.DataFrame(data=y)
    df['Cluster'] = assigned_clusters
    # Writing by path lets pandas create, save and close the workbook itself;
    # the original opened an ExcelWriter that was never saved.
    df.to_excel('result.xlsx', sheet_name='Sheet1', index=False)
    return render_template('input.html', label="Clusters written to result.xlsx")