-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDatapreprocessing.py
More file actions
93 lines (76 loc) · 3.25 KB
/
Datapreprocessing.py
File metadata and controls
93 lines (76 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Dependencies for the preprocessing script: stdlib (csv, re, os, sys) plus
# BeautifulSoup for HTML parsing and NLTK for stop words and stemming.
import csv
import re
import os
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# NLTK's built-in English stop-word set.
# NOTE(review): `stop_words` is not referenced again in the visible script --
# the filtering below uses `stopwordlist`, which is loaded from the file named
# on the command line. Confirm whether this set is still needed.
stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
import sys
# Single Porter stemmer instance, reused for every token via ps.stem(...).
ps = PorterStemmer()
# Command-line contract: argv[1] is the directory that holds the .html
# documents and argv[2] is the stop-word file. Exit with a usage message
# rather than an opaque IndexError when either argument is missing.
if len(sys.argv) < 3:
    sys.exit("usage: " + sys.argv[0] + " <html_directory> <stopword_file>")
directory = sys.argv[1]
mainlist = {}   # document id -> content tokens, filled by the per-file loop
listlabel = {}  # document id -> label strings, filled by the per-file loop

# A file with the NLTK stop words and the words which occurred more frequently.
# Read through a context manager so the handle is closed as soon as the text
# is in memory; UTF-8 matches how the script opens its other files.
with open(sys.argv[2], 'r', encoding="utf-8") as f1:
    stopwordstr = f1.read()

# re.sub replaces every non-overlapping match of the pattern with the
# replacement. The pattern [^\w] matches any single character that is NOT a
# word character; for str patterns \w is Unicode-aware (letters, digits and
# underscore in any script), and only equals [a-zA-Z0-9_] under re.ASCII.
# Each such character becomes a space, and split() then yields the list of
# words. The pattern is a raw string so "\w" is not treated as an (invalid)
# string escape.
stopwordlist = re.sub(r"[^\w]", " ", stopwordstr).split()

cnt = 0  # NOTE(review): not referenced anywhere else in the visible script
id = 0   # running document id used as the key into mainlist / listlabel
# Walk every .html file in `directory`, extract its title + body text and its
# EUROVOC labels, and store them in `mainlist` / `listlabel` under the running
# document id `id`.
#
# NOTE(review): the source this was recovered from had lost its indentation.
# The structure below assumes `id` is advanced once per .html file after the
# try/except, and that the short-document guard skips the file without
# consuming an id -- confirm against the original layout.

# Built once, outside the loop: set membership is O(1) per token instead of a
# linear scan of the stop-word list, and the pattern is compiled a single time
# (raw string, so "\w" is not an invalid string escape).
stopword_lookup = set(stopwordlist)
non_word = re.compile(r"[^\w]")

for filename in os.listdir(directory):
    if not filename.endswith(".html"):
        continue

    # The context manager closes each handle; the previous code leaked one
    # open file per document.
    with open(os.path.join(directory, filename), 'r', encoding="utf8") as f:
        html_doc = f.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    try:
        # The page is addressed positionally: the 10th child of <body> is the
        # container whose own children carry the text that is extracted.
        p = list(soup.body.children)[9]
        children = list(p.children)  # materialised once and reused below

        # The parent of the second <strong> element holds the label listing.
        label_text = soup.find_all('strong')[1].parent.text

        # Index 35 is read below, so at least 36 children are required. The
        # earlier `< 35` guard let a 35-child document through to an
        # IndexError instead of skipping it here.
        if len(children) < 36:
            continue

        s = children[5].text + children[35].text  # takes the title and the content both
        wordList = non_word.sub(" ", s).split()

        labels = [line for line in label_text.split('\n') if line != '']
        # No EUROVOC descriptors available: neither the labels nor the content
        # are recorded for this document.
        if labels.pop(0) != 'EUROVOC descriptor:':
            raise Exception("incorrect labelling")

        # NOTE(review): stop words are matched case-sensitively *before*
        # lowercasing, so capitalised stop words survive the filter. Kept as
        # is to preserve existing output -- confirm whether that is intended.
        content = [word.lower() for word in wordList if word not in stopword_lookup]
        contentmain = [ps.stem(w) for w in content]  # porter stemming

        # NOTE(review): the original also stemmed the stop-word-filtered labels
        # into a list that was never stored anywhere; that dead computation is
        # dropped and the raw labels are recorded, exactly as before.

        # De-duplicate the tokens; sorting makes the stored order reproducible
        # between runs (plain list(set(...)) order varies with string hashing).
        mainlist[id] = sorted(set(contentmain))
        listlabel[id] = labels
    except (IndexError, AttributeError, NameError) as error:
        # Document does not have the expected layout: report and move on.
        print(error)
    except Exception:
        # Catch-all for anything else (including the "incorrect labelling"
        # error raised above). `except Exception` rather than a bare `except`
        # so that Ctrl-C / SystemExit are no longer swallowed.
        print("dk")

    id = id + 1
# Create two CSV files, one for content and one for labels. Each row is
# [document id, value], where the value is written as the string form of the
# stored list.
#
# Both files are opened with a context manager so they are flushed and closed
# deterministically, and with newline="" as the csv module requires (without
# it, platforms that translate line endings emit an extra blank row between
# records).
with open("content.csv", "w", encoding="utf-8", newline="") as content_file:
    csv.writer(content_file).writerows(mainlist.items())

with open("label.csv", "w", encoding="utf-8", newline="") as label_file:
    csv.writer(label_file).writerows(listlabel.items())