-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExtractor.py
More file actions
107 lines (83 loc) · 2.99 KB
/
Extractor.py
File metadata and controls
107 lines (83 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import csv
import os
def extract(path):
    """Collect candidate UI terms from one HTML file.

    The file at *path* is parsed with BeautifulSoup's ``html.parser``. HTML
    comments are removed, then three sources are scanned in this order:

    1. values of ``mattooltip`` attributes,
    2. text nodes,
    3. values of ``placeholder`` attributes.

    Every candidate is stripped of surrounding whitespace and recorded unless
    it is empty, consists only of ASCII digits, contains both ``{{`` and
    ``}}``, belongs to a tag listed in the skip set below, or is already in
    ``output``.

    Recording means appending the term to the module-level ``output`` list
    and writing a ``[term, path]`` row through the module-level ``writer``;
    both globals must exist before this function is called.

    If the file cannot be opened or parsed, a message is printed and the
    function returns without recording anything. Returns ``None``.
    """
    try:
        # The with-block closes the handle once parsing is done. The file is
        # still decoded with the platform default encoding, as before.
        with open(path) as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
    except Exception:
        # Best effort: report the file and move on. ``Exception`` rather than
        # a bare ``except`` so Ctrl-C / SystemExit are not swallowed.
        print("error happend when parser html file:"+path)
        return

    # Discard comments so they are not picked up as text nodes below.
    for comment in soup(text=lambda node: isinstance(node, Comment)):
        comment.extract()

    # Tag names whose text / attribute values are never recorded.
    skipped_tags = {
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'script',
        'style',
    }

    def is_interpolation(term):
        # True when the term contains both an opening and a closing
        # double-brace marker.
        return '{{' in term and '}}' in term

    def record(raw, tag_name):
        # Apply the shared filters to one candidate and record it if it passes.
        term = raw.strip()
        if not term:
            # Whitespace-only nodes would otherwise be recorded as an empty term.
            return
        if term.encode('UTF-8').isdigit():
            # bytes.isdigit() only accepts ASCII digits.
            return
        if is_interpolation(term):
            return
        if tag_name in skipped_tags:
            return
        if term in output:
            return
        output.append(term)
        writer.writerow([term, path])

    for tag in soup.find_all(mattooltip=True):
        record(tag['mattooltip'], tag.name)
    for node in soup.find_all(text=True):
        # A text node is judged by the tag that contains it.
        record(node, node.parent.name)
    for tag in soup.find_all(placeholder=True):
        record(tag['placeholder'], tag.name)
def findHTML(path):
    """Recursively collect the paths of ``.html`` files below *path*.

    Every file whose name ends with ``.html`` is appended to the module-level
    ``htmls`` list, which must exist before the call. Sub-directories are
    descended into recursively. Returns ``None``.

    Entries are visited in sorted order so the resulting list is the same on
    every run; ``os.listdir`` itself returns names in arbitrary order.
    """
    for entry in sorted(os.listdir(path)):
        # Paths are built with '/' so the recorded strings keep the same
        # shape as before.
        full_path = path + '/' + entry
        if os.path.isdir(full_path):
            findHTML(full_path)
        elif entry.endswith('.html'):
            # endswith() requires a real ".html" suffix; a file named just
            # "html" (no extension) is not a match.
            htmls.append(full_path)
if __name__ == "__main__":
    # Script entry point: find every .html file under 'src', extract terms
    # from each one into names.csv, then print the collected terms.
    #
    # `output`, `htmls` and `writer` are deliberately module-level names:
    # findHTML() and extract() read them as globals.
    output = []
    htmls = []
    path = 'src'
    findHTML(path)

    csv_path = "names.csv"
    # The CSV is opened in append mode, so the header row is only written when
    # the file is new or empty; otherwise every run would insert another
    # header line in the middle of the data.
    write_header = (
        not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    )
    with open(csv_path, 'a+', newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        if write_header:
            writer.writerow(['Term', 'Url'])
        for p in htmls:
            print('Current File' + p)
            extract(p)

    # Echo every distinct term collected during this run.
    for te in output:
        print(te)