# Simple-HTML-Extractor-/Extractor.py at master · the-real-yey/Simple-HTML-Extractor- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import csv
import os

def extract(path):
    """Collect the user-visible strings of one HTML file.

    Three sources are harvested, in this order: ``mattooltip`` attribute
    values, text nodes (HTML comments are removed first) and
    ``placeholder`` attribute values.  A candidate is kept only when, after
    stripping surrounding whitespace, it

    * is not empty,
    * does not consist solely of ASCII digits,
    * does not contain both ``{{`` and ``}}`` (template interpolation),
    * does not belong to a tag whose name is in the blacklist below, and
    * is not already present in ``output``.

    Every kept term is appended to the module-level list ``output`` and
    written as a ``[term, path]`` row through the module-level csv
    ``writer``; both names must be bound before this function is called.

    Args:
        path: Location of the HTML file to parse.

    Returns:
        None.  When the file cannot be opened or parsed, a message is
        printed and nothing is recorded for that file.
    """
    try:
        # BeautifulSoup consumes the whole handle while constructing the
        # tree, so the file can be closed as soon as the constructor returns.
        with open(path) as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
    except Exception:
        # Best effort: one unreadable file must not abort the whole run, but
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        print("error happend when parser html file:"+path)
        return

    # Discard comments so they are not picked up as text nodes below.
    for comment in soup(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Tag names whose content is never user-visible text.
    blacklist = {
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'script',
        'style',
        # there may be more elements you don't want
    }

    def record(raw, tag_name):
        """Apply the shared filters to *raw* and store it if it passes."""
        term = raw.strip()
        if not term:
            # Whitespace-only nodes and empty attribute values carry no term.
            return
        if term.encode('UTF-8').isdigit():
            return
        if '{{' in term and '}}' in term:
            return
        if tag_name in blacklist:
            return
        if term in output:
            return
        output.append(term)
        writer.writerow([term, path])

    # Attribute values are attributed to the tag that carries them; text
    # nodes are attributed to their parent tag.
    for tag in soup.find_all(mattooltip=True):
        record(tag['mattooltip'], tag.name)
    for node in soup.find_all(text=True):
        record(node, node.parent.name)
    for tag in soup.find_all(placeholder=True):
        record(tag['placeholder'], tag.name)


def findHTML(path):
    """Recursively collect the ``.html`` files found under *path*.

    Directory entries are visited in the order ``os.listdir`` returns them;
    sub-directories are descended into as they are encountered.  The path of
    every file whose name ends in ``.html`` is appended to the module-level
    list ``htmls``, which must be bound before this function is called.

    Paths are built with a literal ``/`` separator, so the recorded strings
    have the form ``<path>/<sub>/<file>.html``.

    Args:
        path: Directory to scan.

    Returns:
        None.  Results are accumulated in ``htmls``.
    """
    for entry in os.listdir(path):
        entry_path = path + '/' + entry
        if os.path.isdir(entry_path):
            findHTML(entry_path)
        elif entry.endswith('.html'):
            # Require a real ".html" suffix: comparing only the text after
            # the last dot would also accept an extensionless file that is
            # literally named "html".
            htmls.append(entry_path)


if __name__ == "__main__":
    # extract() and findHTML() read and update these module-level names, so
    # they have to be bound here, at module scope, before either is called.
    output = []  # terms recorded so far, in first-seen order
    htmls = []   # paths of the HTML files discovered under `path`
    path = 'src'
    findHTML(path)

    csv_name = "names.csv"
    # The CSV is opened in append mode, so rows accumulate across runs.
    # Write the header only when the file is new or empty; otherwise every
    # run would insert another header row in the middle of the data.
    needs_header = (not os.path.exists(csv_name)
                    or os.path.getsize(csv_name) == 0)

    with open(csv_name, 'a+', newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        if needs_header:
            writer.writerow(['Term', 'Url'])
        for p in htmls:
            print('Current File' + p)
            extract(p)

    # Echo every collected term once the CSV has been written and closed.
    for te in output:
        print(te)