grad-statement-analysis/HTMLutils.py at master · Rypo/grad-statement-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from bs4 import BeautifulSoup
from FaiText import Tokenizer, defaults
from types import SimpleNamespace

# Marker tokens inserted into the text in place of HTML structure.
TK_QUOB = 'xxquob'  # placed before a quoted block (see wrap_quotes)
TK_QUOE = 'xxquoe'  # placed after a quoted block
TK_HLB = 'xxhlb'    # placed before a generically highlighted span
TK_HLE = 'xxhle'    # placed after a generically highlighted span

TK_DEL = 'xxdel'    # per-word marker, mapped to <del> by default
TK_BLD = 'xxbld'    # per-word marker, mapped to <b>/<strong> by default
TK_ITL = 'xxitl'    # per-word marker, mapped to <i>/<em> by default

TK_RED = 'xxred'    # per-word marker for spans with class "r"
TK_BLU = 'xxblu'    # per-word marker for spans with class "b"
TK_GRN = 'xxgrn'    # per-word marker for spans with class "g"

# Every custom token defined above, in declaration order.
# NOTE(review): presumably consumed by the tokenizer as special cases -- confirm.
CSPEC_CASE = [
    TK_QUOB, TK_QUOE, TK_HLB, TK_HLE,
    TK_DEL, TK_BLD, TK_ITL,
    TK_RED, TK_BLU, TK_GRN,
]

def to_soup(htext):
    """Return `htext` as a BeautifulSoup object.

    An existing BeautifulSoup instance is handed back unchanged (same
    object); anything else is parsed with the 'html.parser' backend.
    """
    if isinstance(htext, BeautifulSoup):
        return htext
    return BeautifulSoup(htext, 'html.parser')

def tokwrap(sent, insertion):
    """Prefix every word of `sent` with the token `insertion`.

    A word is any maximal run of non-whitespace characters. The whitespace
    between words (newlines, tabs, repeated spaces) is kept as it is, so
    exactly one token is emitted per word: words separated by a newline are
    still marked, and repeated, leading or trailing spaces no longer produce
    tokens that are not followed by a word. Text made of single-space
    separated words yields the same result as a plain space replacement.

    Args:
        sent: str, text whose words should be marked
        insertion: str, token to put before each word; whitespace around
            it is ignored

    Returns:
        str, the marked text padded with one leading and one trailing space
    """
    import re  # local import keeps this helper self-contained

    token = insertion.strip()  # normalize spacing around tokens
    # A callable replacement keeps any backslash in `token` literal.
    marked = re.sub(r'\S+', lambda word: f'{token} {word.group(0)}', sent)
    return f' {marked} '

def _insert_outer(ele, head_tok, tail_tok):
    """Surround the contents of a soup element with two tokens, in place.

    ` head_tok ` becomes the first child and ` tail_tok ` the last child of
    `ele`; the element's own tag is then unwrapped so that only the tokens
    and the original children remain in the tree.

    Args:
        ele: soup element to modify (must still be attached to a parent)
        head_tok: str, token placed before the element's contents
        tail_tok: str, token placed after the element's contents
    """
    ele.insert(0, f' {head_tok} ')
    # append() adds the marker as a single string; extend() would iterate the
    # str and add one separate text node per character.
    ele.append(f' {tail_tok} ')
    ele.unwrap()

def findall_replace(text, token_map=None, **kwargs):
    """Replace tagged spans with their text, marking each word with a token

    Every element matching a tag in `token_map` is swapped for its plain
    text, passed through `tokwrap` with the token mapped to that tag.

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        token_map: dict, mapping of {TOK:'tag'} or {TOK:['tag1','tag2',..]}
            (default: {TK_DEL: 'del', TK_BLD: ['b', 'strong'], TK_ITL: ['i', 'em']} )
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object after replacement
    """
    if token_map is None:
        token_map = {TK_DEL: 'del', TK_BLD: ['b', 'strong'], TK_ITL: ['i', 'em']}

    soup = to_soup(text)
    for token, tag_names in token_map.items():
        for element in soup.find_all(tag_names):
            marked_text = tokwrap(element.get_text(), token)
            element.replace_with(marked_text)
    return soup

def process_spantags(text, max_tok_ins=5, **kwargs):
    """Replace span tags with corresponding color/highlight tokens

    A span whose first class is 'r', 'b' or 'g' and whose text has at most
    `max_tok_ins` words is replaced by its text with the matching color
    token before every word. Every other span (too long, no class, an empty
    class attribute, or a class that is not a known color) keeps its
    contents and is surrounded by the generic highlight begin/end tokens,
    so no raw <span> markup is left behind.

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        max_tok_ins: int, maximum number of specific color tokens
                     before defaulting to generic highlight token
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object after replacement
    """
    color_tokens = {'r': TK_RED, 'b': TK_BLU, 'g': TK_GRN}

    soup = to_soup(text)
    for span in soup.find_all('span'):
        inner_text = span.get_text()
        # `class` may be missing or empty; never index into it blindly.
        classes = span.get('class') or []
        color_tok = color_tokens.get(classes[0]) if classes else None

        if color_tok is None or len(inner_text.split()) > max_tok_ins:
            _insert_outer(span, TK_HLB, TK_HLE)
        else:
            span.replace_with(tokwrap(inner_text, color_tok))
    return soup

def destroy_tags(text, rm_tags=None, **kwargs):
    """Delete whole elements from the tree: the tags and everything inside

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        rm_tags: list(str), tag elements to remove (default: ['ul','a','img','li'])
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object without the removed elements
    """
    if rm_tags is None:
        rm_tags = ['ul','a','img','li']

    soup = to_soup(text)
    doomed_elements = soup.find_all(rm_tags)
    for element in doomed_elements:
        element.decompose()  # drops the element together with its children
    return soup

def wrap_quotes(text, **kwargs):
    """Mark quoted text: surround each `div` of class `q` with quote tokens

    The div's contents are kept, framed by TK_QUOB / TK_QUOE, and the div
    tag itself is removed.

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object after replacement
    """
    soup = to_soup(text)
    quote_blocks = soup.find_all('div',{'class':'q'})
    for block in quote_blocks:
        _insert_outer(block, TK_QUOB, TK_QUOE)
    return soup

def br2newline(text, **kwargs):
    """Swap every `br` element for a newline character

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object after replacement
    """
    soup = to_soup(text)
    line_breaks = soup.find_all('br')
    for line_break in line_breaks:
        line_break.replace_with('\n')
    return soup

def unwrap_tags(text, untag=None, **kwargs):
    """Strip the tags of matching elements while keeping their contents

    Args:
        text: markup string or BeautifulSoup object (a soup is edited in place)
        untag: list(str), selectors passed to `soup.select`, e.g. 'h2' or
            'div.cb' (default: ['h2'])
        **kwargs: accepted and ignored

    Returns:
        the BeautifulSoup object after unwrapping
    """
    if untag is None:
        untag = ['h2']

    soup = to_soup(text)
    for selector in untag:
        matches = soup.select(selector)
        for element in matches:
            element.unwrap()  # children take the element's place
    return soup


# Name -> function lookup so processing steps can be requested by string.
_funcdict = dict(
    findall_replace=findall_replace,
    process_spantags=process_spantags,
    destroy_tags=destroy_tags,
    wrap_quotes=wrap_quotes,
    br2newline=br2newline,
    unwrap_tags=unwrap_tags,
)
# Names accepted wherever a processing step is given as a string.
available_functions = list(_funcdict)


class HTMLTokenizer(Tokenizer):
    """Tokenizer front-end that rewrites HTML markup into token-marked text.

    The markup is parsed once into `self.soup`, which the processing
    methods then edit in place. **kwargs are passed to Tokenizer
    """

    def __init__(self, markup='', parser='html.parser', **kwargs):
        """Parse `markup` with the given bs4 `parser`; forward **kwargs to Tokenizer."""
        self.soup = BeautifulSoup(markup, parser)
        super().__init__(**kwargs)

    def partial_process(self, *args, **kwargs):
        """Run the chosen processing steps, in order, on `self.soup`.

        Args:
            *args: steps to run; each is either a callable taking the soup
                or the name of an entry in `_funcdict`
            **kwargs: passed unchanged to every step

        Returns:
            the (modified) soup
        """
        for step in args:
            if isinstance(step, str):
                step = _funcdict[step]  # resolve a step given by name
            step(self.soup, **kwargs)
        return self.soup

    def full_process_html(self):
        """Run the complete fixed pipeline and return the result as a string."""
        pipeline = (
            (unwrap_tags, {'untag': ['h2']}),
            (br2newline, {}),
            (destroy_tags, {'rm_tags': ['ul', 'a', 'img', 'li']}),
            (findall_replace,
             {'token_map': {TK_DEL: 'del', TK_BLD: ['b', 'strong'], TK_ITL: ['i', 'em']}}),
            (process_spantags, {'max_tok_ins': 5}),
            (wrap_quotes, {}),
            # remove the misc content and starting div
            (unwrap_tags, {'untag': ['div.cb', 'div.pTx']}),
        )
        for step, options in pipeline:
            step(self.soup, **options)

        return str(self.soup)

    @classmethod
    def tokenize(cls, x):
        """Build an instance from markup `x` and return its fully processed text."""
        instance = cls(x)
        return instance.full_process_html()

# Example usage, where `ht` is an HTMLTokenizer instance:
# ht.partial_process(br2newline, destroy_tags, findall_replace, unwrap_tags, token_map={TK_DEL: 'del'}, untag=['h2','div.cb','div.pTx'])