telex2reddit/telexhtmlparser.py at main · ForroKulcs/telex2reddit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import dateutil.parser
import html.parser
import logging.config
import re


class HungarianParserInfo(dateutil.parser.parserinfo):
    """dateutil ``parserinfo`` variant that supplies Hungarian month names.

    Only the full month names are listed (no abbreviations), in calendar
    order, so the position of a name in the list identifies the month.
    """

    MONTHS = [
        'január', 'február', 'március', 'április',
        'május', 'június', 'július', 'augusztus',
        'szeptember', 'október', 'november', 'december',
    ]


class TelexHTMLParser(html.parser.HTMLParser):
    """Collect the title, date and article links from a telex.hu HTML page.

    After feeding a document with ``feed()`` the results are available as:

    * ``article_title`` -- stripped text of the first data chunk inside an
      ``<h1 class="article_title">`` element, or ``None`` if none was seen.
    * ``article_date`` -- value parsed (with ``HungarianParserInfo``) from the
      first data chunk seen while both an ``article_title-bottom`` div and an
      ``article_date`` div are flagged as open, or ``None``.
    * ``links`` -- captured path of every ``<a href>`` that matches the
      article link pattern, lower-cased, in document order (duplicates kept).

    NOTE(review): class attributes are compared for exact equality, so an
    element carrying several classes (``class="article_title foo"``) is not
    recognised -- confirm this matches the markup being parsed.
    NOTE(review): element nesting is not tracked; *any* ``</div>`` clears both
    div flags -- confirm the date div directly follows the opening of the
    ``article_title-bottom`` div in the real pages.
    """

    def __init__(self, log: logging.Logger = None):
        """Create a parser.

        :param log: optional logger. When given, a failure to parse the
            article date is logged and swallowed; when omitted, the
            exception propagates to the caller of ``feed()``.
        """
        super().__init__()
        # Accepts an optional "http(s)://[www.]telex.hu" prefix followed by
        # "/<section>/<digits>/<digits>/<digits>/<slug>[/<more>...]" and
        # captures that path without its leading and trailing slashes.
        self._link_pattern = re.compile(r'(?:(?:https?://)(?:www\.)?telex\.hu)?/+([\w-]+/+\d+/+\d+/+\d+(?:/+[\w-]+)+)/*', re.IGNORECASE)
        self._in_article_date = False
        self._in_article_title = False
        self._in_article_title_bottom = False
        self._log = log
        self.article_date = None
        self.article_title = None
        self.links: list[str] = []

    def error(self, message):
        """Raise a plain ``Exception`` carrying *message*.

        NOTE(review): presumably an override of the legacy
        ``HTMLParser.error`` hook of older Python versions -- confirm it is
        still needed.
        """
        raise Exception(message)

    @staticmethod
    def _attr_values(attrs: list, name: str):
        """Yield each non-``None`` value of attribute *name* found in *attrs*.

        *attrs* is the list of ``(name, value)`` pairs passed to
        ``handle_starttag``; attribute names are compared case-insensitively
        and entries with fewer than two items are skipped.
        """
        for attr in attrs:
            if len(attr) < 2:
                continue
            if attr[0].lower() == name and attr[1] is not None:
                yield attr[1]

    def handle_starttag(self, tag: str, attrs: list):
        """Update the state flags and collect links for an opening tag."""
        lowtag = tag.lower()

        if lowtag == 'h1':
            if 'article_title' in self._attr_values(attrs, 'class'):
                self._in_article_title = True
            return

        if lowtag == 'div':
            for css_class in self._attr_values(attrs, 'class'):
                if css_class == 'article_date':
                    self._in_article_date = True
                elif css_class == 'article_title-bottom':
                    self._in_article_title_bottom = True
            return

        if lowtag == 'a':
            # Only the first href that has a value is examined.
            for href in self._attr_values(attrs, 'href'):
                match = self._link_pattern.fullmatch(href.strip().lower())
                if match:
                    self.links.append(match.group(1))
                return

    def handle_endtag(self, tag: str):
        """Clear the state flags that belong to the closing tag's type."""
        lowtag = tag.lower()
        if lowtag == 'h1':
            self._in_article_title = False
        elif lowtag == 'div':
            # Nesting depth is not tracked: every </div> resets both flags.
            self._in_article_date = False
            self._in_article_title_bottom = False

    def handle_data(self, data: str):
        """Capture the article date or title from a text chunk.

        Only the first chunk after the relevant opening tag is used: the
        corresponding flag is cleared before the text is examined.

        :raises AssertionError: if a date or title would be captured a
            second time (checked with ``assert``, so not under ``-O``).
        :raises Exception: whatever the date parser raises, but only when no
            logger was supplied to the constructor.
        """
        if self._in_article_title_bottom and self._in_article_date:
            self._in_article_date = False
            assert self.article_date is None
            # Keep only the text in front of the first "(", if there is one.
            date_text = data.strip().partition('(')[0].rstrip()
            if not date_text:
                return
            try:
                self.article_date = dateutil.parser.parser(HungarianParserInfo()).parse(date_text, fuzzy=True)
            except Exception:
                # Narrowed from a bare "except:" so that KeyboardInterrupt and
                # SystemExit are never logged-and-swallowed here.
                if self._log:
                    self._log.exception(f'Exception: self.article_date = dateutil.parser.parser(HungarianParserInfo()).parse("{data.strip()}", fuzzy = True)')
                else:
                    raise
            return

        if self._in_article_title:
            self._in_article_title = False
            assert self.article_title is None
            self.article_title = data.strip()
            return