-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtelexhtmlparser.py
More file actions
109 lines (97 loc) · 3.7 KB
/
telexhtmlparser.py
File metadata and controls
109 lines (97 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import dateutil.parser
import html.parser
import logging.config
import re
class HungarianParserInfo(dateutil.parser.parserinfo):
    """``dateutil`` parser info that overrides the month names with Hungarian ones."""

    # Full Hungarian month names in calendar order (January through December).
    MONTHS = [
        'január', 'február', 'március', 'április',
        'május', 'június', 'július', 'augusztus',
        'szeptember', 'október', 'november', 'december',
    ]
class TelexHTMLParser(html.parser.HTMLParser):
    """Collect the title, date and article links from a telex.hu HTML page.

    Feed the markup with ``feed()``; afterwards read:

    * ``article_title`` -- stripped text of the first text node inside
      ``<h1 class="article_title">``, or ``None`` if none was seen.
    * ``article_date`` -- value parsed from the text inside a
      ``<div class="article_date">`` that is encountered while a
      ``<div class="article_title-bottom">`` is still open, or ``None``.
    * ``links`` -- lower-cased ``section/yyyy/mm/dd/slug`` paths of every
      ``<a href>`` that matches the telex.hu article URL pattern.

    Class names are matched as whitespace-separated tokens, so an element
    such as ``<h1 class="article_title big">`` is recognised as well.
    """

    def __init__(self, log: logging.Logger = None):
        """Create a parser.

        Args:
            log: Optional logger. When given, a date that fails to parse is
                logged via ``log.exception`` instead of aborting the parse;
                when omitted, the parsing exception propagates.
        """
        super().__init__()
        # Accepts absolute (http/https, optional "www.") and site-relative
        # URLs; group 1 is the article path without surrounding slashes.
        self._link_pattern = re.compile(r'(?:(?:https?://)(?:www\.)?telex\.hu)?/+([\w-]+/+\d+/+\d+/+\d+(?:/+[\w-]+)+)/*', re.IGNORECASE)
        # State flags describing which interesting element is currently open.
        self._in_article_date = False
        self._in_article_title = False
        self._in_article_title_bottom = False
        self._log = log
        # Results; both stay None until the matching element is found.
        self.article_date = None
        self.article_title = None
        self.links: list[str] = []

    def error(self, message):
        """Turn a parser error report into a raised exception."""
        raise Exception(message)

    @staticmethod
    def _attr_values(attrs, name):
        """Yield each non-None value of attribute *name* (lower-case) in *attrs*."""
        for attr in attrs:
            # Skip malformed entries that lack a value slot.
            if len(attr) < 2:
                continue
            if attr[0].lower() == name and attr[1] is not None:
                yield attr[1]

    @classmethod
    def _class_tokens(cls, attrs):
        """Return the set of class names listed in the ``class`` attribute(s)."""
        tokens = set()
        for value in cls._attr_values(attrs, 'class'):
            tokens.update(value.split())
        return tokens

    def handle_starttag(self, tag: str, attrs: list):
        """Update the state flags and collect article links for an opening tag.

        Args:
            tag: Name of the element being opened.
            attrs: ``(name, value)`` pairs of the element's attributes.
        """
        lowtag = tag.lower()
        if lowtag == 'h1':
            if 'article_title' in self._class_tokens(attrs):
                self._in_article_title = True
        elif lowtag == 'div':
            classes = self._class_tokens(attrs)
            if 'article_date' in classes:
                self._in_article_date = True
            if 'article_title-bottom' in classes:
                self._in_article_title_bottom = True
        elif lowtag == 'a':
            for href in self._attr_values(attrs, 'href'):
                # The href is lower-cased before matching, so stored links
                # are always lower case.
                match = self._link_pattern.fullmatch(href.strip().lower())
                if match:
                    self.links.append(match.group(1))
                    return

    def handle_endtag(self, tag: str):
        """Clear the state flags that belong to the element being closed.

        Any closing ``div`` clears both div-related flags; nesting depth is
        not tracked.
        """
        lowtag = tag.lower()
        if lowtag == 'h1':
            self._in_article_title = False
        elif lowtag == 'div':
            self._in_article_date = False
            self._in_article_title_bottom = False

    def handle_data(self, data: str):
        """Capture the article date or title from a text node.

        Only the first text node after the relevant opening tag is used; the
        corresponding flag is cleared as soon as that node is seen.

        Raises:
            AssertionError: If a second date or title is encountered after
                one has already been stored.
            Exception: Whatever the date parser raises, when no logger was
                supplied.
        """
        if self._in_article_title_bottom and self._in_article_date:
            self._in_article_date = False
            assert self.article_date is None
            # Drop a parenthesised suffix, if any, before parsing.
            date_text = data.strip().partition('(')[0].rstrip()
            if not date_text:
                return
            try:
                self.article_date = dateutil.parser.parser(HungarianParserInfo()).parse(date_text, fuzzy=True)
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must never be swallowed by the logging branch.
                if self._log is not None:
                    self._log.exception(f'Exception: self.article_date = dateutil.parser.parser(HungarianParserInfo()).parse("{data.strip()}", fuzzy = True)')
                else:
                    raise
            return
        if self._in_article_title:
            self._in_article_title = False
            assert self.article_title is None
            self.article_title = data.strip()
            return