-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathtraversal_rule_identifier.py
More file actions
106 lines (91 loc) · 4.28 KB
/
traversal_rule_identifier.py
File metadata and controls
106 lines (91 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import logging
import stanza
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.DEBUG)
# Given a parsed BeautifulSoup page and a known author name, this class derives a
# traversal rule (the ancestor tag names of the node holding the author); given an
# existing traversal rule, it identifies the author on a page.
class TraversalRule:
    """Learn and apply "traversal rules" that locate an author name in a page.

    A traversal rule is the list of ancestor tag names (as yielded by
    ``node.parents``) of a leaf node whose text contains exactly one entity
    that the NER pipeline tags as a person.

    Two workflows are supported:

    * ``pick_traversal_from_author`` - the author name is known; store the
      ancestor chain of the matching candidate in ``self.traversal_rule``.
    * ``get_author_from_traversal`` - the traversal rule is known; return
      the person found at that ancestor chain.

    Attributes:
        parsed_page: Parsed page object; must provide ``find_all`` accepting
            a predicate (the demo script passes a ``BeautifulSoup`` object).
        author_name: Known author name, either "First Last" or "Last, First".
        traversal_rule: List of ancestor tag names; empty when not supplied.
        candidates: ``None`` until ``find_candidates`` has run, afterwards a
            list of dicts with keys ``author_entity``, ``ancestors`` and
            ``node_name``.
    """

    NER_TAG_PERSON = "PERSON"
    # Tags whose text is never treated as page content.
    _NON_CONTENT_TAGS = ("style", "script", "link", "meta")
    # NOTE: the pipeline is built once, when the class body executes, and is
    # shared by every instance.
    nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

    def __init__(self, parsed_page, author_name, traversal_rule):
        """Store the page, the known author name and an optional rule.

        Args:
            parsed_page: Parsed page to search.
            author_name: Known author name (may be unused when only
                ``get_author_from_traversal`` is called).
            traversal_rule: Existing rule, or a falsy value for "no rule yet".
        """
        self.parsed_page = parsed_page
        self.author_name = author_name
        self.traversal_rule = traversal_rule if traversal_rule else list()
        # Always define the attribute so readers never hit AttributeError;
        # None means "find_candidates has not run yet".
        self.candidates = None

    # https://stackoverflow.com/questions/54265391/find-all-end-nodes-that-contain-text-using-beautifulsoup4
    def mark_if_leaf_with_text(self, node):
        """Flag ``node.leaf`` according to whether the node is a text-only leaf.

        A node is a leaf when it is not a style/script/link/meta tag, has
        non-empty text, and contains no nested tags. The flag is written on
        every path (True or False): on a BeautifulSoup tag, reading an unset
        ``node.leaf`` would otherwise be resolved as a search for a child
        ``<leaf>`` element.

        Returns:
            Always ``False``; the result is communicated through ``node.leaf``.
        """
        node.leaf = (
            node.name not in self._NON_CONTENT_TAGS
            and bool(node.text)
            and not node.find_all(text=False)  # no other tags inside other than text
        )
        return False

    def is_leaf_nodes_with_people(self, node):
        """Return True if a node flagged as leaf mentions at least one person.

        Runs the NER pipeline over the node text and, when any PERSON entity
        is found, annotates the node with ``no_of_people``,
        ``no_of_entities`` and ``people``.
        """
        if not node.leaf:
            return False
        processed_text = self.nlp(node.text)
        people = [ent for ent in processed_text.entities if ent.type == self.NER_TAG_PERSON]
        if not people:
            return False
        node.no_of_people = len(people)
        node.no_of_entities = len(processed_text.entities)
        node.people = people
        return True

    def find_leaf_nodes_with_people(self, node):
        """Predicate for ``find_all``: flag the node, then test it for people."""
        self.mark_if_leaf_with_text(node)
        return self.is_leaf_nodes_with_people(node)

    def find_candidates(self):
        """Collect candidate author nodes into ``self.candidates``.

        Uses stanza to identify people in text and keeps, for each leaf node
        that mentions exactly one person, the person's text, the names of
        the node's ancestors and the node's own tag name.
        """
        logging.info("Finding Candidate Authors")
        person_nodes = self.parsed_page.find_all(self.find_leaf_nodes_with_people)
        # Only single-person nodes are kept; this condition may not hold for
        # every page (nodes mentioning several people are skipped).
        self.candidates = [
            {
                'author_entity': node.people[0].text,
                'ancestors': [parent.name for parent in node.parents],
                'node_name': node.name,
            }
            for node in person_nodes
            if node.no_of_people == 1
        ]

    def pick_traversal_from_author(self):
        """Set ``self.traversal_rule`` from the candidate matching the author.

        A candidate matches when its entity text equals ``self.author_name``
        or the "First Last" form returned by ``get_normal_name``. When
        several candidates match, the last one wins. If no candidate
        matches, ``self.traversal_rule`` is left unchanged.
        """
        logging.info("Picking the traversal rule given author")
        # Candidates are computed on demand so the method also works when the
        # caller has not invoked find_candidates() beforehand.
        if getattr(self, "candidates", None) is None:
            self.find_candidates()
        normal_name = self.get_normal_name()  # loop-invariant
        for candidate in self.candidates:
            candidate_author_name = candidate['author_entity']
            logging.info("Candidate Author Name: %s, normal name %s",
                         candidate_author_name, normal_name)
            if candidate_author_name in (self.author_name, normal_name):
                self.traversal_rule = candidate['ancestors']

    def get_author_from_traversal(self):
        """Return the first candidate whose ancestors equal the traversal rule.

        Candidates are recomputed from ``self.parsed_page`` on every call.

        Returns:
            The matching candidate's entity text, or ``None`` if none match.
        """
        logging.info("Picking the author from candidates based on the traversal rule")
        self.find_candidates()
        for candidate in self.candidates:
            if candidate['ancestors'] == self.traversal_rule:
                return candidate['author_entity']
        return None

    def get_normal_name(self):
        """Convert a "Last, First" author name into "First Last".

        Only the first two comma-separated parts are used. Each part is
        stripped so stray whitespace around the comma does not leak into the
        result.

        Returns:
            The reordered name, or the literal string ``"None"`` (kept for
            backward compatibility) when the author name is empty/``None``
            or contains no comma.
        """
        if self.author_name and "," in self.author_name:
            parts = self.author_name.split(",")
            last_name = parts[0].strip()
            not_last_name = parts[1].strip()
            return (not_last_name + " " + last_name).strip()
        return "None"
if __name__ == "__main__":
    # Demo: derive the traversal rule for a known author from a saved page.
    with open("./resources/linkedin-origin-ab-testing-nicolai-kramer-jakobsen.html", "r", encoding="UTF-8") as fp:
        html_content = fp.read()
    soup = BeautifulSoup(html_content, 'lxml')
    soup = BeautifulSoup(soup.prettify('utf-8'), 'lxml')  # some inputs are so messy that they affect the output
    t_rule = TraversalRule(soup, "Nicolai Kramer Jakobsen", None)
    # Populate the candidate list first: pick_traversal_from_author() matches
    # the author name against self.candidates.
    t_rule.find_candidates()
    t_rule.pick_traversal_from_author()
    print(t_rule.traversal_rule)