-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcms_maker.py
More file actions
74 lines (60 loc) · 2.49 KB
/
cms_maker.py
File metadata and controls
74 lines (60 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import urllib3
import traversal_rule_identifier
from bs4 import BeautifulSoup
import json
import tldextract
import certifi
import ssl
# Module-wide HTTPS client shared by every request made from this file.
#
# A bare ``ssl.SSLContext()`` starts with ``verify_mode = CERT_NONE`` and
# hostname checking disabled, so merely loading a CA bundle into it does not
# turn certificate verification on (and the no-argument constructor is
# deprecated since Python 3.10). ``ssl.create_default_context`` returns a
# client context with ``CERT_REQUIRED`` and ``check_hostname = True``; passing
# ``cafile`` makes it trust the certifi CA bundle, as the original intended.
ssl_context = ssl.create_default_context(cafile=certifi.where())
http = urllib3.PoolManager(ssl_context=ssl_context)
class AuthorTraversalRules:
    """Author traversal rules loaded from a JSON file.

    The file is parsed on construction and the result is kept in
    ``author_traversal_rules``; lookups are keyed by the registered domain
    that ``tldextract`` derives from a URL.
    """

    persistence_type = "json"

    def __init__(self, filename):
        """Store *filename* and immediately load the rules it contains."""
        self.filename = filename
        self.author_traversal_rules = {}
        self.load_author_traversal_rules()

    def load_author_traversal_rules(self):
        """(Re)load the rules from ``self.filename``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        # JSON text is UTF-8; naming the encoding keeps the load independent
        # of the platform's default locale encoding.
        with open(self.filename, "r", encoding="utf-8") as fp:
            self.author_traversal_rules = json.load(fp)

    def get_author_traversal_for_url(self, url):
        """Return the rule stored for the registered domain of *url*.

        Returns ``None`` when no rule is stored for that domain.
        """
        host_url = tldextract.extract(url).registered_domain
        return self.author_traversal_rules.get(host_url)
class FindAuthorWithTraversal:
    """Fetch one page and extract its author with a given traversal rule."""

    def __init__(self, url, author_traversal_rule_for_site):
        """Keep the target *url* and the rule to apply; nothing is fetched yet."""
        self.page_content = None
        self.author_traversal_rule = author_traversal_rule_for_site
        self.url = url

    def load_page_content(self):
        """GET ``self.url`` through the module-level pool and keep the body."""
        response = http.request('GET', self.url)
        self.page_content = response.data

    def get_author(self):
        """Download the page and return whatever the traversal rule yields.

        The page is fetched again on every call.
        """
        self.load_page_content()
        # Parse once, prettify to UTF-8 bytes, then parse the prettified
        # markup again; the rule is evaluated against this second tree.
        first_pass = BeautifulSoup(self.page_content, 'lxml')
        prettified = first_pass.prettify('utf-8')
        normalized = BeautifulSoup(prettified, 'lxml')
        # NOTE(review): the meaning of the second argument (None) is defined
        # by traversal_rule_identifier.TraversalRule, not visible here.
        rule = traversal_rule_identifier.TraversalRule(
            normalized, None, self.author_traversal_rule
        )
        return rule.get_author_from_traversal()
class FindAuthor:
    """Resolve the author of a URL using the bundled per-domain rules.

    The rules file is loaded once, when this class body executes, and the
    resulting ``domain_traversal`` object is shared by all instances.
    """

    domain_traversal_file = "./resources/domain_traversal_rules-500.json"
    domain_traversal = AuthorTraversalRules(domain_traversal_file)
    # Class-level default; overridden per instance when a rule is found.
    domain_found = False

    def __init__(self, url):
        """Prepare a finder for *url* if its registered domain has a rule."""
        self.url = url
        site = tldextract.extract(url).registered_domain
        rules = self.domain_traversal.author_traversal_rules
        if site in rules.keys():
            self.domain_found = True
            self.find_author = FindAuthorWithTraversal(self.url, rules[site])

    def get_author(self):
        """Return the extracted author, or ``""`` when no rule is known.

        When the domain has no rule, a warning is printed to stdout.
        """
        if not self.domain_found:
            print("WARNING: The traversal rules doesn't exist for the domain, please enter another url")
            return ""
        return self.find_author.get_author()
if __name__ == "__main__":
    # Manual smoke run: print the author resolved for a sample article URL.
    sample_url = "https://www.linkedin.com/pulse/automating-user-creation-aws-sftp-service-transfer-arjun-dandagi/"
    author = FindAuthor(sample_url).get_author()
    print(author)