-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathscrape_webpage_contents.py
More file actions
executable file
·85 lines (60 loc) · 3.13 KB
/
scrape_webpage_contents.py
File metadata and controls
executable file
·85 lines (60 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python3
import bs4
import re
import generate_report
import datetime
def read_webpage_contents(file_path):
    """
    Return the full text of the saved webpage at ``file_path``.

    The file is opened as UTF-8 text and read in one go. The orders
    page is kept on disk so it can be re-analyzed during development
    without fetching the html again.
    """
    with open(file_path, encoding="utf-8") as html_file:
        return html_file.read()
# Compiled once at import time; both patterns are applied to the
# serialized html of a single purchased-item element.
_ITEM_NAME_PATTERN = re.compile(r'<span class="_18b78C">(.+?)<\/span>?')
_ITEM_PRICE_PATTERN = re.compile(r'<div class="isoXOF">(.+?)<\/div>?')


def get_dict_items(file_path, account_name):
    """
    Scrape purchased item names and prices from a saved orders page.

    The html file at ``file_path`` is parsed and every element whose
    class is ``_32cw_C`` is treated as one purchased-item block. The
    name and the price are extracted from inside the same block so
    that each name stays paired with its own price.

    Parameters:
        file_path: path of the saved html file to scrape.
        account_name: currently unused; kept so existing callers
            keep working.

    Returns:
        A dict keyed by consecutive integers starting at 1. Each value
        is ``{"name": <str>, "price": <str>}`` where the price has the
        peso sign and thousands separators removed but is still a
        string.

    Blocks that lack either the name span or the price div are
    skipped instead of raising IndexError.
    """
    page_contents = read_webpage_contents(file_path)
    bs_item_info = bs4.BeautifulSoup(page_contents, "html.parser")

    item_dict = {}
    for item_info in bs_item_info.find_all(attrs={"class": "_32cw_C"}):
        # Serialize the element once and run both patterns on that text.
        item_html = item_info.decode()
        name_match = _ITEM_NAME_PATTERN.search(item_html)
        price_match = _ITEM_PRICE_PATTERN.search(item_html)

        if name_match is None or price_match is None:
            # Not a complete item block; nothing reliable to report.
            continue

        price = price_match.group(1).replace("₱", "").replace(",", "")
        # Number only the items actually kept so keys run 1..len(item_dict).
        item_dict[len(item_dict) + 1] = {"name": name_match.group(1), "price": price}

    return item_dict
if __name__ == "__main__":
    # Manual sample run: build a PDF summary from a locally saved orders page.
    sample_username = "romnegrillo"
    file_title = "Summary of your Shopee Orders"

    # Timestamp in the form shown inside the report text.
    date_generated = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

    # The file name reuses the same timestamp with every space, slash,
    # colon and comma turned into a dash.
    dash_table = str.maketrans(" /:,", "----")
    file_name = "shopee-summary-" + date_generated.translate(dash_table) + ".pdf"

    item_info = get_dict_items("webpage_contents.html", sample_username)

    # Prices are stored as strings, so convert each one before summing.
    total_money_spent = sum(float(entry["price"]) for entry in item_info.values())

    file_description = "Account of: {}\nGenerated on: {}\n Total Money Spent on Shopee: {:,.2f}".format(
        sample_username,
        date_generated,
        total_money_spent,
    )

    generate_report.generate_report_from_dict(file_name, file_title, file_description, item_info)