-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_scraper.py
More file actions
96 lines (78 loc) · 3.22 KB
/
web_scraper.py
File metadata and controls
96 lines (78 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from bs4 import BeautifulSoup
import requests
import os
from pymongo import MongoClient
# Running count of listings inserted into the database during this run
total_listings_added = 0

# MongoDB on localhost; listings are stored in jerseyscraper_db.jersey_info
connection_string = 'mongodb://localhost:27017/'
client = MongoClient(connection_string)
db = client['jerseyscraper_db']
collection = db['jersey_info']
# Find newly published jersey listings
def VFSScraper(timeout=30):
    """Scrape the VFS "New products" pages and store listings not seen before.

    Three pages are requested. For every listing tile the name (image alt
    text), link (first anchor href) and price (first span text) are handed
    to ``db_add``.

    The scraper is best-effort: problems are printed, never raised. A page
    that cannot be fetched or processed is skipped, and a tile missing an
    expected element is skipped, so the remaining pages/tiles still run.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        int: Sum of the values returned by ``db_add`` (one per new listing).
    """
    new_listings = 0
    # Since the VFS website may add many jerseys to the site at a time,
    # search first three pages for any new additions
    for i in range(1, 4):
        try:
            URL = SOME_URL
            # requests applies no timeout by default; without one a stalled
            # server would block the script indefinitely.
            result = requests.get(URL, timeout=timeout)
            # Fail with a clear HTTP error instead of parsing an error page.
            result.raise_for_status()

            # Scrape all listings on "New products" page
            soup = BeautifulSoup(result.content, 'html.parser')
            content = soup.find('div', id="content")
            if content is None:
                print(f"VFS page {i}: listing container not found, skipping page")
                continue

            # Collect name, price and link
            for jersey in content.find_all("div", class_="col4"):
                try:
                    name = jersey.find('img')['alt']
                    link = jersey.find('a')['href']
                    price = jersey.find('span').text
                except (AttributeError, KeyError, TypeError) as e:
                    # A tile without the expected tags/attributes should not
                    # prevent the other tiles from being stored.
                    print(f"VFS page {i}: skipping malformed listing ({e})")
                    continue
                new_listings += db_add(name, link, price)
        except Exception as e:
            # Deliberate best-effort boundary (network, HTTP, database):
            # report the failure and move on to the next page.
            print(f"VFS page {i}: {e}")
    return new_listings
def VFAScraper(timeout=30):
    """Scrape the VFA "Our collection" pages and store listings not seen before.

    Three pages are requested. For every listing the title text, the product
    link (base URL + href, because the href does not include the base URL)
    and the price text are handed to ``db_add``.

    The scraper is best-effort: problems are printed, never raised. A page
    that cannot be fetched or processed is skipped, and a listing missing an
    expected element is skipped, so the remaining pages/listings still run.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        int: Sum of the values returned by ``db_add`` (one per new listing).
    """
    new_listings = 0
    # Search first three pages for new items
    for i in range(1, 4):
        try:
            # baseURL not included in href, so concatenate it manually
            baseURL = SOME_baseURL
            URL = SOME_URL2
            # requests applies no timeout by default; without one a stalled
            # server would block the script indefinitely.
            result = requests.get(URL, timeout=timeout)
            # Fail with a clear HTTP error instead of parsing an error page.
            result.raise_for_status()

            # Scrape all listings on "Our collection" page
            soup = BeautifulSoup(result.content, 'html.parser')
            listing_root = soup.find('div', class_='collection-listing cf')
            if listing_root is None:
                print(f"VFA page {i}: listing container not found, skipping page")
                continue

            # Collect name, price and link
            for jersey in listing_root.find_all('div', class_='innerer'):
                try:
                    name = jersey.find('div', class_='title').text
                    link = baseURL + jersey.find('a', class_='product-link')['href']
                    price = (
                        jersey.find('span', class_='price')
                        .find('span', class_='theme-money')
                        .text
                    )
                except (AttributeError, KeyError, TypeError) as e:
                    # A listing without the expected tags/attributes should
                    # not prevent the other listings from being stored.
                    print(f"VFA page {i}: skipping malformed listing ({e})")
                    continue
                new_listings += db_add(name, link, price)
        except Exception as e:
            # Deliberate best-effort boundary (network, HTTP, database):
            # report the failure and move on to the next page.
            print(f"VFA page {i}: {e}")
    return new_listings
# Send desktop notification when finished
def send_notification(title, text):
    """Show a desktop notification by running ``osascript``.

    The AppleScript is handed to ``osascript`` as a single argument instead
    of being spliced into a shell command line, so quotes or shell
    metacharacters in *title* or *text* cannot break or alter the command.

    Args:
        title: Notification title.
        text: Notification body.

    Returns:
        None. If ``osascript`` cannot be launched, the problem is printed
        rather than raised.
    """
    # Imported here so the function does not rely on a new file-level import.
    import subprocess

    def quote(value):
        # Inside an AppleScript string literal, backslash and double quote
        # must be backslash-escaped.
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    script = f'display notification {quote(text)} with title {quote(title)}'
    try:
        # check=False: a failing osascript is not treated as fatal, matching
        # the previous os.system-based behaviour.
        subprocess.run(["osascript", "-e", script], check=False)
    except OSError as e:
        print(f"Could not send notification: {e}")
# Returns 1 if no duplicate item found and adds new item to database
# Returns 0 if duplicate item found, nothing new added to database
def db_add(name, URL, cost):
    """Insert a listing into the collection unless its URL is already stored.

    Args:
        name: Listing name, stored under ``'name'``.
        URL: Listing link, stored under ``'URL'``; also the key used to
            detect duplicates.
        cost: Listing price, stored under ``'price'``.

    Returns:
        int: 1 if a new document was inserted, 0 if a document with the same
        URL already existed (in which case nothing is written).
    """
    # Duplicate: some stored document already carries this URL.
    if collection.find_one({'URL': URL}) is not None:
        return 0

    collection.insert_one({
        'name': name,
        'URL': URL,
        'price': cost,
    })
    return 1
# Run scrapers and send desktop notification with updates
if __name__ == "__main__":
    # Run VFA first, then VFS, adding each scraper's count of new listings
    for scraper in (VFAScraper, VFSScraper):
        total_listings_added += scraper()

    ttl = "jerseyscraper finished"
    msg = f"{total_listings_added} new listings found"
    # Desktop notification reporting how many new listings were added
    send_notification(ttl, msg)