-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimdb-bday-scraper.py
More file actions
104 lines (83 loc) · 4.46 KB
/
imdb-bday-scraper.py
File metadata and controls
104 lines (83 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 5 20:03:57 2016
@author: Erik Lundin
"""
from urllib import request
from lxml import html
import requests
import datetime
import os.path
import sys
from multiprocessing.dummy import Pool as ThreadPool
# --- IMDb endpoints and the XPath selectors used to scrape them ---------------
# Listing of people born on a given day; {0} is the day, {1} is the month.
imdb_bday_url = 'http://www.imdb.com/search/name?birth_day={0}&birth_month={1}&refine=birth_monthday'
imdb_base_url = 'http://www.imdb.com'
# "tbody" is deliberately left out of the listing path, see
# http://stackoverflow.com/questions/32015083/python-xpath-returns-an-empty-list#comment51932748_32015083
imdb_list_xpath = '//*[@id="main"]/table/tr/td[@class="name"]/a'
imdb_img_xpath = '//*[@id="name-poster"]/@src'
imdb_birthyear_xpath = '//*[@id="name-born-info"]/time/a[2]/text()'
#imdb_deathyear_xpath = '//*[@id="name-death-info"]/time/a[2]/text()'
#imdb_birthdate_xpath = '//*[@id="name-born-info"]/time/@datetime'
#imdb_deathdate_xpath = '//*[@id="name-death-info"]/time/@datetime'

# --- Output folder, as templates where {0} is the file name -------------------
app_dir = "c:\\users\\erik.lundin\\desktop\\imdb_scraper\\{0}"
app_dir_fw = "c:/users/erik.lundin/desktop/imdb_scraper/{0}"

# --- Date range to scrape -----------------------------------------------------
# The year itself is irrelevant; 2016 is used because it is a leap year, so
# 29 February is part of the range.
start_date = datetime.date(2016, 1, 1)
end_date = datetime.date(2016, 12, 31)  # (2016,1,2)
delta = datetime.timedelta(days=1)  # iteration step: one day

persons_per_bday = 5
bdays = []  # root node of the data to generate

# File names present in the output folder when the script starts.
existing_imgs = os.listdir(app_dir_fw.format(''))

# Every date from start_date through end_date (inclusive), one per step.
dates = [
    start_date + step * delta
    for step in range((end_date - start_date) // delta + 1)
]
# d ends up one step past the last generated date (or at start_date when the
# range is empty), which is the value a manual iteration would leave behind.
d = dates[-1] + delta if dates else start_date
def _scrape_person(element, stamp, timeout):
    """Collect the data for one person from a birthday-listing link.

    Fetches the person's page, downloads the poster image into the output
    folder unless its file name is already listed in ``existing_imgs``, and
    reads the birth year when the page provides one.

    Args:
        element: Parsed ``<a>`` element of the person in the listing.
        stamp: "dd/mm" label of the day being processed, used in log output.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        dict with the keys "url", "name" and "img", plus "birth_year" when
        the page has one.

    Raises:
        Whatever the HTTP request, the HTML parsing or the image download
        raise; IndexError when the page has no poster image.
    """
    person = {}
    person["url"] = imdb_base_url + element.attrib.get('href')  # link to the person's page
    person["name"] = element.text
    print(stamp + " " + person["name"])

    person_page = requests.get(person["url"], timeout=timeout)
    person_tree = html.fromstring(person_page.content)

    img_url = person_tree.xpath(imdb_img_xpath)  # poster src URL(s)
    img_type = os.path.splitext(img_url[0])[1]  # file extension of the poster
    # The file name is the person's full name with the spaces removed.
    img_name = person["name"].replace(" ", "") + img_type
    if img_name not in existing_imgs:
        request.urlretrieve(img_url[0], app_dir.format(img_name))
    person["img"] = img_name

    birth_year = person_tree.xpath(imdb_birthyear_xpath)
    if birth_year:
        person["birth_year"] = birth_year[0]
    return person


def getBirthdayObject(d, max_candidates=20, timeout=30):
    """Scrape the birthday listing for one calendar day.

    Walks the listing entry by entry and collects up to ``persons_per_bday``
    persons. An entry that cannot be scraped is logged and skipped, so one
    bad entry neither blocks the rest of the day nor inflates the number of
    persons collected.

    Args:
        d: Date whose day and month select the listing (the year is unused).
        max_candidates: Give up once ``persons_per_bday`` plus the number of
            failed entries reaches this value. Defaults to 20.
        timeout: Timeout in seconds handed to every ``requests.get`` call.

    Returns:
        The birthday dict ``{"date": "dd/mm", "persons": [...]}``. The same
        dict is also appended to the module-level ``bdays`` list.

    Raises:
        Whatever fetching or parsing the listing page itself raises; only
        failures of individual persons are handled here.
    """
    stamp = d.strftime("%d/%m")
    listing_url = imdb_bday_url.format(d.strftime("%d"), d.strftime("%m"))
    bday_temp = {"persons": [], "date": stamp}
    persons = bday_temp["persons"]

    print("Starting downloads for " + stamp +
          ". URL: " + listing_url)
    imdb_page = requests.get(listing_url, timeout=timeout)
    page_tree = html.fromstring(imdb_page.content)
    elements = page_tree.xpath(imdb_list_xpath)  # one element per listed person

    failures = 0
    for element in elements:
        if len(persons) >= persons_per_bday:
            break
        try:
            persons.append(_scrape_person(element, stamp, timeout))
        except Exception as err:  # best effort: skip this person, keep the day going
            failures += 1
            print(">>>" + stamp + " " + "Unexpected error: ", sys.exc_info()[0], err)
            if persons_per_bday + failures >= max_candidates:
                print(stamp + " " + "Too many retries, can't find 5 persons, exiting loop for the given day")
                break
    else:
        # Reached only when the listing ran out without hitting a break above.
        if len(persons) < persons_per_bday:
            print(stamp + " " + "Listing exhausted, found only " + str(len(persons)) + " persons")

    bdays.append(bday_temp)  # add to the shared birthday list
    return bday_temp
import json

# Scrape the days on a pool of 12 worker threads; map() blocks until every
# date has been processed.
pool = ThreadPool(12)
try:
    results = pool.map(getBirthdayObject, dates)
finally:
    # Shut the pool down even when a worker raised, then wait for the threads.
    pool.close()
    pool.join()

# The workers append to bdays in whatever order they finish, so the list order
# differs from run to run. Sort by (month, day) to make the output file
# reproducible; "date" is formatted as zero-padded "dd/mm".
bdays.sort(key=lambda bday: bday["date"].split("/")[::-1])

with open(app_dir_fw.format('birthdays.json'), 'w') as outfile:
    # indent and sort_keys make the JSON human-readable
    json.dump(bdays, outfile, indent=4, sort_keys=True)