-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathThreeWordPhraseScraper.py
More file actions
53 lines (39 loc) · 1.63 KB
/
ThreeWordPhraseScraper.py
File metadata and controls
53 lines (39 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests, bs4, os
# Scrape comics from threewordphrase.com: start at the front page, save the
# comic image found on each page into ./Comics, then follow the page's
# "previous" link. The loop stops when the next URL ends with '#' or when a
# page has no previous link at all.
url = 'http://threewordphrase.com'
#setting directory for saving file
os.chdir("Specified Directory")
os.makedirs('Comics', exist_ok=True)

while not url.endswith('#'):
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # Reset the per-page results. Without this, a page lacking the expected
    # markup would either raise NameError (first page) or silently reuse the
    # image / link found on the previous page (later pages).
    getImg = []
    prevUrl = None

    # identify .gif within website (the last matching table wins)
    for comicImg in soup.findAll("div", {"align": "center"}):
        for findImg in comicImg.findAll("table", {"width": "403"}):
            getImg = findImg.findAll("img")

    # identify previous comic URL (the last matching anchor wins);
    # href=True skips anchors that carry no href attribute
    for comicPrev in soup.findAll("div", {"align": "center"}):
        for checkPrev in comicPrev.findAll("td", {"width": "173"}):
            for prevLink in checkPrev.findAll("a", href=True):
                prevUrl = prevLink['href']

    # ensure that img object has found a picture with a usable src
    imgSrc = getImg[0].get('src') if getImg else None
    if not imgSrc:
        print('No image boys.')
    else:
        # parse the comic url and verify it's valid
        comicUrl = "http://threewordphrase.com/" + imgSrc
        print('Lets DL this: %s' % (comicUrl))
        try:
            res = requests.get(comicUrl)
            res.raise_for_status()
        except requests.exceptions.MissingSchema:
            # Malformed image URL: skip this comic and move on to the
            # previous page below.
            pass
        else:
            #write the picture to the drive
            # 'with' guarantees the file is closed even if a write fails.
            imagePath = os.path.join('Comics', os.path.basename(comicUrl))
            with open(imagePath, 'wb') as imageFile:
                for chunk in res.iter_content(1000000):
                    imageFile.write(chunk)

    # Advance to the previous comic; with no link there is nowhere to go.
    if prevUrl is None:
        print('No previous comic link found; stopping.')
        break
    url = 'http://threewordphrase.com' + prevUrl

print('Done.')