PyComicScraper/ThreeWordPhraseScraper.py at master · DuldR/PyComicScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests, bs4, os
# Root address of the site. Page links and image paths taken from the HTML
# are appended to it to form the URLs that get requested.
site = 'http://threewordphrase.com'
url = site

#setting directory for saving file
# NOTE(review): "Specified Directory" looks like a placeholder; point it at a
# real directory before running, otherwise os.chdir raises.
os.chdir("Specified Directory")
os.makedirs('Comics', exist_ok=True)

# Walk the archive backwards: save the comic on the current page, then follow
# its "previous" link. The loop stops once that link produces a URL ending in
# '#' (presumably what the oldest comic links to - confirm against the site),
# or when a page has no "previous" link at all.
while not url.endswith('#'):

    print('Downloading page %s...' % url)
    page = requests.get(url)
    page.raise_for_status()

    soup = bs4.BeautifulSoup(page.text, 'lxml')

    # identify .gif within website
    # Reset on every page so a page without a match can neither raise
    # NameError nor silently reuse the previous page's image list.
    getImg = []
    for comicImg in soup.find_all("div", {"align": "center"}):
        for findImg in comicImg.find_all("table", {"width": "403"}):
            # src=True keeps only <img> tags that actually carry a src.
            getImg = findImg.find_all("img", src=True)

    # identify previous comic URL
    # Reset for the same reason: a stale link from the previous iteration
    # would make the loop fetch the same page forever.
    prevUrl = None
    for comicPrev in soup.find_all("div", {"align": "center"}):
        for checkPrev in comicPrev.find_all("td", {"width": "173"}):
            # href=True skips anchors without a target (avoids KeyError).
            for prevLink in checkPrev.find_all("a", href=True):
                prevUrl = prevLink['href']

    # ensure that img object has found a picture
    src = getImg[0].get('src') if getImg else None
    if not src:
        # Nothing to save on this page; just move on to the previous one.
        print('No image boys.')

    # parse the comic url and verify it's valid
    else:
        comicUrl = site + '/' + src
        print('Lets DL this: %s' % (comicUrl))
        try:
            image = requests.get(comicUrl)
            image.raise_for_status()

        except requests.exceptions.MissingSchema:
            # Malformed image URL: skip this comic and keep walking back,
            # as the script has always done for this error.
            pass

        else:
            #write the picture to the drive
            # The context manager closes the file even if a write fails.
            target = os.path.join('Comics', os.path.basename(comicUrl))
            with open(target, 'wb') as imageFile:
                for chunk in image.iter_content(1000000):
                    imageFile.write(chunk)

    if prevUrl is None:
        # No "previous" link on this page: there is nowhere left to go.
        break
    url = site + prevUrl

print('Done.')