story_plot_generator/imdb_scraper.py at master · criticallycode/story_plot_generator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import requests
from bs4 import BeautifulSoup
import re
import csv
import numpy as np
import time

# Genres listed by the author: Drama, Horror, Thriller, Mystery, Action, Sci-fi.
# The query string below is fixed to Horror.

# The two halves of the search URL; a result offset is inserted between them.
url_base = 'https://www.imdb.com/search/title?genres=Horror&start='
url_end = '&explore=title_type,genres&ref_=adv_nxt'

# Page numbers 1 through 8 (printed for inspection).
pages = [page for page in range(1, 9)]
print(pages)

# Result offsets 0, 50, 100, ..., 1000 -- one per request made further down.
nums = np.arange(start=0, stop=1001, step=50)
print(nums)

# Walk the search results one offset at a time (the offsets in `nums` advance
# in steps of 50) and append the titles found on each page to Horror_movies.csv.
# NOTE(review): the offsets start at 0; IMDb's `start` parameter may be 1-based
# (1, 51, 101, ...) -- confirm, otherwise neighbouring pages could overlap.

for num in nums:
    # Wait 3 seconds before every request (simple rate limiting).
    time.sleep(3)
    complete_url = url_base + str(num) + url_end
    print("Getting URL data")

    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(complete_url, timeout=30)
    if not r.ok:
        # An error page carries no listings; report it rather than silently
        # parsing it and writing nothing.
        print("Skipping offset", num, "- HTTP status", r.status_code)
        continue

    soup = BeautifulSoup(r.text, 'html.parser')
    blocks = soup.find_all('h3', {'class': 'lister-item-header'})

    # One CSV row per result header: the text of every link inside it.
    rows = [[link.text for link in block.find_all('a')] for block in blocks]
    for titles in rows:
        print(titles)

    # Open the output once per page instead of once per title.
    # newline='' is what the csv module expects (it prevents blank lines
    # between rows on Windows); the explicit UTF-8 encoding keeps non-ASCII
    # titles from failing under a narrower platform default.
    with open('Horror_movies.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows(rows)