-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
84 lines (70 loc) · 3.48 KB
/
scraper.py
File metadata and controls
84 lines (70 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
def scrape_genre(url, genre):
    """Scrape the Goodreads "most read" page for one genre.

    Opens *url* in a fresh Chrome session, hovers each book cover to pop up
    the tooltip, expands the "...more" description link when present, and
    collects one record per (title, author, description) triple found.

    Parameters
    ----------
    url : str
        Goodreads genre listing page to scrape.
    genre : str
        Genre label stored on every returned record.

    Returns
    -------
    list[dict]
        Dicts with keys 'title', 'author', 'image_url', 'description', 'genre'.
    """
    driver = webdriver.Chrome()
    books = []
    try:
        driver.get(url)
        time.sleep(3)  # let the page's lazy content settle before locating covers
        book_elements = WebDriverWait(driver, 10).until(
            EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.bookImage'))
        )
        for index, element in enumerate(book_elements):
            # Bring the cover into view, then hover it so the tooltip renders.
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            time.sleep(1)
            image = element.get_attribute('src')
            ActionChains(driver).move_to_element(element).perform()
            time.sleep(1)

            title_elements = driver.find_elements(By.CSS_SELECTOR, 'a.readable.bookTitle')
            titles = [t.text.strip() for t in title_elements if t.text.strip()]
            author_elements = driver.find_elements(By.CSS_SELECTOR, '.authorName')
            authors = [a.text.strip() for a in author_elements if a.text.strip()]

            more_buttons = driver.find_elements(By.CSS_SELECTOR, 'div.addBookTipDescription a')
            if index < len(more_buttons):
                # A "...more" link exists for this book: click it to expand the
                # full description, then read the non-container freeText span.
                try:
                    driver.execute_script("arguments[0].click();", more_buttons[index])
                    time.sleep(1)
                except Exception as e:
                    # Best-effort: report and fall through to read whatever is shown.
                    print(f"{index + 1}: {e}")
                long_description_elements = driver.find_elements(
                    By.CSS_SELECTOR,
                    'div.addBookTipDescription span[id^=freeText]:not([id*="Container"])',
                )
            else:
                # No expand link — the short description container already holds
                # the full text. (This branch was unreachable in the original,
                # which left long_description_elements unbound instead.)
                long_description_elements = driver.find_elements(
                    By.CSS_SELECTOR,
                    'div.addBookTipDescription span[id^=freeTextContainer]',
                )
            long_descriptions = [d.text.strip() for d in long_description_elements if d.text.strip()]

            for title, author, description in zip(titles, authors, long_descriptions):
                print(f"Book {index + 1}: {title} by {author}. Url:{image}")
                print(f"Description: {description}")
                print("---")
                books.append({'title': title, 'author': author, 'image_url': image,
                              'description': description, 'genre': genre})

            if index < len(book_elements) - 1:
                # Scroll back to the top so the next hover starts from a known state.
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.CONTROL + Keys.HOME)
                time.sleep(1)
    finally:
        # Always release the browser, even if the scrape raised mid-loop
        # (the original leaked the Chrome process on any exception).
        driver.quit()
    return books
def append_to_csv(data, filename):
    """Append *data* (a list of record dicts) to the CSV at *filename*.

    The header row is written only when the file does not yet exist;
    pandas' append mode creates the file on first use either way.
    """
    is_new_file = not os.path.exists(filename)
    pd.DataFrame(data).to_csv(filename, mode='a', header=is_new_file, index=False)
# (genre listing URL, genre label) pairs to scrape in order.
urls = [
    ('https://www.goodreads.com/genres/most_read/fiction/', 'fiction'),
    ('https://www.goodreads.com/genres/most_read/non-fiction/', 'non-fiction'),
]
output_file = 'books.csv'


def main():
    """Scrape every configured genre and append the results to *output_file*."""
    for url, genre in urls:
        print(f"Scraping {genre} books...")
        books_data = scrape_genre(url, genre)
        append_to_csv(books_data, output_file)
        print(f"Finished scraping {genre} books. Data appended to {output_file}")
        time.sleep(5)  # brief pause between genres to be polite to the server
    print("All scraping complete!")


# Guard the scrape behind the script entry point: the original ran the whole
# browser session as a side effect of merely importing this module.
if __name__ == "__main__":
    main()