-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathosu-scrape.py
More file actions
101 lines (80 loc) · 4.06 KB
/
osu-scrape.py
File metadata and controls
101 lines (80 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import asyncio
from playwright.async_api import async_playwright
from pathlib import Path
async def scrape_osu_textbooks():
"""
Scrapes Oregon State Open Education textbooks catalog
and downloads all available PDFs in headless mode
"""
# Create downloads directory
download_dir = Path("./osu_textbooks")
download_dir.mkdir(exist_ok=True)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
accept_downloads=True
)
page = await context.new_page()
all_book_urls = []
# Step 1: Scrape all book URLs from paginated catalog
print("Collecting book URLs from catalog pages...")
for page_num in range(1, 7):
catalog_url = f"https://open.oregonstate.education/?pg={page_num}#catalog"
print(f"Scraping catalog page {page_num}...")
await page.goto(catalog_url, wait_until='networkidle')
await page.wait_for_timeout(2000)
# Extract book links - adjust selector based on actual HTML
book_links = await page.eval_on_selector_all(
'a[href*="open.oregonstate.edu"]',
'''(elements) => elements
.map(el => el.href)
.filter(href =>
href.includes('open.oregonstate.education/') &&
!href.includes('?pg=') &&
!href.includes('#catalog') &&
!href.includes('/h5p-listing') &&
href !== 'https://open.oregonstate.education/'
)
'''
)
unique_links = list(set(book_links))
all_book_urls.extend(unique_links)
print(f"Found {len(unique_links)} books on page {page_num}")
all_book_urls = list(set(all_book_urls))
print(f"\nTotal unique books found: {len(all_book_urls)}\n")
# Step 2: Download PDFs from each book page
for idx, book_url in enumerate(all_book_urls, 1):
try:
print(f"[{idx}/{len(all_book_urls)}] Processing: {book_url}")
# Navigate to book page first
await page.goto(book_url, wait_until='domcontentloaded')
await page.wait_for_timeout(1000)
# Extract book title for filename
title = await page.title()
safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
safe_title = safe_title[:100] or "untitled"
# Construct download URL
download_url = f"{book_url.rstrip('/')}/open/download?type=pdf"
print(f" Downloading from: {download_url}")
# METHOD 1: Use expect_download with JavaScript trigger
async with page.expect_download(timeout=120000) as download_info:
# Trigger download via JavaScript instead of page.goto
await page.evaluate(f"""
() => {{
window.location.href = '{download_url}';
}}
""")
download = await download_info.value
# Save with custom filename
file_path = download_dir / f"{safe_title}.pdf"
await download.save_as(file_path)
print(f" ✓ Saved: {file_path.name}\n")
# Wait a bit between downloads to be respectful
await page.wait_for_timeout(1000)
except Exception as e:
print(f" ✗ Error downloading {book_url}: {str(e)}\n")
continue
await browser.close()
print(f"\n✓ Download complete! Files saved to: {download_dir.absolute()}")
if __name__ == "__main__":
asyncio.run(scrape_osu_textbooks())