# fema-scrape.py
# Scrape https://www.fema.gov/about/reports-and-data/guidance and download
# every linked PDF into ./fema_guidance.

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import re
from urllib.parse import urljoin, unquote
import time

# Setup
output_dir = Path("fema_guidance")
output_dir.mkdir(exist_ok=True)

# Get the page
url = "https://www.fema.gov/about/reports-and-data/guidance"
print(f"Fetching page: {url}")

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
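
# Optional hardening (a sketch, not part of the original script): mounting a
# requests Session with urllib3's Retry makes transient 5xx responses retry
# automatically with backoff. To use it, uncomment the lines below and call
# session.get(...) wherever requests.get(...) appears.
# from requests.adapters import HTTPAdapter
# from urllib3.util.retry import Retry
# session = requests.Session()
# session.mount('https://', HTTPAdapter(max_retries=Retry(
#     total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])))
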
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # Fail fast if the page itself cannot be fetched
soup = BeautifulSoup(response.text, 'html.parser')

# Find all PDF links
pdf_links = []
for link in soup.find_all('a', href=True):
    href = link['href']
    if '.pdf' in href or '/sites/default/files/' in href:
        # Clean up the URL: resolve it against the page, decode any
        # percent-encoding, and strip stray whitespace
        full_url = unquote(urljoin(url, href)).strip()
        title = link.get_text(strip=True) or href.split('/')[-1]
        pdf_links.append({'url': full_url, 'title': title})
        print(f"Found: {title}")

print(f"\nTotal PDFs found: {len(pdf_links)}")

# Download each PDF
downloaded = 0
skipped = 0
failed = 0
for i, pdf in enumerate(pdf_links, 1):
    try:
        # Build a filename from the last path segment, dropping any query string
        filename = pdf['url'].split('/')[-1].split('?')[0]
        # Clean filename: replace spaces and other odd characters
        filename = re.sub(r'[^\w.\-]', '_', filename.strip())
        # Ensure .pdf extension
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        filepath = output_dir / filename

        # Skip if it already exists from an earlier run
        if filepath.exists():
            print(f"[{i}/{len(pdf_links)}] ✓ Already exists: {filename}")
            skipped += 1
            continue

        # Download (stream=True fetches the body in chunks rather than at once)
        print(f"[{i}/{len(pdf_links)}] Downloading: {filename}")
        pdf_response = requests.get(pdf['url'], headers=headers, timeout=60, stream=True)
        pdf_response.raise_for_status()
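
        # Optional sanity check (a sketch, not in the original): a link that
        # matched '/sites/default/files/' may serve something other than a
        # PDF; the Content-Type header is a cheap way to catch that.
        # if 'pdf' not in pdf_response.headers.get('Content-Type', '').lower():
        #     print(f"  ! Warning: {filename} may not be a PDF")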

        # Save to disk in 8 KB chunks so large files never sit fully in memory
        with open(filepath, 'wb') as f:
            for chunk in pdf_response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        size_kb = filepath.stat().st_size / 1024
        print(f"  ✓ Saved: {size_kb:.1f} KB")
        downloaded += 1
        time.sleep(0.5)  # Be nice to the server

    except requests.exceptions.HTTPError as e:
        print(f"  ✗ HTTP Error: {e}")
        failed += 1
    except Exception as e:
        print(f"  ✗ Error: {e}")
        failed += 1
print(f"\n{'='*60}")
print(f"✓ Successfully downloaded: {downloaded}/{len(pdf_links)}")
print(f"✗ Failed: {failed}")
print(f"{'='*60}")
print(f"\n✓ Done! Files saved to: {output_dir.absolute()}")