-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsetup.py
More file actions
391 lines (327 loc) · 15.2 KB
/
setup.py
File metadata and controls
391 lines (327 loc) · 15.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
#!/usr/bin/env python3
"""
Setup Script for Google Maps Scraper
=====================================
Downloads and seeds the top US cities into the database.
Run this once before using the scraper.
Usage:
python setup.py
"""
import sys
import requests
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table
from db import DatabaseManager
console = Console()
# Data source for top US cities
CITIES_JSON_URL = "https://gist.githubusercontent.com/Miserlou/c5cd8364bf9b2420bb29/raw/2bf258763cdddd704f8ffd3ea9a3e81d25e2c6f6/cities.json"
SIMPLEMAPS_URL = "https://simplemaps.com/static/data/us-zips/1.90/basic/simplemaps_uszips_basicv1.90.zip"
SIMPLEMAPS_FALLBACK = "https://raw.githubusercontent.com/scpike/us-state-county-zip/master/geo-data.csv"
def download_cities() -> list:
"""Download city data from the JSON source."""
console.print("[yellow]Downloading city data from GitHub...[/yellow]")
try:
response = requests.get(CITIES_JSON_URL, timeout=30)
response.raise_for_status()
return response.json()
except requests.RequestException as e:
console.print(f"[red]Failed to download city data: {e}[/red]")
sys.exit(1)
def format_city_data(raw_data: list) -> list:
"""
Format raw JSON into database tuples.
Input format: {"city": "New York", "state": "New York", "population": "8405837", "rank": "1"}
Output format: (rank, city, state, population, full_name)
"""
formatted = []
for item in raw_data:
try:
# Create standardized state abbreviation mapping
state_abbrev = get_state_abbreviation(item['state'])
full_name = f"{item['city']}, {state_abbrev}"
formatted.append((
int(item['rank']),
item['city'],
item['state'],
int(item['population']),
full_name
))
except (KeyError, ValueError) as e:
console.print(f"[yellow]Skipping malformed entry: {e}[/yellow]")
continue
return formatted
def get_state_abbreviation(state_name: str) -> str:
"""Convert full state name to two-letter abbreviation."""
abbreviations = {
"Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
"California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
"Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
"Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
"Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
"Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
"Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
"New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
"North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
"Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
"South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
"Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
"Wisconsin": "WI", "Wyoming": "WY", "District of Columbia": "DC"
}
return abbreviations.get(state_name, state_name[:2].upper())
def download_zip_codes() -> list:
"""Download US zip code data with population from simplemaps (free tier)."""
import io, zipfile
console.print("[yellow]Downloading zip code data from simplemaps.com...[/yellow]")
try:
resp = requests.get(SIMPLEMAPS_URL, timeout=60, headers={'User-Agent': 'Mozilla/5.0'})
resp.raise_for_status()
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
csv_name = next(n for n in zf.namelist() if n.endswith('.csv'))
with zf.open(csv_name) as f:
import csv
reader = csv.DictReader(io.TextIOWrapper(f, encoding='utf-8'))
results = []
for row in reader:
zipcode = row.get('zip', '').strip().zfill(5)
if not zipcode or not zipcode.isdigit():
continue
# Skip military/non-standard zips
if row.get('military', '').lower() == 'true':
continue
population = 0
try:
population = int(float(row.get('population', 0) or 0))
except (ValueError, TypeError):
pass
lat = None
lng = None
try:
lat = float(row.get('lat', '') or 0) or None
lng = float(row.get('lng', '') or 0) or None
except (ValueError, TypeError):
pass
results.append((
zipcode,
row.get('state_id', '').strip(), # state_abbr
row.get('state_name', '').strip(), # state_name
row.get('city', '').strip().title(), # city
row.get('county_name', '').strip(), # county
population,
lat,
lng
))
console.print(f"[green]Loaded {len(results)} zip codes from simplemaps.[/green]")
return results
except Exception as e:
console.print(f"[yellow]simplemaps download failed ({e}), trying fallback source...[/yellow]")
# Fallback: scpike CSV (no population data)
try:
resp = requests.get(SIMPLEMAPS_FALLBACK, timeout=30)
resp.raise_for_status()
lines = resp.text.strip().split('\n')
results = []
for line in lines[1:]:
parts = line.split(',')
if len(parts) >= 6:
results.append((parts[3].strip().zfill(5), parts[2].strip(), parts[1].strip(), parts[5].strip().title(), parts[4].strip(), 0, None, None))
console.print(f"[yellow]Loaded {len(results)} zip codes (no population data - fallback source).[/yellow]")
return results
except Exception as e:
console.print(f"[red]Both zip code sources failed: {e}[/red]")
return []
def seed_database():
"""Main setup function: downloads cities and zip codes, populates database."""
console.print(Panel(
"[bold cyan]Google Maps Scraper - Database Setup[/bold cyan]\n"
"This will download and seed the top US cities and zip codes.",
title="Setup"
))
# Initialize database
console.print("\n[bold]Step 1:[/bold] Initializing database...")
db = DatabaseManager()
console.print("[green]Database initialized with WAL mode.[/green]")
# Check if cities already exist
existing_count = db.get_city_count()
if existing_count > 0:
console.print(f"[yellow]Database already contains {existing_count} cities.[/yellow]")
if not console.input("Overwrite? (y/N): ").lower().startswith('y'):
console.print("[dim]Skipping city import.[/dim]")
_seed_zip_codes(db)
show_summary(db)
return
# Download cities
console.print("\n[bold]Step 2:[/bold] Downloading city data...")
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Fetching from GitHub...", total=None)
raw_data = download_cities()
progress.update(task, completed=True)
console.print(f"[green]Downloaded {len(raw_data)} cities.[/green]")
# Format and insert
console.print("\n[bold]Step 3:[/bold] Processing and inserting cities...")
formatted_data = format_city_data(raw_data)
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Inserting into database...", total=None)
inserted = db.insert_city_bulk(formatted_data)
progress.update(task, completed=True)
console.print(f"[green]Inserted {len(formatted_data)} cities into database.[/green]")
# Seed zip codes
_seed_zip_codes(db)
# Show summary
show_summary(db)
db.close()
def _seed_zip_codes(db: DatabaseManager):
"""Download and seed US zip codes if not already present."""
existing_zips = db.get_zip_count()
if existing_zips > 0:
console.print(f"[yellow]Database already contains {existing_zips} zip codes.[/yellow]")
return
console.print("\n[bold]Step 4:[/bold] Downloading US zip codes...")
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Fetching zip codes...", total=None)
zip_data = download_zip_codes()
progress.update(task, completed=True)
if not zip_data:
console.print("[yellow]Skipping zip code seeding (download failed).[/yellow]")
return
console.print(f"[green]Downloaded {len(zip_data)} zip codes.[/green]")
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Inserting zip codes...", total=None)
db.insert_zip_codes_bulk(zip_data)
progress.update(task, completed=True)
console.print(f"[green]Inserted {len(zip_data)} zip codes into database.[/green]")
def show_summary(db: DatabaseManager):
"""Display summary of available presets."""
console.print("\n[bold]Available City Presets:[/bold]")
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Preset", style="cyan")
table.add_column("Cities", justify="right")
table.add_column("Example Cities")
presets = ["top_10", "top_100", "top_1000", "top_2500"]
for preset in presets:
cities = db.get_cities_by_preset(preset)
count = len(cities)
# Get example cities
if count >= 3:
examples = f"{cities[0]}, {cities[1]}, {cities[2]}..."
elif count > 0:
examples = ", ".join(cities)
else:
examples = "N/A"
table.add_row(preset, str(count), examples)
console.print(table)
# Show zip code presets
zip_count = db.get_zip_count()
if zip_count > 0:
console.print(f"\n[bold]Zip Code Presets:[/bold] ({zip_count} total zip codes)")
zip_table = Table(show_header=True, header_style="bold magenta")
zip_table.add_column("Preset", style="cyan")
zip_table.add_column("Zips", justify="right")
zip_table.add_column("Description")
for pname, pdesc in [
("top_10", "10 highest-population zips"),
("top_100", "100 highest-population zips"),
("top_1000", "1,000 highest-population zips"),
("top_5000", "5,000 highest-population zips"),
("top_10000", "10,000 highest-population zips"),
("all", f"All {zip_count} zip codes"),
]:
count = min(int(pname.replace('top_', '')) if pname != 'all' else zip_count, zip_count)
zip_table.add_row(pname, str(count), pdesc)
console.print(zip_table)
console.print(Panel(
"[green]Setup complete![/green]\n\n"
"Next steps:\n"
" Scrape by zip preset: [cyan]python main.py --query \"smoke shop\" --zip-preset top_100 --no-proxy[/cyan]\n"
" Scrape a city's zips: [cyan]python main.py --query \"smoke shop\" --zip-city \"Dallas, TX\" --no-proxy[/cyan]\n"
" Scrape a whole state: [cyan]python main.py --query \"smoke shop\" --zip-state TX --no-proxy[/cyan]\n\n"
"Optional: load Census population for accurate top-N ranking:\n"
" [cyan]python setup.py --enrich-zips[/cyan]",
title="Done"
))
def enrich_zip_populations(db: DatabaseManager):
"""
Fetch ZCTA population from the 2020 US Census API and update zip_codes table.
No API key required for this endpoint.
"""
console.print(Panel(
"Fetching 2020 Census ZCTA populations (~33k records).\n"
"This runs once and enables proper zip-code ranking by population.",
title="Zip Population Enrichment"
))
# Check if already enriched
import sqlite3
max_pop = db.conn.execute("SELECT MAX(population) FROM zip_codes").fetchone()[0]
if max_pop and max_pop > 0:
console.print(f"[yellow]Already enriched ({max_pop:,} max population). Use --force to re-fetch.[/yellow]")
return
# ACS 5-year estimates, 2020 — B01003_001E = total population
url = "https://api.census.gov/data/2020/acs/acs5?get=B01003_001E,NAME&for=zip+code+tabulation+area:*"
console.print("[yellow]Fetching from api.census.gov (this may take 30-60 seconds)...[/yellow]")
try:
resp = requests.get(url, timeout=120)
resp.raise_for_status()
data = resp.json()
# data[0] = ["B01003_001E", "NAME", "zip code tabulation area"]
# data[1:] = [["population", "ZCTA5 XXXXX", "XXXXX"], ...]
pop_map = {}
for row in data[1:]:
try:
pop = int(row[0])
zipcode = row[2].strip().zfill(5)
pop_map[zipcode] = pop
except (ValueError, IndexError, TypeError):
continue
console.print(f"[green]Fetched population for {len(pop_map):,} ZCTAs.[/green]")
with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
task = progress.add_task("Updating zip populations...", total=None)
updated = db.update_zip_populations(pop_map)
progress.update(task, completed=True)
console.print(f"[green]Updated {updated:,} zip codes with population data.[/green]")
# Show top 10 by population as a sanity check
top = db.get_zips_by_preset('top_10')
if top:
console.print(f"\n[bold]Top 10 most-populous zip codes:[/bold]")
for z in top:
info = db.get_zip_info(z)
pop = info.get('population', 0)
city = info.get('city', '?')
state = info.get('state_abbr', '?')
console.print(f" {z} {city}, {state} ({pop:,})")
except Exception as e:
console.print(f"[red]Census API fetch failed: {e}[/red]")
console.print("[dim]Zip presets will use city-population ranking as fallback.[/dim]")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Google Maps Scraper Setup")
parser.add_argument('--enrich-zips', action='store_true',
help='Fetch Census 2020 ZCTA population data and update zip rankings')
parser.add_argument('--force', action='store_true',
help='Force re-download even if data already exists')
args = parser.parse_args()
if args.enrich_zips:
db = DatabaseManager()
if args.force:
db.conn.execute("UPDATE zip_codes SET population = 0")
db.conn.commit()
enrich_zip_populations(db)
db.close()
else:
seed_database()