-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
126 lines (100 loc) · 3 KB
/
scraper.py
File metadata and controls
126 lines (100 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# scraper.py
import os
import time
import json
import logging
import requests
from datetime import datetime, timezone
from dotenv import load_dotenv
load_dotenv()
# Fail fast if the API token is missing: every WAQI request requires it.
TOKEN = os.getenv("WAQI_TOKEN")
if not TOKEN:
    raise RuntimeError("WAQI_TOKEN not found in environment (.env). Create .env with WAQI_TOKEN=your_token")

LOG_FILENAME = "scraper.log"

# Dedicated logger: INFO+ to the console, DEBUG+ to the log file.
logger = logging.getLogger("waqi-scraper")
logger.setLevel(logging.DEBUG)

# Guard against attaching handlers more than once if this module is
# re-imported or re-run (e.g. in a REPL/notebook); without the guard every
# message would be emitted in duplicate.
if not logger.handlers:
    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    # console handler (INFO+)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    logger.addHandler(ch)
    # file handler (DEBUG+)
    fh = logging.FileHandler(LOG_FILENAME)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt)
    logger.addHandler(fh)
# Cities to scrape, as WAQI feed slugs (see https://api.waqi.info/feed/<city>/).
# Entries must be unique: the results dict in scrape_all is keyed by city,
# so a duplicate ("dwarka" appeared twice) costs an extra API request and a
# second rate-limit sleep only to overwrite its own earlier result.
CITIES = [
    "new-delhi",
    "mumbai",
    "dwarka",
    "agra",
    "jaipur",
    "goa",
    "udaipur",
    "kochi",
    "varanasi",
    "amritsar",
    "manali",
    "noida",
    "lucknow",
    "indore",
    "kanpur",
    "muzaffarnagar",
]

# URL template for the WAQI city feed endpoint.
WAQI_BASE = "https://api.waqi.info/feed/{city}/"
# -------------------------
# Helper: fetch single city
# -------------------------
def fetch_city(city: str, token: str, timeout=10):
    """Fetch and parse the current air-quality reading for one city.

    Args:
        city: WAQI feed slug (e.g. "new-delhi").
        token: WAQI API token.
        timeout: per-request timeout in seconds.

    Returns:
        dict of parsed readings (aqi, pollutants, temp, timestamp), or an
        in-band {"error": ...} dict on any network/HTTP/JSON failure or a
        non-"ok" API status — the caller stores whatever comes back.
    """
    url = WAQI_BASE.format(city=city)
    params = {"token": token}
    try:
        resp = requests.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
    # Narrowed from a bare `except Exception`: RequestException covers
    # connection/timeout/HTTP errors, ValueError covers a non-JSON body
    # (requests' JSONDecodeError subclasses ValueError). Anything else is
    # a genuine bug and should propagate.
    except (requests.RequestException, ValueError) as e:
        logger.warning("[%s] HTTP/Network error: %s", city, e)
        return {"error": str(e)}

    if data.get("status") != "ok":
        logger.warning("[%s] API returned non-ok status: %s", city, data)
        return {"error": f"api_status_{data.get('status')}", "raw": data}

    d = data.get("data", {})
    iaqi = d.get("iaqi", {})

    def val(k):
        # iaqi entries look like {"pm25": {"v": 12.3}, ...}; missing -> None.
        return iaqi.get(k, {}).get("v")

    result = {
        "city": city,
        "aqi": d.get("aqi"),
        "pm25": val("pm25"),
        "pm10": val("pm10"),
        "no2": val("no2"),
        "so2": val("so2"),
        "o3": val("o3"),
        "co": val("co"),
        "temp": val("t"),  # WAQI reports temperature under the key "t"
        "time": d.get("time", {}).get("s")
    }
    logger.debug("[%s] Parsed result: %s", city, result)
    return result
def scrape_all(cities, token, out_file="aqi.json", delay=1.0):
    """Fetch AQI data for every city and write the combined result as JSON.

    Args:
        cities: iterable of WAQI city slugs.
        token: WAQI API token.
        out_file: path of the JSON file to write.
        delay: seconds to pause between requests (simple rate limiting).

    Returns:
        The dict that was written to out_file ("last_updated" + per-city data).
    """
    cities = list(cities)  # materialize once so len() works for any iterable
    logger.info("Starting WAQI scrape for %d cities", len(cities))
    results = {}
    for i, city in enumerate(cities):
        logger.info("Fetching: %s", city)
        results[city] = fetch_city(city, token)
        # Rate-limit between requests only — the original slept after the
        # last city too, adding a pointless `delay` to every run.
        if i < len(cities) - 1:
            time.sleep(delay)
    final = {
        # UTC timestamp in ISO-8601 "Z" form.
        "last_updated": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "cities": results
    }
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(final, f, indent=4, ensure_ascii=False)
    logger.info("Saved %s (cities: %d)", out_file, len(results))
    return final
if __name__ == "__main__":
    # Run one full scrape; any uncaught failure is logged with its traceback
    # instead of crashing with a bare stack trace on stderr.
    try:
        scrape_all(CITIES, TOKEN)
    except Exception as err:
        logger.exception("Unhandled exception in scraper: %s", err)