-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path stats_scraper.py
More file actions
80 lines (64 loc) · 2.68 KB
/
stats_scraper.py
File metadata and controls
80 lines (64 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# ---------------- CONFIG ----------------
CHROMEDRIVER_PATH = r"C:\WebDriver\chromedriver.exe"
# Season end-years in five-season batches; 2001 means the 2000-01 season.
# Covers 2000-01 through 2024-25 (end-years 2001..2025).
SEASON_GROUPS = [list(range(start, start + 5)) for start in range(2001, 2026, 5)]
OUTPUT_DIR = "data"
FINAL_OUTPUT_CSV = os.path.join(OUTPUT_DIR, "all_players_per36_2001_2025.csv")
# ----------------------------------------

# Ensure the output directory exists before any CSV is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Each group's combined DataFrame is collected here and concatenated at the end.
all_dfs = []
# Scrape each five-season group with a fresh headless Chrome instance,
# saving one CSV per group so a crash never loses more than one batch.
for idx, group in enumerate(SEASON_GROUPS, 1):
    group_rows = []
    # Season labels like "2000-01". BUGFIX: the first label previously used
    # group[-1] for its suffix, printing e.g. "2000-05 to 2004-05" instead
    # of "2000-01 to 2004-05".
    first_label = f"{group[0] - 1}-{str(group[0])[-2:]}"
    last_label = f"{group[-1] - 1}-{str(group[-1])[-2:]}"
    print(f"\nScraping group {idx}: Seasons {first_label} to {last_label}")

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        for season_end_year in group:
            season_str = f"{season_end_year-1}-{str(season_end_year)[-2:]}"
            url = f"https://www.basketball-reference.com/leagues/NBA_{season_end_year}_per_minute.html"
            print(f"  Scraping season {season_str}...")
            try:
                driver.get(url)
                time.sleep(3)  # give the page time to render its tables
                soup = BeautifulSoup(driver.page_source, "html.parser")
                table = soup.find("table", attrs={"id": "per_minute_stats"})
                if not table:
                    print(f"  ⚠️ Table not found for {season_str}, skipping...")
                    continue
                # Wrap the HTML in StringIO: passing a raw string to
                # read_html is deprecated since pandas 2.1.
                df = pd.read_html(StringIO(str(table)))[0]
                df = df[df["Player"] != "Player"].copy()  # remove repeated headers
                df["season"] = season_str
                group_rows.append(df)
            except Exception as e:
                # Best-effort: log and move on so one bad season doesn't
                # abort the whole group.
                print(f"  ⚠️ Skipped {season_str}: {e}")
            time.sleep(2)  # polite delay between requests
    finally:
        # Always release the browser process, even if the inner loop raised
        # something the per-season handler didn't catch.
        driver.quit()

    # Save group CSV
    if group_rows:
        group_df = pd.concat(group_rows, ignore_index=True)
        group_csv = os.path.join(OUTPUT_DIR, f"per36_stats_group_{idx}.csv")
        group_df.to_csv(group_csv, index=False)
        all_dfs.append(group_df)
        print(f"✅ Saved group {idx} -> {group_csv} ({len(group_df)} rows)")
# Stitch every group's DataFrame into one final CSV; if nothing was
# scraped successfully, skip the write entirely.
if all_dfs:
    final_df = pd.concat(all_dfs, ignore_index=True)
    final_df.to_csv(FINAL_OUTPUT_CSV, index=False)
    total_rows = len(final_df)
    print(f"\n🎉 Combined all groups -> {FINAL_OUTPUT_CSV} ({total_rows} rows)")