From b09b7f32ca3cf33edbe73e5d2d36e71e55082045 Mon Sep 17 00:00:00 2001 From: Edward Bernal Date: Wed, 28 Jan 2026 22:45:01 -0800 Subject: [PATCH 1/4] Claude fixes --- scraper_logic.py | 658 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 460 insertions(+), 198 deletions(-) diff --git a/scraper_logic.py b/scraper_logic.py index 79cbb49..d57ef11 100644 --- a/scraper_logic.py +++ b/scraper_logic.py @@ -1,257 +1,481 @@ +""" +Scrapes yearly baseball league stats from Baseball Almanac. + +High-level flow: +- Open the year menu page and collect yearly links (AL/NL). +- For each year page, parse player/team tables and a small "events" blurb. +- Flatten into Pandas DataFrames and export to CSV. +""" + +from __future__ import annotations + +import logging import os -import pandas as pd -import json -from time import sleep import re +import argparse from collections import defaultdict +from time import sleep +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Tuple +import pandas as pd from selenium import webdriver from selenium.webdriver.chrome.service import Service as ChromeService -from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By +from webdriver_manager.chrome import ChromeDriverManager + + +YEAR_MENU_URL = "https://www.baseball-almanac.com/yearmenu.shtml" + +# Matches the year-menu link format like ".../yearly/yr1970n.shtml" or ".../yearly/yr1934a.shtml". +YEARLY_LINK_RE = re.compile(r"/yearly/yr(?P\d{4})(?P[an])\.shtml$") + +# Matches the H1 header content on year pages. +YEAR_LEAGUE_HEADER_RE = re.compile(r"(?P\d{4})\s(?PAMERICAN|NATIONAL)\sLEAGUE") + +# Extracts canonical stat table keys from header text. +STAT_TABLE_KEY_RE = re.compile(r"\b(Hitting Statistics|Pitching Statistics|Standings)\b") + + +class Scraper: + def __init__( + self, + *, + headless: bool = True, + profile_dir: str = "selenium_profile", + logger: Optional[logging.Logger] = None, + ): + """ + Initialize the scraper state and create a Selenium Chrome driver. -# Define a directory for the user profile (cache and cookies will be saved here) -profile_dir = os.path.abspath('selenium_profile') + - **headless**: Run Chrome without a visible UI (useful for servers/CI). + - **profile_dir**: Directory where Chrome user-data is stored (cache/cookies). + - **logger**: Optional logger for progress reporting. 
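+
+        Minimal usage sketch (values shown are just the defaults; the CSV
+        outputs are written to the working directory by `scrape()`):
+
+            scraper = Scraper(headless=True, profile_dir="selenium_profile")
+            scraper.scrape(limit_years=2)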
+ """ + self.logger = logger or logging.getLogger(__name__) -# Ensure the directory exists -if not os.path.exists(profile_dir): - os.makedirs(profile_dir) + self.events: Dict[int, Dict[str, List[str]]] = {} + self.player_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) + self.team_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) -options = webdriver.ChromeOptions() -options.add_argument('--headless') # Enable headless mode -options.add_argument('--disable-gpu') # Optional, recommended for Windows -options.add_argument(f"--user-data-dir={profile_dir}") # Specify the user data directory argument + self.profile_dir = os.path.abspath(profile_dir) + os.makedirs(self.profile_dir, exist_ok=True) -driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),options=options) + self.logger.info("Initializing Chrome driver (headless=%s, profile_dir=%s)", headless, self.profile_dir) + self.driver = self._build_driver(headless=headless, profile_dir=self.profile_dir) + self.logger.info("Chrome driver ready") -class Scraper(): - def __init__(self): - self.events = {} - self.player_stats = defaultdict(dict) - self.team_stats = defaultdict(dict) + def _build_driver(self, *, headless: bool, profile_dir: str) -> webdriver.Chrome: + """ + Create and return a configured Chrome WebDriver instance. - def scrape(self): + Note: `webdriver_manager` downloads/chooses an appropriate chromedriver automatically. + """ + options = webdriver.ChromeOptions() + if headless: + options.add_argument("--headless") # Headless mode + options.add_argument("--disable-gpu") # Historically helpful on some platforms + options.add_argument(f"--user-data-dir={profile_dir}") + + return webdriver.Chrome( + service=ChromeService(ChromeDriverManager().install()), + options=options, + ) + + def close(self) -> None: + """Close the Selenium driver (safe to call multiple times).""" + try: + self.logger.info("Closing Chrome driver") + self.driver.quit() + except Exception: + pass + + def scrape(self, *, limit_years: Optional[int] = None) -> None: + """ + Orchestrate the end-to-end scrape and write CSV outputs to the current directory. + + - **limit_years**: If provided, only scrape the first N yearly links (useful for testing). + + Outputs: + - `player_hit.csv`, `player_pitch.csv` + - `team_hit.csv`, `team_pitch.csv` + - `standing.csv` + """ try: - links = self.get_year_links("https://www.baseball-almanac.com/yearmenu.shtml") + self.logger.info("Scrape started") + links = self.get_year_links(YEAR_MENU_URL) + # Example: subset runs for testing # links = ["https://www.baseball-almanac.com/yearly/yr1887n.shtml", "https://www.baseball-almanac.com/yearly/yr1970n.shtml"] + + if limit_years is not None: + if limit_years <= 0: + self.logger.warning("limit_years=%s requested; nothing to scrape", limit_years) + return + self.logger.info("Limiting scrape to first %d yearly links (testing mode)", limit_years) + links = links[:limit_years] + self.log_data(links) - # self.log_data(["https://www.baseball-almanac.com/yearly/yr1970n.shtml", "https://www.baseball-almanac.com/yearly/yr1986n.shtml", "https://www.baseball-almanac.com/yearly/yr1887n.shtml", "https://www.baseball-almanac.com/yearly/yr1883n.shtml", "https://www.baseball-almanac.com/yearly/yr1934a.shtml"]) - except Exception as e: print("Unable to open the url provided.") print(f"Exception: {type(e).__name__} {e}") + finally: + # Always try to close the driver, even if scraping fails mid-run. 
+ self.close() - player_hit_df, player_pitch_df, player_standing_df = self.convert_stats_to_df(self.player_stats) + self.logger.info("Converting scraped stats to DataFrames") + player_hit_df, player_pitch_df, _player_standing_df = self.convert_stats_to_df(self.player_stats) team_hit_df, team_pitch_df, standing_df = self.convert_stats_to_df(self.team_stats) - - # # TODO THIS IS TEST TO MAKE SURE DATA IS CORRECT + + # Useful debugging: # temp = pd.json_normalize(self.player_stats) - # temp.to_csv("test.csv", index = False) - - print(player_hit_df) - print(player_pitch_df) - player_hit_df.to_csv("player_hit.csv", index = False) - player_pitch_df.to_csv("player_pitch.csv", index = False) - team_hit_df.to_csv("team_hit.csv", index = False) - team_pitch_df.to_csv("team_pitch.csv", index = False) - standing_df.to_csv("standing.csv", index = False) - - driver.quit() - - def get_year_links(self, link): - driver.get(link) - search_results = driver.find_elements(By.CSS_SELECTOR, "table.ba-sub > tbody > tr > td.datacolBox > a") - # only scraping data for the American and National leagues - pattern = r"yr\d{4}(a|n)\.shtml$" - links = [link.get_attribute("href") for link in search_results if re.search(pattern, link.get_attribute("href"))] - + # temp.to_csv("test.csv", index=False) + + outputs = [ + ("player_hit.csv", player_hit_df), + ("player_pitch.csv", player_pitch_df), + ("team_hit.csv", team_hit_df), + ("team_pitch.csv", team_pitch_df), + ("standing.csv", standing_df), + ] + + for filename, df in outputs: + self.logger.info("Writing %s (%d rows, %d cols)", filename, len(df.index), len(df.columns)) + df.to_csv(filename, index=False) + + self.logger.info("Scrape finished successfully") + + def get_year_links(self, menu_url: str) -> List[str]: + """ + Load the year-menu page and return yearly AL/NL links. + + Important filter: + - Baseball Almanac also has historical leagues where URLs may end in "a". + We keep **American League only for years >= 1901** and keep all National League years. + """ + self.logger.info("Loading year menu: %s", menu_url) + self.driver.get(menu_url) + + anchors = self.driver.find_elements( + By.CSS_SELECTOR, + "table.ba-sub > tbody > tr > td.datacolBox > a", + ) + + links: List[str] = [] + for a in anchors: + href = a.get_attribute("href") or "" + m = YEARLY_LINK_RE.search(href) + if not m: + continue + + year = int(m.group("year")) + league_code = m.group("league_code") + + # 'n' => National League (keep all years), 'a' => American League (skip pre-1901). + if league_code == "a" and year < 1901: + continue + + links.append(href) + + self.logger.info("Found %d yearly links (post-filter)", len(links)) return links - - # This gets the driver for the new page - def get_driver_new_page(self, link): - driver.get(link) - - def get_year_league(self, driver): - # pulling the header from the intro to get the year and the league - scraped_data = driver.find_element(By.CSS_SELECTOR, "div.intro > h1") - pattern = r"\d{4}\s(AMERICAN|NATIONAL)\sLEAGUE" + + def log_data(self, links: Iterable[str]) -> None: + """ + Visit each yearly link and extract player/team/event data into in-memory dictionaries. + + Note: uses a small `sleep()` after navigation; a more robust improvement is to use + Selenium's WebDriverWait and wait for the tables to exist. 
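+
+        A minimal sketch of that approach (the timeout value is arbitrary; the
+        `table.boxed` selector is the one `get_data()` already relies on):
+
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.webdriver.support.ui import WebDriverWait
+
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.CSS_SELECTOR, "table.boxed"))
+            )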
+ """ + links_list = list(links) + total = len(links_list) + self.logger.info("Scraping %d yearly pages", total) + + for idx, link in enumerate(links_list, start=1): + try: + self.logger.info("(%d/%d) Loading: %s", idx, total, link) + self.driver.get(link) + sleep(2) + except Exception: + self.logger.warning("(%d/%d) Failed to load: %s", idx, total, link) + continue + + year, league = self.get_year_league() + if not year or not league: + self.logger.warning("(%d/%d) Skipping page (could not parse year/league): %s", idx, total, link) + continue + + self.logger.info("(%d/%d) Parsed: year=%s league=%s", idx, total, year, league) + player, team = self.get_data() + self.player_stats[year][league] = player + self.team_stats[year][league] = team + self.logger.info( + "(%d/%d) Extracted tables: player=%d team=%d", + idx, + total, + len(player.keys()), + len(team.keys()), + ) + + # Events are shared per year (AL/NL pages can both include events; first wins). + if year not in self.events: + self.events[year] = self.clean_events() + self.logger.info("(%d/%d) Extracted events keys: %s", idx, total, list(self.events[year].keys())) + + def get_year_league(self) -> Tuple[Optional[int], Optional[str]]: + """ + Parse the current yearly page header to determine (year, league). + + Returns: + - `(year, league)` where league is "American League" or "National League" + - `(None, None)` if the header doesn't match expectations + """ try: - search_result = re.search(pattern, scraped_data.text).group() - if search_result: - year, league = search_result.split(" ", 1) - year, league = int(year), league.title() - if (year >= 1901 and league == "American League") or league == "National League": - return year, league - # TODO This is being raised because American Association has link that also ends in a. Need to fix + header = self.driver.find_element(By.CSS_SELECTOR, "div.intro > h1").text except Exception: - pass + return None, None + + m = YEAR_LEAGUE_HEADER_RE.search(header or "") + if not m: + return None, None + + year = int(m.group("year")) + league = m.group("league").title() + " League" + + # Guardrails: American League is considered modern-era starting 1901. + if league == "American League" and year < 1901: + return None, None + + return year, league + + def get_data(self) -> Tuple[Dict[str, List[Dict[str, str]]], Dict[str, List[Dict[str, str]]]]: + """ + Parse all boxed tables on the current yearly page. + + Returns: + - `player_stats_dict`: maps stat table name -> list of row dicts + - `team_stats_dict`: maps stat table name -> list of row dicts + + Table names are derived from the table's header text (e.g. "Hitting Statistics"). 
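+
+        Illustrative return shape (the row-dict keys are whatever each table's
+        banner row provides; the ones below are placeholders):
+
+            player_stats_dict = {
+                "Hitting Statistics": [
+                    {"banner-column-1": "cell text", "banner-column-2": "cell text"},
+                ],
+            }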
+ """ + player_stats_dict: Dict[str, List[Dict[str, str]]] = {} + team_stats_dict: Dict[str, List[Dict[str, str]]] = {} + + boxed_tables = self.driver.find_elements(By.CSS_SELECTOR, "table.boxed") + self.logger.debug("Found %d boxed tables on page", len(boxed_tables)) + for table in boxed_tables: + col_names: List[str] = [] + duplicate_rows: Dict[int, List[Any]] = {} + table_name: Optional[List[str]] = None + col_num: Optional[int] = None + data_list: List[List[str]] = [] - - # TODO Make this smaller functions T_T - def get_data(self, driver): - player_stats_dict = {} - team_stats_dict = {} - search_results = driver.find_elements(By.CSS_SELECTOR, "table.boxed") - - for result in search_results: - col_names = [] - duplicate_rows = {} - table_name = None - col_num = None - data_list = [] - - rows = result.find_elements(By.TAG_NAME, "tr") + rows = table.find_elements(By.TAG_NAME, "tr") for row in rows: - # league_pattern = r"(American|National)\sLeague" temp_table_name, temp_col_num = self.find_table_name_and_columns(row) - temp_col_names, temp_dup_rows = self.find_col_names(row) - data, temp_dup_rows = self.find_cell_data(row, col_num, duplicate_rows) + temp_col_names, temp_dup_from_header = self.find_col_names(row) + row_data, temp_dup_from_cells = self.find_cell_data(row, col_num, duplicate_rows) + + # Update running table context (name/columns/header-driven dupes). if temp_table_name: table_name = temp_table_name if temp_col_num: col_num = temp_col_num - if temp_dup_rows: - duplicate_rows = temp_dup_rows + if temp_dup_from_header: + duplicate_rows = temp_dup_from_header if temp_col_names: col_names = temp_col_names - - # TODO clean up events (do it in a seperate function??) - if data and col_names: - if len(data) == len(col_names): - data_list.append(data) - + + # Cell-driven rowspan dupes should augment (or replace) the current dupes. + if temp_dup_from_cells is not None: + duplicate_rows = temp_dup_from_cells + + # Only record rows where we have both column headers and a matching-length row. + if row_data and col_names and len(row_data) == len(col_names): + data_list.append(row_data) + if table_name and col_names and data_list: - # Convert the list of rows into a list of dictionaries - list_of_dictionaries = [dict(zip(col_names, row)) for row in data_list] - - # Determine which dictionary to add to + list_of_dicts = [dict(zip(col_names, row)) for row in data_list] if table_name[0] == "Player": - player_stats_dict[table_name[-1]] = list_of_dictionaries + player_stats_dict[table_name[-1]] = list_of_dicts elif table_name[0] == "Team": - team_stats_dict[table_name[-1]] = list_of_dictionaries - + team_stats_dict[table_name[-1]] = list_of_dicts + self.logger.debug( + "Captured table %s (%d rows, %d cols)", + " / ".join(table_name), + len(list_of_dicts), + len(col_names), + ) + return player_stats_dict, team_stats_dict - - def find_table_name_and_columns(self, row): - table_name = [] + + def find_table_name_and_columns(self, row) -> Tuple[Optional[List[str]], Optional[int]]: + """ + Detect the table category/name from header rows and extract expected column count. + + Returns: + - `(table_name_parts, num_cols)` where `table_name_parts` looks like + `["Player", "Hitting Statistics"]` or `["Team", "Standings"]` + - `(None, None)` when the row isn't a header row + """ + table_name: List[str] = [] player_pattern = r"(Player|Pitcher)" team_pattern = r"Team(?= Review)|Team Standings" - stat_name = r"^.+Statistics" + # NOTE: we normalize stat keys to match `convert_stats_to_df()` expectations. 
+ # Pages often contain header strings like: + # "1901 Hitting Statistics League Leaderboards | 1902 →" + # We want the canonical "Hitting Statistics" key. + + headers: List[str] = [] try: - headers = [header.text for header in row.find_elements(By.XPATH, ".//h2 | .//p")] - except: - pass + headers = [h.text for h in row.find_elements(By.XPATH, ".//h2 | .//p")] + except Exception: + return None, None + if not headers: return None, None - num_cols = row.find_element(By.TAG_NAME, "td").get_attribute("colspan") - - if match := re.search(player_pattern, headers[0]): - player = "Player" - table_name.append(player) - if match := re.search(team_pattern, headers[0]) or (match := re.search(team_pattern, headers[1])): - team = match.group().split(" ") - table_name.extend(team) - if match := re.search(stat_name, headers[1]): - stat = match.group() - table_name.append(stat) - - return table_name, int(num_cols) - - def find_col_names(self, row): + + try: + num_cols_attr = row.find_element(By.TAG_NAME, "td").get_attribute("colspan") + num_cols = int(num_cols_attr) if num_cols_attr else None + except Exception: + num_cols = None + + # Player/Pitcher tables are treated as "Player" category. + is_player = bool(re.search(player_pattern, headers[0])) + if is_player: + table_name.append("Player") + + # Team tables can show up in different header positions depending on the page. + header0 = headers[0] if len(headers) > 0 else "" + header1 = headers[1] if len(headers) > 1 else "" + m_team = re.search(team_pattern, header0) or re.search(team_pattern, header1) + if m_team: + table_name.extend(m_team.group().split(" ")) + + # Stat key is usually in the second header line (but we normalize it). + if len(headers) > 1: + m_key = STAT_TABLE_KEY_RE.search(headers[1]) + if m_key: + table_name.append(m_key.group(1)) + + if not table_name: + return None, None + + return table_name, num_cols + + def find_col_names(self, row) -> Tuple[Optional[List[str]], Optional[Dict[int, List[Any]]]]: + """ + Extract column names from a "banner" row and detect header rowspans. + + Returns: + - `col_names`: list of column names + - `duplicate_row_val`: mapping from column index -> [value, remaining_rowspan] + used to fill in missing cells on subsequent rows. + """ + elements = [] try: elements = row.find_elements(By.XPATH, ".//td[contains(@class, 'banner')]") - except: - pass - col_names = [] - duplicate_row_val = {} + except Exception: + return None, None + if not elements: return None, None - regions = ["East", "Central", "West"] - for idx, name in enumerate(elements): - num_rows = name.get_attribute("rowspan") + + col_names: List[str] = [] + duplicate_row_val: Dict[int, List[Any]] = {} + regions = {"East", "Central", "West"} + + for idx, el in enumerate(elements): + num_rows = el.get_attribute("rowspan") if num_rows: - duplicate_row_val[idx] = [name.text, int(num_rows)] - if name.text in regions: + duplicate_row_val[idx] = [el.text, int(num_rows)] + + if el.text in regions: col_names.append("Region") else: - col_names.append(name.text.replace(" [Click for roster]", "").strip()) + col_names.append(el.text.replace(" [Click for roster]", "").strip()) + return col_names, duplicate_row_val - def find_cell_data(self, row, num_cols, duplicate_rows): + def find_cell_data( + self, + row, + num_cols: Optional[int], + duplicate_rows: Dict[int, List[Any]], + ) -> Tuple[Optional[List[str]], Dict[int, List[Any]]]: + """ + Extract cell text for a data row, handling rowspans by re-inserting duplicated values. 
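+
+        Worked example of the rolling rowspan state (cell values here are
+        hypothetical):
+
+            duplicate_rows = {0: ["East", 3]}   # column 0 spans 3 rows
+            # A following short row ["New York", "101"] is backfilled to
+            # ["East", "New York", "101"], and the remaining-rowspan counter
+            # drops to 2; entries are dropped once their counter reaches 0.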
+ + Inputs: + - **num_cols**: expected number of columns for this table (may be None early in parsing) + - **duplicate_rows**: rolling rowspan state from prior rows + """ + cells = [] try: - cells = row.find_elements(By.XPATH, ".//td[contains(@class, 'datacolBox') or contains(@class, 'datacolBlue')]") - except: - pass + cells = row.find_elements( + By.XPATH, + ".//td[contains(@class, 'datacolBox') or contains(@class, 'datacolBlue')]", + ) + except Exception: + return None, duplicate_rows + if not cells: return None, duplicate_rows - data = [] + + data: List[str] = [] for idx, cell in enumerate(cells): num_rows = cell.get_attribute("rowspan") if num_rows: duplicate_rows[idx] = [cell.text, int(num_rows)] data.append(cell.text.strip()) - if len(data) != num_cols: - for idx, value in duplicate_rows.items(): + + # Only attempt to backfill when we actually know the intended width. + if num_cols is not None and len(data) != num_cols: + for idx, value in list(duplicate_rows.items()): data.insert(idx, value[0]) duplicate_rows[idx][1] -= 1 + duplicate_rows = {k: v for k, v in duplicate_rows.items() if v[1] > 0} - # if len(cells) > 1 and len(cells) == len(col_names): - # prev_cells = cells - # cell_results.append(cells) return data, duplicate_rows - def clean_events(self, driver): - # TODO save events links and scrape that for winners - events_dict = {} - row = None + def clean_events(self) -> Dict[str, List[str]]: + """ + Extract the small "Events" / "Salary" text block from the current yearly page. + + Returns a mapping like: + - "Special Events" -> ["...", "..."] + - "Salary" -> ["...", "..."] + """ + events_dict: Dict[str, List[str]] = {} + try: - row = driver.find_element(By.XPATH, ".//td[contains(., 'Events') or contains(., 'Salary')]") - except: - pass - if not row: + row = self.driver.find_element(By.XPATH, ".//td[contains(., 'Events') or contains(., 'Salary')]") + except Exception: return events_dict - - event_text = row.text.split("\n") - - for text in event_text: - text = text.split(": ") - title = text[0] - info = text[1].split(" | ") + + event_text = (row.text or "").split("\n") + for line in event_text: + if ": " not in line: + continue + + title, rhs = line.split(": ", 1) if "Events" in title or "Salary" in title: - events_dict[title] = info + events_dict[title] = rhs.split(" | ") + return events_dict - - # def get_event(self, driver): - # search_results = driver.find_elements(By.CSS_SELECTOR, "table.boxed > tbody > tr") - - # print(search_results) - - def log_data(self, links : list): - for link in links: - try: - driver.get(link) - sleep(2) - except Exception: - pass - year, league = self.get_year_league(driver) - if year and league: - player, team = self.get_data(driver) - self.player_stats[year][league] = player - self.team_stats[year][league] = team - if not self.events.get(year): - events = self.clean_events(driver) - self.events[year] = events - - - def convert_events_to_df(self, dictionary): - # Events will have tables [Events, Salary] - events_list = ["Special Events", "Salary"] - - def convert_stats_to_df(self, dictionary): - hit_table = [] - pitch_table = [] - standing_table = [] - # Current list of tables for stats [Hitting Statistics, Pitching Statistics, Standings] + + def convert_stats_to_df(self, dictionary: Dict[int, Dict[str, Dict[str, List[Dict[str, str]]]]]) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Flatten the nested stats dictionary into three DataFrames: + - hitting stats + - pitching stats + - standings + + Each output row is enriched with 
`Year` and `League`. + """ + hit_table: List[Dict[str, Any]] = [] + pitch_table: List[Dict[str, Any]] = [] + standing_table: List[Dict[str, Any]] = [] + for year, leagues in dictionary.items(): for league, data in leagues.items(): for items in data.get("Hitting Statistics", []): @@ -260,20 +484,58 @@ def convert_stats_to_df(self, dictionary): self.add_to_table(pitch_table, items, year, league) for items in data.get("Standings", []): self.add_to_table(standing_table, items, year, league) - - hit_stats = pd.DataFrame(hit_table) - pitch_stats = pd.DataFrame(pitch_table) - standing_stats = pd.DataFrame(standing_table) - return hit_stats, pitch_stats, standing_stats - + return pd.DataFrame(hit_table), pd.DataFrame(pitch_table), pd.DataFrame(standing_table) + + def add_to_table(self, table: List[Dict[str, Any]], items: Dict[str, Any], year: int, league: str) -> None: + """ + Append a single stats row into an output table, adding Year/League context columns. + """ + if not items: + return + + stats = dict(items) + stats["Year"] = year + stats["League"] = league + table.append(stats) + + def convert_events_to_df(self, dictionary: Dict[int, Dict[str, List[str]]]) -> pd.DataFrame: + """ + Convert the events dictionary into a DataFrame for easier export/analysis. + + Current `events` shape is: + - year -> { "Special Events": [...], "Salary": [...] } + """ + rows: List[Dict[str, Any]] = [] + for year, event_groups in dictionary.items(): + for title, items in event_groups.items(): + rows.append({"Year": year, "Title": title, "Items": " | ".join(items)}) + return pd.DataFrame(rows) - def add_to_table(self, table, items, year, league): - if items: - stats = items.copy() - stats["Year"] = year - stats["League"] = league - table.append(stats) if __name__ == "__main__": - Scraper().scrape() \ No newline at end of file + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(message)s", + ) + + parser = argparse.ArgumentParser(description="Scrape Baseball Almanac yearly stats.") + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit to the first N yearly links (useful for testing). Example: --limit 5", + ) + args = parser.parse_args() + + limit = args.limit + if limit is None: + # Interactive prompt for convenience when running manually. + try: + raw = input("How many years to scrape? 
(press Enter for all): ").strip() + if raw: + limit = int(raw) + except (EOFError, ValueError): + limit = None + + Scraper().scrape(limit_years=limit) \ No newline at end of file From 0dc3e560f74765c7dfd21121f1e4fe1246c28ad6 Mon Sep 17 00:00:00 2001 From: Edward Bernal Date: Wed, 28 Jan 2026 22:55:54 -0800 Subject: [PATCH 2/4] refactored into multiple imports Delete diamond_data_scraper/__pycache__/__init__.cpython-311.pyc Delete diamond_data_scraper/__pycache__/cli.cpython-311.pyc --- .gitignore | 9 + README.md | 4 + diamond_data_scraper/__init__.py | 2 + .../__pycache__/constants.cpython-311.pyc | Bin 0 -> 668 bytes .../__pycache__/scraper.cpython-311.pyc | Bin 0 -> 22754 bytes .../webdriver_factory.cpython-311.pyc | Bin 0 -> 1675 bytes diamond_data_scraper/cli.py | 78 +++ diamond_data_scraper/constants.py | 13 + diamond_data_scraper/scraper.py | 444 ++++++++++++++ diamond_data_scraper/webdriver_factory.py | 28 + scraper_logic.py | 543 +----------------- 11 files changed, 582 insertions(+), 539 deletions(-) create mode 100644 .gitignore create mode 100644 diamond_data_scraper/__init__.py create mode 100644 diamond_data_scraper/__pycache__/constants.cpython-311.pyc create mode 100644 diamond_data_scraper/__pycache__/scraper.cpython-311.pyc create mode 100644 diamond_data_scraper/__pycache__/webdriver_factory.cpython-311.pyc create mode 100644 diamond_data_scraper/cli.py create mode 100644 diamond_data_scraper/constants.py create mode 100644 diamond_data_scraper/scraper.py create mode 100644 diamond_data_scraper/webdriver_factory.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6c0dab6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +# Outputs +*.csv + +# Selenium Chrome profile/cache +selenium_profile/ + +# Python cache +__pycache__/ +*.py[cod] \ No newline at end of file diff --git a/README.md b/README.md index d898a50..a670553 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,6 @@ # Diamond_Data_Scraper Final Capstone project for Code The Dream Python 100 (Python Essentials) + + +python scraper_logic.py --limit 3 +# Limit is optional \ No newline at end of file diff --git a/diamond_data_scraper/__init__.py b/diamond_data_scraper/__init__.py new file mode 100644 index 0000000..f671fe1 --- /dev/null +++ b/diamond_data_scraper/__init__.py @@ -0,0 +1,2 @@ +"""Diamond Data Scraper package.""" + diff --git a/diamond_data_scraper/__pycache__/constants.cpython-311.pyc b/diamond_data_scraper/__pycache__/constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10862fffc12b2a2b813286e3153ef9c2be08a352 GIT binary patch literal 668 zcmZ9K&ui0Q7{}i zo$r2-k|DwHY=CViK)CuA`W?wir#%^0GSL?OA753DRipkw(JyqRQGcZ?nq*N;RbDpL zamDKH9Jq|R{@G)bLFQ8CI`rbiWlk@{xa=d&sAP4ee4Gn$Txh9^VIQk{({37Ce1{*X z?-CrZCR}UTT3tV~4K*&!*RCE#o6TwQbXGi_taA2n`%~`oO0;cFH#)P8 eP6SFEmbNbUuMVTFH`DdjY`qnMVp_xGQ~m|sMZd2A literal 0 HcmV?d00001 diff --git a/diamond_data_scraper/__pycache__/scraper.cpython-311.pyc b/diamond_data_scraper/__pycache__/scraper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c7d9a6c6eb959b43d3f5a2f29da8948dd0fbbc6 GIT binary patch literal 22754 zcmb_^dr%v9nqcdRgai^uyuoh3Jj5IPz{c?d1Ab!S7(elN6rmO%BP84{{LqM<>{Mn) z&T<#*W6Df6S5am%8D`_$d2fVOkI}y zWAFQZEw$7F5@+uk@pbp_{ruj){#ijmJ_Xl5>AvINdz7O7C%&XFz5Jm0;t@PNq!^0P z1gII!m_~!o+JJUUOYXWc9o%&R{fuGEK;rZP<4oRI9(gtdOf%*&^GyC&{)}bJGGiUH z&J>Il%oL6l&e+Cmsj}>2MNoz@P(0%pb3j-gV+uHDO2$fNO2Q3)2XnJ^^6C6l786Fg(cI=vsjiVAQ>*q%?5l> 
zhW6Oq-)pcoom4b>q5p#WLjSShA@`Y~tM2ndQpwe!{`2nPq5cyWhukNJ`Ui*3V{jn` z4_|c;pBgy>?@)|me9j+W+}zF>q`nqvU&m3fIuj|JQ5Ncy+@;4ooNwF{2+#qaXL8O*b1*j?Jpp~DkENsf12dkW z$J>!l3OMPXoN5pFZutWAL?Cp#j|8^U1ED~`=MAUJ4fuo89NpYM+&MDr>PV$I9%5;q z$2&z5(X*aOUmHM{^U<>b&s`te8TNT*=rB$%M|*+{?V-6DY+M~w7YuWC^mt&79aoAA zc*0>{koE_|A^IGo_i*%}C+s=SdS+4u`0mVx*f0(81EZJZd>14;=L`6P{<#_VEE}5e z2Ye&2R7_Jo4+Hgbk|h=DX8f#V41^{peJooFb1dn{L!kg$hL1Wf%(CV1v{9a3f!8pp zKRDSjOtM_};v&RL2mc27uKD5|gg&IAy4mt@Iv%2KD9_>4vVuVBeF*(PiHTBfEQwrx z3f`2IVM{ukqQch9^QQcH>9`lpVM^zSQa6;^^^rG^TZWYS># zBdr=iKtGh$zd#*2b)NtU;X*ff>R_12D#JB?D(DaUJpq5jM~A0iP0M?LtZg4zS6_G4>2l^cAKT8rx~2Ubv@yi`7`oXD+r-!D4TYxtJ}$L^ z(#6V45=%@kJz5T;Ct!m?rIReIW198h+Q$W{3`}IdGBgmLZh+irx;fp?Lk%44OL=SH zCV+RBOG~x@S^N=)YP6st4|8|KWtH+$J3ul|`oi)qkW6lO5VoV+O?Hc9ly@?VYgf`E z)|agE=64g+BjqCwNImQQLFiF2CN;{9bm=8M?B0n?V@Rco`HE5g;&tAALoiK?rfJ?Z{k+umc>l91zVxC{dQmLB zh$SYoGjs^1PSMoKr+;5SqhPRDUwDoUO8Svd(5IjwjKXn(&8hif5gr~=We+eThnEkO0NA9!1JqMt1({IPE{fG_sH-IAHg!jLmAb8Q z=_30F0wEv=xNDWInO>fpX98kEv==GD4Dbwop!amx54hgd5h=l(n-=wm%Rm+gy9-*w zHUNm#g#OT#uoIES65*(OY4@M0?>1?hPwulq337yy`bz201uwAvTW ztXiuR)@r_Hk6_&^TKDqSy;;G6b%$u(!KZ&;KDU*xQyOzo20-H2U~F(*2cchn`6XwA z;=i}Mm--L+{RZ79dJVu0O@~V>SpxnUSc^!sIVlgY4k1pa8DDVDJ;w&vZpgS%kVXgB zfFfRJdmv1ie4Hl4+GO%^(*F6AA6F(H%2Tdr&N7+TN40-?JqnD_G+9<`k7}uHxojB7 zqoxiTvP9#!Y&Tt7EizRdTQ6tB<~pK!MjO?BUt6Kn0j;OvNV7LxQT^ua@M7%?R@B3G zd$F|XE%5^VF>cWw<+(bf?@@{+BAXLG2!gxw1VBp-y5J?WN+-I2+bB9B2W`eDQG;BR4G{P*i{#i)} zf)v|}A$icIxtSm*86hpS-(@3YmEDbbkUSB3BpKvt3o=6m1xvQSj*-803sqFIWLdS-2|LZ# zj`81`6zo%?eTuhFCCh7;1Iyo84v6Kgi-t9;{mrv)oLz1Zth8vQc`J<}S%rNW8&ti1 z`K5~S>=#VOMAI?ebS!!J*y`a+iNlxp%h$xqQ^H}tc-SwP4#o-=zsoltO!n+s?Kz$3 zIsI(UZ;uE)*TkM{z=Fmsi`V04){Ru*j^%mY+MX=f8Q;lUyORY~%h!0TD_KyJ3YXv4 zEk;LvEN|UT)$CfQbY>R;_l9F97sr#OH4lRig7HzI)Fqa>c*j2Yt(=tau|d`Fzx0Ip z&A>D5Z%v<^{q@=3jY6BRikCgY=(spKE?B*y)yrGGNo(Por`|ZV$iy3Brvz)WXl>@n zPc;I)f@!~K+Rvwd9Eu&^>n}dmM}5-T2k_Ir(gC|}!;EUDYq0vSd$aiGN3JA%UI`fSvh$WIt8s!t}sz3%C9#R>dxeFTJzdTYF-!B0)@T@ z+>bu4@Cr&n@WABA-?JJlpX6b93YLVLCh>V(#LdMZk0*Z2>}YEOZCnJ(7Nk^-V}1lVK?! 
zummIZ$3t_#<7^_)vmq|r4(J_v?aHQbvXG|(GA9)PIb__R;eqCb zITqPA$vQAP>K+{$9vZmt@_DHMsUav4S^xMP6okuIGIG!s@06qiAxbh#vZ1+I$>^D# z1x|}S1PNT`3}KVvL=2!plkP!OsmkNB$ou0Dx}usp=guL$V0OP^)<-0kQt1!};*YgCp^Mf}=%rwD69u zmGRi%bNdd_-VkR5d#h+~C8W@TM4E@YfH3erY^&3G8G3Bkih0T(``vURMvxN}|`hLq_m zs?EVMy06E^pz+k@?(DFF|3Ko>GTKKuDF&%U&2@puK!7=k0!E=DD3@=(Kq}Tpb%2$i zM=D8#x}iv@+emFWSQNCes9{K5xj0rs1=~SMV^7BuB@LZJVX}IcNyiwsZsApG;dVxn zjc3uJOy2k5@dM?Rj0?)4Mphfm<94X#VC%Z`wyt~OBURq5N7h908l!pd!$^Lhj3)MZ zYr3>WYeDvnnxHO)mPCyiJwi0^CFon!4DpK8xJCTe(tVJpVGXF(t0`Wn3|rQNx`paB zt;%oy=J_{`k`el)tf*TW)(hj7wSF?qQnxBQSJX&G=<7!ypNv43PDfozX3YN?{mR){ zi<*VMV=QR4j7%vc?-D5a!x{4xh`mUI34!+A0l6G(U$V?XtXQ2y&rKt}lkHRfAW-!g zRQ{vV8%>rhX!f%t+;TztpRiaUCWDpP5C)rd09XR%4mK?~O{hB<2jx19^2AGe>?DV1P9gWQMw#t{ zu#6ocEoDD|P_7l`?ltOHD@N8xZSkZU64a7Zv*dSw&sE z@9`^qMXylND^~QbQ#EE*^ErX{PQ?Zmo$Fd7(raf%fZX>)_vGT!LBW1Xw4dVbr%2>O z+XGu%FW4J^-t+c`=k}6C&wbO9X%YUueD0`Tr?dqX8IUTlK4GutcO89N|8(vdFbSig zeU!J40)1aJts5W%XtUQ%R5iU?)tRX36so$#s&0_)1{XmdE+eHa-y%0vd0_k)=NUgg z9TM!bqJ5UP&k~j)OVggJlD{w}Sg(mzkS?z!t9HI~=&eIPI`Z}rsN(qIV6tORY*4h; zEHk3DK7L%VwkF%UVuK4WFGFZ;yiu^aKs-)s@00dJNxLIyFTr6y{_xC$GxyIfofYhL zqP-4-1|Ob!aO(baM+sU3-OH zz2dIkMg5Xx83w#oH4lSNz0dak?z}8wkMrJ15VTh>&m=C-2$w_R<&bcGRy;o|*l&vV zo4ozzKdv9dxs_*_bHJkcq`2RBtU~)~Mbm)J@Hbiwz>Rw1|8PxZ187q$gx64F&6uR2 zjEqS-07&)4%mLZP>5MrTEJUz|GlJ|5K&SFGuL%|bOOZwRx3c?$719WXa7FBlYCyQm z5u@foP0s;SHA-cRoKfvIisT$sy_y)Ju)9sZ9Jc927#GGEfi9!qtcaVYs7?_#Nt=O* z(@&r`j5+d-LRJV`C6b?aj%7hmNy~WzKch+3$08H&wb2M&ucEOzNGeDb#)N!auqjN>ct9;lgNFndrDcblENWE9_-}Q91jL3w zP@C%hxpriPeHnTvi!}jwisYqa#fU9Ocw|Q*+GUX?F>AIEMr1Jd5=0}e1ZqJLH*yT@ z6$Doipp6YMu0IUkJO+Z0C8OMQ_BzI&Lx3&d5cd;ID+^T&yhh443snS3|21a#TL7TO zv+B${k|h;O*H%l~6D92*jz0?VM_v|6&WZTD7&{(2o;KF)Kx3T@l8&0izQsN?*6jfF z_D3UCPzSoCmC}{!$322|pJ?3|(BB~_}2pW1yS-9&*I9`Vc?0PpaIy{3jdvpEMOZ_8HBO5&5hA! 
zqlmB9tq>XO(!73>dZQTHYG8EpM)=PI4Ht0yFVI>A2{C##Y6D*13>vZ@&F2HBt60P` zeFk1XUyWvnL}Qt^Li#P`mo3v5prvbTv=hybf}Kt6Wkc$)&D*0EkeV!@XS2fH3Q|-d z+zaOo(E_*^yipvr|EV@=hj$y>d#{+?s~T0Cy4~4v_lnhVDScW=)U>sf6wN6MG-z*u zMEB=V=e?+j{XwjFLcrV8ZS;aUy2b1fiWaFyC_659^m65RkT$NMXA~g~*2v#0_*^CpsKOf$ z-BFloFu83X8M09CSpmc3bAXE8b!;O+f^ikBa$}EDy|H;+i>dvBc%D_G(7xJ z^N^YZk^TD4c}=#$CtKXTr_EKjt0(Nfc7YzHeold(Q!NxJ=_3AF@Cwv+bV)iG1IY|p zOXA4Hafr%j`ZYpxHu7J>k@ALSW)DYpWs@EhIXYenguq*cI}8u$RR0HZbFBcre2wDi zbMo&A_fzLjy(_NAqe9~WvGIUV+b7ocDbcd(3uQdXc#e2V{{ElflJplq-iYiP@XUg{ zPkMoDZ7huF=uX;IAPY_xp`;8cCK8rpV0`0qlSJf_Oc(l3xJUZW4oOvndXt%w-MGL& z3Hn*t(pN|#m4T5`@4i4l7PziqgY`E;{-Dbt)A6EBOHi`qEGEfHR)AZUAb}4c&c1~V zIQN2nI_z_!5fr?{FqkxnSRUOV8A@gxb6H^1V=8tY8vqr|%*w^d>A?sK0i@ZG35>Am zH(2_d#0H@a-njpK1WB&x|C472D3`{>Am01-Z zq{zsFX%SVxd^Ir7ur}81)ZwEV-c*r1cqnFDE2~=i&T3g#qO9xV;*WMdZ5PVUiDl{qW)*R)lj>d$e@u%#2-+4SJI1Y%81EAF{t6Hau3OWf`)Fr$2tahDC zbe-d0xgd016uT}8_Vz_vygJ!?aJBbzqWAQ(-9qonV(-g>ea~VcPwyc)1`=HZPY0fr z@#kL^x;$c+2XmSD%Jwx!9k5G9-O1Xn#nVX#=%h78t;roNizkwfCVc5jHXK+SNjjht znxf7m-LZHo>1gB~J;{dsiz90eXdQ&d&E)o>_i^*jXCBS`ihFwCum9wif0F1p%Xgej zI%`&)yAsY_@hgI}Lv(iV&W+@H30v=}AAmrg6X|n{1;BhQ zdzKExPbM5~yrV6-e~5QB#BV$PwM53gPFKJ8e8(4ffetLy{ zZ$xnJUXz9J+62gf&c39x;^FBBrx;%AI<1qr=prTB-?z25e22Lj0<2ckF<%V8;B5SOy|sU{xghHihkGtf*rhrMqCrm z;v6%%RJAL*TE?j6(6-iuGkI!skG4qGmaPqs=1}*bhB3|O1&cv^Rks`uR6xvCDyUDl zT_ubfLUmCCW6tm?p~^7I?5XR9A`GKWKRcbeA35y>UWqLMffel=n=U4Y=L9`}rg|#Z zp2?V}$@sD+b^Xv|%e?t3b6T@HdgF?II+GLZ%LVH3Rlj}B_@zf-FpR81>dKl|1Cup2>XJbYu(HkD&12NMb&r@F zk2rOiV68x<2<#o;c=HGsS?C!AkxXIP20?cK>S=TvBXkp7Xv4lq&_#C=c8{F)NGS)9 zZmM+JvVH)V3z_P}Z7vx88b-4@L!I4lz9S4yOF*BI?IJ&FnoaMAK|b=O~|!AcFOkQc%<5J#M-FiI&0nAUBS zBG6&*yGMP9ht7In_f)X}Hg_H?bYsFhiIuZq^f&9q-uJ zAY9f5p>bkEL%WJtg{zbdSI+fcI4LW4SWNjkg3Ac5B0$U`X<47F^g%XJ<~xm`5<)Y( ztO4SH;|sI6eSvRn#6-x?vHuc)ly19WZ{E{jCDw+LU z%EL@bCGJW;bW6q=H!`J?VaAPy0z^J(_KnBz^SuY#EbrmmQ(t&im= zE81d($+G=1>srlDu*+*vA4O&*&aF1>Pc-fSRq-#&pB@&P&WlawVS-AFLg=Gf~J1D7$#t41YtCQ6`SF76+)omZ1e{}81E}{B}Sbb!*`fQ^5>@&YmeMPLk z5<8o0@8in`f5*ygNKigeRS$c@Yy7P>GE%H2z#!Id#(!3G0`~|Gbb%hEUP(bEnT(NBtTO? zC0Lt8YZGs60=-)8uK0;}t?yY^ZYJn1p6*&~TCQDcS!{Vc9?N@9*MCK1kz=W1`K6UI zp{QFd>gJ0setiC;t52fOE(?1vihD1{j7bY|zdw+vdE%YGTY(=1-wv+s=t=D8dAwWL z(JSug6|DP3>wez4AKGKJ#ctnwY2l^i+JvQww^TiMHi*u)mCKJ`c{C!L1+Obe=+rGvGrK3?o(O=aHH=Ovr-0e~i3Z%qvJ_*vZWy3!TT+|? 
z^0rYA=IHs+>5Hw1=4G7SRE(ZUO+F{JU|5!o&{;X6hFcU{3NB(>D)~0MPApv5E^T%@ zwpIZ$Mqp22Y%>;=tyO<&RhhaMpkn)>VqY*tE-3^oWoAv2ZzTAs^PL{LZX8bI`q(<6 z!ybQ-Yd-1?csL;MO>#+1uIq$k97eh)>4!XE zJJnwXXKE>L08AS!*iiG3!+=sH<9XjCn870s`q}`9_S59>8Vg=sKK42xEnq=VND3Q= z0%d~Y#V0~bkXFhBg=iY`;J{nJ6J!js>c5?uUV+0SpZ{r7x7?U|DLucn!AN{|A((4>~u=hIqhU8`Vu{z1pi(7UI)$c)|$6 zM^WNdBfK2s1qjdcwFocQEc#6mo-T#(3|phOuSZ#5gNnxC>{meL=bK|vjcTV6MZoqA zm!&5NY$w}z@iIpZ+enJWsF~4c;UInhaEYBhZMT0I8{ABLH85|I4JhJl;AfPsS3Xn6n zolKGwAc&Ib@w;%?r5#Xk>RgI~&=?x9861MivgMxK4r!yl6ofMH#%*$;6tx+xr&B7qg4TfyR52p| z-rW=Lj(eWI^7Q7fFFfo2^vcsK%eBj|JcawyD?}gy-{NmVh1h?OgL4SMH~=`H1Nve( z6eG`6+QEnjG77qb^QJ_LtjE!o>5m>m*#AUIkmnQCMhXo|NrXRuuv{XP9)PRZ+1F$u zRBQxR+gY~iY)Cl4!KZl1l(dh5XK)Glu>gn-#fFf9w}VO)nDPt&SDob@yaTy%`G?$o zGrINwRVpFrvXZP-R>$>YX7L6Z#=qKxXcOmdN}Oy=9xFnEbHGkzhisLCX~Cxa+hFj5v?se z`K?(Tyk+N42Uhg&o_g=ppw*0Oa;TOz<6jJ}INEJAtGTpD7u5kyK02>Q zzh&?V)hvLA?lyVh6Dew3Oz;JqLY8jGDt`LsbxqPXS5o=5xny#b+kMl7C*4a5f0Dg!j=;7J1ja1Ks63XgbX zr;JC+Dwn1fgYml6lGa2?>&l=|(k+&BgO0=rOtI0tj{vYk*&NGzc-v0+p+{gv%{!HE zRmKNlUskk<6>axU$A%Vn17Ewmm$%XIi=TXK;CJnXU$UY3XIFoGb!ARy*ef>dU9>C@ zfo~refGT-YC1FBeZ)ls8;ij0dsc4`=A6ff0y1yux()4 zHeZ`^ER!60C^^Srwl`mlW|C5GRj8*D++nC~J_n!$r*{K5)&pDc0kQ_<0x7j>+Zv#s z->3%NH?9Fj@f+0uri-onu<)aWUxaZ7rLK$vQ?CAEn^ZYSbDLB-qQL|n8q-JaE3*CZ z)CZ6dNd?Kd=hV4mMp?tM^LZG~S^2V#DrdWez-eUpWHb76C`Gr^Jh8MvpjgOI8oV;2C3L6OgpCjAT}}PMT->J4h{= zp|z=07Sh1fOJTZE@;mVngwN3s&p(bh3$p(iYr`tHbkIHs5s@kd`)xk_`~Z<)KL9|2 zPO7yX#PGI`m?hcS9V=X`XY~^|8Ga!Q_ymuCSo|}?%)6WWiT=1^# zJsUa=VW@Qr;>jtMnK#F`_3FN=;t8ib2OnC&FVd{UKp`tjZ;yPg^a=Lykyf+xRJ=J+)DFV^f6 zYXI45im-7Avnvih_C2}qbf4fnEjmy0M{bFi-8 zpzEhsvH!x0PX5o_Hm0qxaIU_{LWL>7}NqEmt_3Ul-0WRo4 zh<=S)Bf4v|BWW#;9skVkT)cSSvSfk72j$fl@ZZIEPRr znySDLRsNAV-PlRyEQelZ922f@ShK01N+GR~y?=lf$$kkhOzzGQ;-w>}mLd1%YZJNa zRtThan;FN|q1lGC<#-A(+P8DiIgq)Yh2d?Gdd@jRSVy|Z=>Zwx5Oz~$<1)vD?#eja zlfmCaKj8ClesB$!zi`(HALt9*<@{U+`(L5|E*<+n;0E7BL@#Q|0OuuVxHQJamBhvn z{5J#=fD{YKB93M6Bfy0vS!Mo_oPp&C?)KZ9dJ=U#Lfsy*ZV!lQ<|}}S5W-1OK%#9)x^0#2Nzgq4y+@=W zg{A;gAOu8eSe#4AbMXt0hn^1o?deZXKbsQW|0DD2U&p^!? zh2t5vS`jpzrdnCxH_9IuLv;oD+!0}1{t^Kx?^juA z{|i3rizz1%c(CgP=jp5(AHqT%JvJ9Y{s@##RYEQOyu3JhC zUF+0#pv!0ggP|H!QjIO3(_goA7!Isc+ktTnZ2%pK8VqV040w`f6CBZ$7{Iiu23@^| zo^?tMSWUM9zL}v06Pii`Xg1Vfe=R0dgX0>90bJ(PpsmdCDzs@6>@i`J)WB&pfXCi8 zK%NI2c~=##fQdQuONFUM3eDCaco%?V0a}M23k5NYOMM9tKjTKu^1=6G?&A4G7AYBv zn1)471&aiN%|~ECUa@CqOcihM)o$lJ7Q)x zsa*wp$PXx`kU(3iliT9_iv9x0Y_V7f6iN=gskpuP(!8^)l~GHeqtUz1JM+%R`^+=* zW2ux!Q2t4MX}KwcewR*C)EFpdw*fpx1QEqXj?z*TiB(%|sqjtNDMxE*QJZF`olGkm z;k2zgxmFHvi4-4ctvpe8P<7!CIcZf13)pp;hdqnA;tu$^2eeJNb)Rx@vOARDw>lKy z(ycwt9J&>CeSlrOOL?g8QA}(qM5whHv%^AFc65M)NWA>-E*AnR&iIH`#+9@&=S!oMMj}*d<1M!Ui@v%$^Sw zpj@x}g7P}S9)=FSQB}h%G)TdRS=NiLM70pA4CLLz-d>o+Z4m+8!S^T(5nKZv!&02? zhw+fGhzT(<-}N2pdSNE+OoFC2ic2wt*@za~uBNy=PA=8R%V31mEzo@aQS%enl4uH> zb#2z%u{xB3ioq&FbUk*Udp~wAct18;9G^w9KRJuL=V))M_KG7HBm9&lj!) 
z3)e>rtHHwR;rrw4r3q5fwXuHbR9`vKSDt(}($@lgZK$t}wZf^ke4;Hsd2^&y1FbsL zs8+TMxALp|<|K@M=)F`mFrMua&n)l^a3j#%bl|N#*9KvJq4^M&-?* zyg4pknWR(2%%2F%U(!raDxEolEKoHQ)2aiGxs{iI7o;i5{z8~H&8`nG6g5pQIp^{+ zg}R_NbuHhC&-bmwZOr&=#OyVudg1cz;LCFC@+>A1^)xm*@`&f35nqwY4AN>P+Cx;C7zzoFb=^c$nfaK4Vw?csbKqj!e$b&^?BfS{L{Xjd-) DX8WT% literal 0 HcmV?d00001 diff --git a/diamond_data_scraper/cli.py b/diamond_data_scraper/cli.py new file mode 100644 index 0000000..96b0453 --- /dev/null +++ b/diamond_data_scraper/cli.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import argparse +import logging +from typing import Optional, Sequence + +from .constants import YEAR_MENU_URL +from .scraper import Scraper + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Scrape Baseball Almanac yearly stats.") + parser.add_argument( + "--limit", + type=int, + default=None, + help="Limit to the first N yearly links (useful for testing). Example: --limit 5", + ) + parser.add_argument( + "--no-prompt", + action="store_true", + help="Do not prompt for a limit if --limit is not provided.", + ) + parser.add_argument( + "--headless", + action=argparse.BooleanOptionalAction, + default=True, + help="Run Chrome headless (default: true).", + ) + parser.add_argument( + "--profile-dir", + default="selenium_profile", + help="Chrome user-data directory (default: selenium_profile).", + ) + parser.add_argument( + "--out-dir", + default=".", + help="Directory to write CSV outputs (default: current directory).", + ) + parser.add_argument( + "--log-level", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Logging verbosity (default: INFO).", + ) + return parser + + +def parse_limit_with_optional_prompt(*, limit: Optional[int], prompt: bool) -> Optional[int]: + if limit is not None: + return limit + if not prompt: + return None + + try: + raw = input("How many years to scrape? (press Enter for all): ").strip() + if not raw: + return None + return int(raw) + except (EOFError, ValueError): + return None + + +def main(argv: Optional[Sequence[str]] = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s | %(levelname)s | %(message)s", + ) + + limit = parse_limit_with_optional_prompt(limit=args.limit, prompt=not args.no_prompt) + + scraper = Scraper(headless=args.headless, profile_dir=args.profile_dir) + scraper.scrape(menu_url=YEAR_MENU_URL, limit_years=limit, out_dir=args.out_dir) + return 0 + diff --git a/diamond_data_scraper/constants.py b/diamond_data_scraper/constants.py new file mode 100644 index 0000000..c4402c1 --- /dev/null +++ b/diamond_data_scraper/constants.py @@ -0,0 +1,13 @@ +import re + +YEAR_MENU_URL = "https://www.baseball-almanac.com/yearmenu.shtml" + +# Matches the year-menu link format like ".../yearly/yr1970n.shtml" or ".../yearly/yr1934a.shtml". +YEARLY_LINK_RE = re.compile(r"/yearly/yr(?P\d{4})(?P[an])\.shtml$") + +# Matches the H1 header content on year pages. +YEAR_LEAGUE_HEADER_RE = re.compile(r"(?P\d{4})\s(?PAMERICAN|NATIONAL)\sLEAGUE") + +# Extracts canonical stat table keys from header text. 
+STAT_TABLE_KEY_RE = re.compile(r"\b(Hitting Statistics|Pitching Statistics|Standings)\b") + diff --git a/diamond_data_scraper/scraper.py b/diamond_data_scraper/scraper.py new file mode 100644 index 0000000..a0cf073 --- /dev/null +++ b/diamond_data_scraper/scraper.py @@ -0,0 +1,444 @@ +from __future__ import annotations + +import logging +import re +from collections import defaultdict +from time import sleep +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Tuple + +import pandas as pd +from selenium.webdriver.common.by import By + +from .constants import ( + STAT_TABLE_KEY_RE, + YEAR_LEAGUE_HEADER_RE, + YEARLY_LINK_RE, +) +from .webdriver_factory import build_chrome_driver + + +class Scraper: + """ + Scrapes yearly baseball league stats from Baseball Almanac. + + High-level flow: + - Collect yearly links (AL/NL). + - For each year page, parse player/team tables and a small "events" blurb. + - Flatten into Pandas DataFrames and export to CSV. + """ + + def __init__( + self, + *, + headless: bool = True, + profile_dir: str = "selenium_profile", + logger: Optional[logging.Logger] = None, + ): + """ + Initialize the scraper state and create a Selenium Chrome driver. + + - **headless**: Run Chrome without a visible UI. + - **profile_dir**: Directory where Chrome user-data is stored (cache/cookies). + - **logger**: Optional logger for progress reporting. + """ + self.logger = logger or logging.getLogger(__name__) + + self.events: Dict[int, Dict[str, List[str]]] = {} + self.player_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) + self.team_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) + + self.logger.info("Initializing Chrome driver (headless=%s, profile_dir=%s)", headless, profile_dir) + self.driver = build_chrome_driver(headless=headless, profile_dir=profile_dir) + self.logger.info("Chrome driver ready") + + def close(self) -> None: + """Close the Selenium driver (safe to call multiple times).""" + try: + self.logger.info("Closing Chrome driver") + self.driver.quit() + except Exception: + pass + + # ---------- Orchestration ---------- + def scrape( + self, + *, + menu_url: str, + limit_years: Optional[int] = None, + out_dir: str = ".", + ) -> None: + """ + Orchestrate the end-to-end scrape and write CSV outputs. + + - **menu_url**: Year-menu URL to start from. + - **limit_years**: If provided, only scrape the first N yearly links. + - **out_dir**: Directory where CSVs are written. 
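+
+        Example call, mirroring what the CLI wrapper does (the limit and
+        output directory below are illustrative):
+
+            from diamond_data_scraper.constants import YEAR_MENU_URL
+
+            scraper = Scraper(headless=True)
+            scraper.scrape(menu_url=YEAR_MENU_URL, limit_years=3, out_dir=".")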
+ """ + try: + self.logger.info("Scrape started") + links = self.get_year_links(menu_url) + + if limit_years is not None: + if limit_years <= 0: + self.logger.warning("limit_years=%s requested; nothing to scrape", limit_years) + return + self.logger.info("Limiting scrape to first %d yearly links (testing mode)", limit_years) + links = links[:limit_years] + + self.log_data(links) + finally: + self.close() + + self.logger.info("Converting scraped stats to DataFrames") + player_hit_df, player_pitch_df, _player_standing_df = self.convert_stats_to_df(self.player_stats) + team_hit_df, team_pitch_df, standing_df = self.convert_stats_to_df(self.team_stats) + + outputs = [ + ("player_hit.csv", player_hit_df), + ("player_pitch.csv", player_pitch_df), + ("team_hit.csv", team_hit_df), + ("team_pitch.csv", team_pitch_df), + ("standing.csv", standing_df), + ] + + for filename, df in outputs: + path = f"{out_dir.rstrip('/')}/{filename}" + self.logger.info("Writing %s (%d rows, %d cols)", path, len(df.index), len(df.columns)) + df.to_csv(path, index=False) + + self.logger.info("Scrape finished successfully") + + # ---------- Navigation ---------- + def get_year_links(self, menu_url: str) -> List[str]: + """ + Load the year-menu page and return yearly AL/NL links. + + Filter: + - keep all National League years + - keep American League only for years >= 1901 + """ + self.logger.info("Loading year menu: %s", menu_url) + self.driver.get(menu_url) + + anchors = self.driver.find_elements( + By.CSS_SELECTOR, + "table.ba-sub > tbody > tr > td.datacolBox > a", + ) + + links: List[str] = [] + for a in anchors: + href = a.get_attribute("href") or "" + m = YEARLY_LINK_RE.search(href) + if not m: + continue + + year = int(m.group("year")) + league_code = m.group("league_code") + + if league_code == "a" and year < 1901: + continue + + links.append(href) + + self.logger.info("Found %d yearly links (post-filter)", len(links)) + return links + + def log_data(self, links: Iterable[str]) -> None: + """ + Visit each yearly link and extract player/team/event data into in-memory dictionaries. + """ + links_list = list(links) + total = len(links_list) + self.logger.info("Scraping %d yearly pages", total) + + for idx, link in enumerate(links_list, start=1): + try: + self.logger.info("(%d/%d) Loading: %s", idx, total, link) + self.driver.get(link) + sleep(2) + except Exception: + self.logger.warning("(%d/%d) Failed to load: %s", idx, total, link) + continue + + year, league = self.get_year_league() + if not year or not league: + self.logger.warning("(%d/%d) Skipping page (could not parse year/league): %s", idx, total, link) + continue + + self.logger.info("(%d/%d) Parsed: year=%s league=%s", idx, total, year, league) + player, team = self.get_data() + self.player_stats[year][league] = player + self.team_stats[year][league] = team + + self.logger.info( + "(%d/%d) Extracted tables: player=%d team=%d", + idx, + total, + len(player.keys()), + len(team.keys()), + ) + + if year not in self.events: + self.events[year] = self.clean_events() + self.logger.info("(%d/%d) Extracted events keys: %s", idx, total, list(self.events[year].keys())) + + def get_year_league(self) -> Tuple[Optional[int], Optional[str]]: + """ + Parse the current yearly page header to determine (year, league). + + Returns `(None, None)` when the header doesn't match expectations. 
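+
+        For example, a header of "1970 NATIONAL LEAGUE" yields
+        `(1970, "National League")` via the regex's named groups, while a
+        (hypothetical) "1900 AMERICAN LEAGUE" header is rejected by the
+        pre-1901 guard.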
+ """ + try: + header = self.driver.find_element(By.CSS_SELECTOR, "div.intro > h1").text + except Exception: + return None, None + + m = YEAR_LEAGUE_HEADER_RE.search(header or "") + if not m: + return None, None + + year = int(m.group("year")) + league = m.group("league").title() + " League" + + if league == "American League" and year < 1901: + return None, None + + return year, league + + # ---------- Page parsing ---------- + def get_data(self) -> Tuple[Dict[str, List[Dict[str, str]]], Dict[str, List[Dict[str, str]]]]: + """ + Parse all boxed tables on the current yearly page. + + Returns: + - `player_stats_dict`: maps stat table name -> list of row dicts + - `team_stats_dict`: maps stat table name -> list of row dicts + """ + player_stats_dict: Dict[str, List[Dict[str, str]]] = {} + team_stats_dict: Dict[str, List[Dict[str, str]]] = {} + + boxed_tables = self.driver.find_elements(By.CSS_SELECTOR, "table.boxed") + self.logger.debug("Found %d boxed tables on page", len(boxed_tables)) + + for table in boxed_tables: + col_names: List[str] = [] + duplicate_rows: Dict[int, List[Any]] = {} + table_name: Optional[List[str]] = None + col_num: Optional[int] = None + data_list: List[List[str]] = [] + + rows = table.find_elements(By.TAG_NAME, "tr") + for row in rows: + temp_table_name, temp_col_num = self.find_table_name_and_columns(row) + temp_col_names, temp_dup_from_header = self.find_col_names(row) + row_data, temp_dup_from_cells = self.find_cell_data(row, col_num, duplicate_rows) + + if temp_table_name: + table_name = temp_table_name + if temp_col_num: + col_num = temp_col_num + if temp_dup_from_header: + duplicate_rows = temp_dup_from_header + if temp_col_names: + col_names = temp_col_names + if temp_dup_from_cells is not None: + duplicate_rows = temp_dup_from_cells + + if row_data and col_names and len(row_data) == len(col_names): + data_list.append(row_data) + + if table_name and col_names and data_list: + list_of_dicts = [dict(zip(col_names, row)) for row in data_list] + if table_name[0] == "Player": + player_stats_dict[table_name[-1]] = list_of_dicts + elif table_name[0] == "Team": + team_stats_dict[table_name[-1]] = list_of_dicts + + self.logger.debug( + "Captured table %s (%d rows, %d cols)", + " / ".join(table_name), + len(list_of_dicts), + len(col_names), + ) + + return player_stats_dict, team_stats_dict + + def find_table_name_and_columns(self, row) -> Tuple[Optional[List[str]], Optional[int]]: + """ + Detect the table category/name from header rows and extract expected column count. + + Returns: + - `(table_name_parts, num_cols)` where `table_name_parts` looks like + `["Player", "Hitting Statistics"]` or `["Team", "Standings"]` + - `(None, None)` when the row isn't a header row + """ + table_name: List[str] = [] + player_pattern = r"(Player|Pitcher)" + team_pattern = r"Team(?= Review)|Team Standings" + + headers: List[str] = [] + try: + headers = [h.text for h in row.find_elements(By.XPATH, ".//h2 | .//p")] + except Exception: + return None, None + + if not headers: + return None, None + + try: + num_cols_attr = row.find_element(By.TAG_NAME, "td").get_attribute("colspan") + num_cols = int(num_cols_attr) if num_cols_attr else None + except Exception: + num_cols = None + + # Player/Pitcher tables are treated as "Player" category. + is_player = bool(headers and headers[0] and re.search(player_pattern, headers[0])) + if is_player: + table_name.append("Player") + + # Team tables can show up in different header positions depending on the page. 
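+        # e.g. the "Team Standings" heading may sit in either the first or
+        # second header line, so both are checked below.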
+ header0 = headers[0] if len(headers) > 0 else "" + header1 = headers[1] if len(headers) > 1 else "" + m_team = re.search(team_pattern, header0) or re.search(team_pattern, header1) + if m_team: + table_name.extend(m_team.group().split(" ")) + + # Stat key is usually in the second header line (but we normalize it). + if len(headers) > 1: + m_key = STAT_TABLE_KEY_RE.search(headers[1]) + if m_key: + table_name.append(m_key.group(1)) + + if not table_name: + return None, None + + return table_name, num_cols + + def find_col_names(self, row) -> Tuple[Optional[List[str]], Optional[Dict[int, List[Any]]]]: + """ + Extract column names from a "banner" row and detect header rowspans. + """ + try: + elements = row.find_elements(By.XPATH, ".//td[contains(@class, 'banner')]") + except Exception: + return None, None + + if not elements: + return None, None + + col_names: List[str] = [] + duplicate_row_val: Dict[int, List[Any]] = {} + regions = {"East", "Central", "West"} + + for idx, el in enumerate(elements): + num_rows = el.get_attribute("rowspan") + if num_rows: + duplicate_row_val[idx] = [el.text, int(num_rows)] + + if el.text in regions: + col_names.append("Region") + else: + col_names.append(el.text.replace(" [Click for roster]", "").strip()) + + return col_names, duplicate_row_val + + def find_cell_data( + self, + row, + num_cols: Optional[int], + duplicate_rows: Dict[int, List[Any]], + ) -> Tuple[Optional[List[str]], Dict[int, List[Any]]]: + """ + Extract cell text for a data row, handling rowspans by re-inserting duplicated values. + """ + try: + cells = row.find_elements( + By.XPATH, + ".//td[contains(@class, 'datacolBox') or contains(@class, 'datacolBlue')]", + ) + except Exception: + return None, duplicate_rows + + if not cells: + return None, duplicate_rows + + data: List[str] = [] + for idx, cell in enumerate(cells): + num_rows = cell.get_attribute("rowspan") + if num_rows: + duplicate_rows[idx] = [cell.text, int(num_rows)] + data.append(cell.text.strip()) + + if num_cols is not None and len(data) != num_cols: + for idx, value in list(duplicate_rows.items()): + data.insert(idx, value[0]) + duplicate_rows[idx][1] -= 1 + + duplicate_rows = {k: v for k, v in duplicate_rows.items() if v[1] > 0} + return data, duplicate_rows + + def clean_events(self) -> Dict[str, List[str]]: + """ + Extract the small "Events" / "Salary" text block from the current yearly page. 
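+
+        Returns a mapping like (values illustrative):
+
+            {"Special Events": ["...", "..."], "Salary": ["...", "..."]}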
+ """ + events_dict: Dict[str, List[str]] = {} + try: + row = self.driver.find_element(By.XPATH, ".//td[contains(., 'Events') or contains(., 'Salary')]") + except Exception: + return events_dict + + event_text = (row.text or "").split("\n") + for line in event_text: + if ": " not in line: + continue + + title, rhs = line.split(": ", 1) + if "Events" in title or "Salary" in title: + events_dict[title] = rhs.split(" | ") + + return events_dict + + # ---------- DataFrame/output helpers ---------- + def convert_stats_to_df( + self, + dictionary: Dict[int, Dict[str, Dict[str, List[Dict[str, str]]]]], + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Flatten the nested stats dictionary into three DataFrames: + - hitting stats + - pitching stats + - standings + """ + hit_table: List[Dict[str, Any]] = [] + pitch_table: List[Dict[str, Any]] = [] + standing_table: List[Dict[str, Any]] = [] + + for year, leagues in dictionary.items(): + for league, data in leagues.items(): + for items in data.get("Hitting Statistics", []): + self.add_to_table(hit_table, items, year, league) + for items in data.get("Pitching Statistics", []): + self.add_to_table(pitch_table, items, year, league) + for items in data.get("Standings", []): + self.add_to_table(standing_table, items, year, league) + + return pd.DataFrame(hit_table), pd.DataFrame(pitch_table), pd.DataFrame(standing_table) + + def add_to_table(self, table: List[Dict[str, Any]], items: Dict[str, Any], year: int, league: str) -> None: + """Append a single stats row into an output table, adding Year/League context columns.""" + if not items: + return + + stats = dict(items) + stats["Year"] = year + stats["League"] = league + table.append(stats) + + def convert_events_to_df(self, dictionary: Dict[int, Dict[str, List[str]]]) -> pd.DataFrame: + """Convert the events dictionary into a DataFrame for easier export/analysis.""" + rows: List[Dict[str, Any]] = [] + for year, event_groups in dictionary.items(): + for title, items in event_groups.items(): + rows.append({"Year": year, "Title": title, "Items": " | ".join(items)}) + return pd.DataFrame(rows) + diff --git a/diamond_data_scraper/webdriver_factory.py b/diamond_data_scraper/webdriver_factory.py new file mode 100644 index 0000000..296ae12 --- /dev/null +++ b/diamond_data_scraper/webdriver_factory.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import os +from selenium import webdriver +from selenium.webdriver.chrome.service import Service as ChromeService +from webdriver_manager.chrome import ChromeDriverManager + + +def build_chrome_driver(*, headless: bool, profile_dir: str) -> webdriver.Chrome: + """ + Create and return a configured Chrome WebDriver instance. + + Note: `webdriver_manager` downloads/chooses an appropriate chromedriver automatically. + """ + profile_dir = os.path.abspath(profile_dir) + os.makedirs(profile_dir, exist_ok=True) + + options = webdriver.ChromeOptions() + if headless: + options.add_argument("--headless") + options.add_argument("--disable-gpu") + options.add_argument(f"--user-data-dir={profile_dir}") + + return webdriver.Chrome( + service=ChromeService(ChromeDriverManager().install()), + options=options, + ) + diff --git a/scraper_logic.py b/scraper_logic.py index d57ef11..5c16b7b 100644 --- a/scraper_logic.py +++ b/scraper_logic.py @@ -1,541 +1,6 @@ -""" -Scrapes yearly baseball league stats from Baseball Almanac. - -High-level flow: -- Open the year menu page and collect yearly links (AL/NL). 
-- For each year page, parse player/team tables and a small "events" blurb. -- Flatten into Pandas DataFrames and export to CSV. -""" - -from __future__ import annotations - -import logging -import os -import re -import argparse -from collections import defaultdict -from time import sleep -from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Tuple - -import pandas as pd -from selenium import webdriver -from selenium.webdriver.chrome.service import Service as ChromeService -from selenium.webdriver.common.by import By -from webdriver_manager.chrome import ChromeDriverManager - - -YEAR_MENU_URL = "https://www.baseball-almanac.com/yearmenu.shtml" - -# Matches the year-menu link format like ".../yearly/yr1970n.shtml" or ".../yearly/yr1934a.shtml". -YEARLY_LINK_RE = re.compile(r"/yearly/yr(?P\d{4})(?P[an])\.shtml$") - -# Matches the H1 header content on year pages. -YEAR_LEAGUE_HEADER_RE = re.compile(r"(?P\d{4})\s(?PAMERICAN|NATIONAL)\sLEAGUE") - -# Extracts canonical stat table keys from header text. -STAT_TABLE_KEY_RE = re.compile(r"\b(Hitting Statistics|Pitching Statistics|Standings)\b") - - -class Scraper: - def __init__( - self, - *, - headless: bool = True, - profile_dir: str = "selenium_profile", - logger: Optional[logging.Logger] = None, - ): - """ - Initialize the scraper state and create a Selenium Chrome driver. - - - **headless**: Run Chrome without a visible UI (useful for servers/CI). - - **profile_dir**: Directory where Chrome user-data is stored (cache/cookies). - - **logger**: Optional logger for progress reporting. - """ - self.logger = logger or logging.getLogger(__name__) - - self.events: Dict[int, Dict[str, List[str]]] = {} - self.player_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) - self.team_stats: DefaultDict[int, Dict[str, Dict[str, List[Dict[str, str]]]]] = defaultdict(dict) - - self.profile_dir = os.path.abspath(profile_dir) - os.makedirs(self.profile_dir, exist_ok=True) - - self.logger.info("Initializing Chrome driver (headless=%s, profile_dir=%s)", headless, self.profile_dir) - self.driver = self._build_driver(headless=headless, profile_dir=self.profile_dir) - self.logger.info("Chrome driver ready") - - def _build_driver(self, *, headless: bool, profile_dir: str) -> webdriver.Chrome: - """ - Create and return a configured Chrome WebDriver instance. - - Note: `webdriver_manager` downloads/chooses an appropriate chromedriver automatically. - """ - options = webdriver.ChromeOptions() - if headless: - options.add_argument("--headless") # Headless mode - options.add_argument("--disable-gpu") # Historically helpful on some platforms - options.add_argument(f"--user-data-dir={profile_dir}") - - return webdriver.Chrome( - service=ChromeService(ChromeDriverManager().install()), - options=options, - ) - - def close(self) -> None: - """Close the Selenium driver (safe to call multiple times).""" - try: - self.logger.info("Closing Chrome driver") - self.driver.quit() - except Exception: - pass - - def scrape(self, *, limit_years: Optional[int] = None) -> None: - """ - Orchestrate the end-to-end scrape and write CSV outputs to the current directory. - - - **limit_years**: If provided, only scrape the first N yearly links (useful for testing). 
- - Outputs: - - `player_hit.csv`, `player_pitch.csv` - - `team_hit.csv`, `team_pitch.csv` - - `standing.csv` - """ - try: - self.logger.info("Scrape started") - links = self.get_year_links(YEAR_MENU_URL) - # Example: subset runs for testing - # links = ["https://www.baseball-almanac.com/yearly/yr1887n.shtml", "https://www.baseball-almanac.com/yearly/yr1970n.shtml"] - - if limit_years is not None: - if limit_years <= 0: - self.logger.warning("limit_years=%s requested; nothing to scrape", limit_years) - return - self.logger.info("Limiting scrape to first %d yearly links (testing mode)", limit_years) - links = links[:limit_years] - - self.log_data(links) - except Exception as e: - print("Unable to open the url provided.") - print(f"Exception: {type(e).__name__} {e}") - finally: - # Always try to close the driver, even if scraping fails mid-run. - self.close() - - self.logger.info("Converting scraped stats to DataFrames") - player_hit_df, player_pitch_df, _player_standing_df = self.convert_stats_to_df(self.player_stats) - team_hit_df, team_pitch_df, standing_df = self.convert_stats_to_df(self.team_stats) - - # Useful debugging: - # temp = pd.json_normalize(self.player_stats) - # temp.to_csv("test.csv", index=False) - - outputs = [ - ("player_hit.csv", player_hit_df), - ("player_pitch.csv", player_pitch_df), - ("team_hit.csv", team_hit_df), - ("team_pitch.csv", team_pitch_df), - ("standing.csv", standing_df), - ] - - for filename, df in outputs: - self.logger.info("Writing %s (%d rows, %d cols)", filename, len(df.index), len(df.columns)) - df.to_csv(filename, index=False) - - self.logger.info("Scrape finished successfully") - - def get_year_links(self, menu_url: str) -> List[str]: - """ - Load the year-menu page and return yearly AL/NL links. - - Important filter: - - Baseball Almanac also has historical leagues where URLs may end in "a". - We keep **American League only for years >= 1901** and keep all National League years. - """ - self.logger.info("Loading year menu: %s", menu_url) - self.driver.get(menu_url) - - anchors = self.driver.find_elements( - By.CSS_SELECTOR, - "table.ba-sub > tbody > tr > td.datacolBox > a", - ) - - links: List[str] = [] - for a in anchors: - href = a.get_attribute("href") or "" - m = YEARLY_LINK_RE.search(href) - if not m: - continue - - year = int(m.group("year")) - league_code = m.group("league_code") - - # 'n' => National League (keep all years), 'a' => American League (skip pre-1901). - if league_code == "a" and year < 1901: - continue - - links.append(href) - - self.logger.info("Found %d yearly links (post-filter)", len(links)) - return links - - def log_data(self, links: Iterable[str]) -> None: - """ - Visit each yearly link and extract player/team/event data into in-memory dictionaries. - - Note: uses a small `sleep()` after navigation; a more robust improvement is to use - Selenium's WebDriverWait and wait for the tables to exist. 
- """ - links_list = list(links) - total = len(links_list) - self.logger.info("Scraping %d yearly pages", total) - - for idx, link in enumerate(links_list, start=1): - try: - self.logger.info("(%d/%d) Loading: %s", idx, total, link) - self.driver.get(link) - sleep(2) - except Exception: - self.logger.warning("(%d/%d) Failed to load: %s", idx, total, link) - continue - - year, league = self.get_year_league() - if not year or not league: - self.logger.warning("(%d/%d) Skipping page (could not parse year/league): %s", idx, total, link) - continue - - self.logger.info("(%d/%d) Parsed: year=%s league=%s", idx, total, year, league) - player, team = self.get_data() - self.player_stats[year][league] = player - self.team_stats[year][league] = team - self.logger.info( - "(%d/%d) Extracted tables: player=%d team=%d", - idx, - total, - len(player.keys()), - len(team.keys()), - ) - - # Events are shared per year (AL/NL pages can both include events; first wins). - if year not in self.events: - self.events[year] = self.clean_events() - self.logger.info("(%d/%d) Extracted events keys: %s", idx, total, list(self.events[year].keys())) - - def get_year_league(self) -> Tuple[Optional[int], Optional[str]]: - """ - Parse the current yearly page header to determine (year, league). - - Returns: - - `(year, league)` where league is "American League" or "National League" - - `(None, None)` if the header doesn't match expectations - """ - try: - header = self.driver.find_element(By.CSS_SELECTOR, "div.intro > h1").text - except Exception: - return None, None - - m = YEAR_LEAGUE_HEADER_RE.search(header or "") - if not m: - return None, None - - year = int(m.group("year")) - league = m.group("league").title() + " League" - - # Guardrails: American League is considered modern-era starting 1901. - if league == "American League" and year < 1901: - return None, None - - return year, league - - def get_data(self) -> Tuple[Dict[str, List[Dict[str, str]]], Dict[str, List[Dict[str, str]]]]: - """ - Parse all boxed tables on the current yearly page. - - Returns: - - `player_stats_dict`: maps stat table name -> list of row dicts - - `team_stats_dict`: maps stat table name -> list of row dicts - - Table names are derived from the table's header text (e.g. "Hitting Statistics"). - """ - player_stats_dict: Dict[str, List[Dict[str, str]]] = {} - team_stats_dict: Dict[str, List[Dict[str, str]]] = {} - - boxed_tables = self.driver.find_elements(By.CSS_SELECTOR, "table.boxed") - self.logger.debug("Found %d boxed tables on page", len(boxed_tables)) - for table in boxed_tables: - col_names: List[str] = [] - duplicate_rows: Dict[int, List[Any]] = {} - table_name: Optional[List[str]] = None - col_num: Optional[int] = None - data_list: List[List[str]] = [] - - rows = table.find_elements(By.TAG_NAME, "tr") - for row in rows: - temp_table_name, temp_col_num = self.find_table_name_and_columns(row) - temp_col_names, temp_dup_from_header = self.find_col_names(row) - row_data, temp_dup_from_cells = self.find_cell_data(row, col_num, duplicate_rows) - - # Update running table context (name/columns/header-driven dupes). - if temp_table_name: - table_name = temp_table_name - if temp_col_num: - col_num = temp_col_num - if temp_dup_from_header: - duplicate_rows = temp_dup_from_header - if temp_col_names: - col_names = temp_col_names - - # Cell-driven rowspan dupes should augment (or replace) the current dupes. 
- if temp_dup_from_cells is not None: - duplicate_rows = temp_dup_from_cells - - # Only record rows where we have both column headers and a matching-length row. - if row_data and col_names and len(row_data) == len(col_names): - data_list.append(row_data) - - if table_name and col_names and data_list: - list_of_dicts = [dict(zip(col_names, row)) for row in data_list] - if table_name[0] == "Player": - player_stats_dict[table_name[-1]] = list_of_dicts - elif table_name[0] == "Team": - team_stats_dict[table_name[-1]] = list_of_dicts - self.logger.debug( - "Captured table %s (%d rows, %d cols)", - " / ".join(table_name), - len(list_of_dicts), - len(col_names), - ) - - return player_stats_dict, team_stats_dict - - def find_table_name_and_columns(self, row) -> Tuple[Optional[List[str]], Optional[int]]: - """ - Detect the table category/name from header rows and extract expected column count. - - Returns: - - `(table_name_parts, num_cols)` where `table_name_parts` looks like - `["Player", "Hitting Statistics"]` or `["Team", "Standings"]` - - `(None, None)` when the row isn't a header row - """ - table_name: List[str] = [] - player_pattern = r"(Player|Pitcher)" - team_pattern = r"Team(?= Review)|Team Standings" - # NOTE: we normalize stat keys to match `convert_stats_to_df()` expectations. - # Pages often contain header strings like: - # "1901 Hitting Statistics League Leaderboards | 1902 →" - # We want the canonical "Hitting Statistics" key. - - headers: List[str] = [] - try: - headers = [h.text for h in row.find_elements(By.XPATH, ".//h2 | .//p")] - except Exception: - return None, None - - if not headers: - return None, None - - try: - num_cols_attr = row.find_element(By.TAG_NAME, "td").get_attribute("colspan") - num_cols = int(num_cols_attr) if num_cols_attr else None - except Exception: - num_cols = None - - # Player/Pitcher tables are treated as "Player" category. - is_player = bool(re.search(player_pattern, headers[0])) - if is_player: - table_name.append("Player") - - # Team tables can show up in different header positions depending on the page. - header0 = headers[0] if len(headers) > 0 else "" - header1 = headers[1] if len(headers) > 1 else "" - m_team = re.search(team_pattern, header0) or re.search(team_pattern, header1) - if m_team: - table_name.extend(m_team.group().split(" ")) - - # Stat key is usually in the second header line (but we normalize it). - if len(headers) > 1: - m_key = STAT_TABLE_KEY_RE.search(headers[1]) - if m_key: - table_name.append(m_key.group(1)) - - if not table_name: - return None, None - - return table_name, num_cols - - def find_col_names(self, row) -> Tuple[Optional[List[str]], Optional[Dict[int, List[Any]]]]: - """ - Extract column names from a "banner" row and detect header rowspans. - - Returns: - - `col_names`: list of column names - - `duplicate_row_val`: mapping from column index -> [value, remaining_rowspan] - used to fill in missing cells on subsequent rows. 
- """ - elements = [] - try: - elements = row.find_elements(By.XPATH, ".//td[contains(@class, 'banner')]") - except Exception: - return None, None - - if not elements: - return None, None - - col_names: List[str] = [] - duplicate_row_val: Dict[int, List[Any]] = {} - regions = {"East", "Central", "West"} - - for idx, el in enumerate(elements): - num_rows = el.get_attribute("rowspan") - if num_rows: - duplicate_row_val[idx] = [el.text, int(num_rows)] - - if el.text in regions: - col_names.append("Region") - else: - col_names.append(el.text.replace(" [Click for roster]", "").strip()) - - return col_names, duplicate_row_val - - def find_cell_data( - self, - row, - num_cols: Optional[int], - duplicate_rows: Dict[int, List[Any]], - ) -> Tuple[Optional[List[str]], Dict[int, List[Any]]]: - """ - Extract cell text for a data row, handling rowspans by re-inserting duplicated values. - - Inputs: - - **num_cols**: expected number of columns for this table (may be None early in parsing) - - **duplicate_rows**: rolling rowspan state from prior rows - """ - cells = [] - try: - cells = row.find_elements( - By.XPATH, - ".//td[contains(@class, 'datacolBox') or contains(@class, 'datacolBlue')]", - ) - except Exception: - return None, duplicate_rows - - if not cells: - return None, duplicate_rows - - data: List[str] = [] - for idx, cell in enumerate(cells): - num_rows = cell.get_attribute("rowspan") - if num_rows: - duplicate_rows[idx] = [cell.text, int(num_rows)] - data.append(cell.text.strip()) - - # Only attempt to backfill when we actually know the intended width. - if num_cols is not None and len(data) != num_cols: - for idx, value in list(duplicate_rows.items()): - data.insert(idx, value[0]) - duplicate_rows[idx][1] -= 1 - - duplicate_rows = {k: v for k, v in duplicate_rows.items() if v[1] > 0} - return data, duplicate_rows - - def clean_events(self) -> Dict[str, List[str]]: - """ - Extract the small "Events" / "Salary" text block from the current yearly page. - - Returns a mapping like: - - "Special Events" -> ["...", "..."] - - "Salary" -> ["...", "..."] - """ - events_dict: Dict[str, List[str]] = {} - - try: - row = self.driver.find_element(By.XPATH, ".//td[contains(., 'Events') or contains(., 'Salary')]") - except Exception: - return events_dict - - event_text = (row.text or "").split("\n") - for line in event_text: - if ": " not in line: - continue - - title, rhs = line.split(": ", 1) - if "Events" in title or "Salary" in title: - events_dict[title] = rhs.split(" | ") - - return events_dict - - def convert_stats_to_df(self, dictionary: Dict[int, Dict[str, Dict[str, List[Dict[str, str]]]]]) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Flatten the nested stats dictionary into three DataFrames: - - hitting stats - - pitching stats - - standings - - Each output row is enriched with `Year` and `League`. 
- """ - hit_table: List[Dict[str, Any]] = [] - pitch_table: List[Dict[str, Any]] = [] - standing_table: List[Dict[str, Any]] = [] - - for year, leagues in dictionary.items(): - for league, data in leagues.items(): - for items in data.get("Hitting Statistics", []): - self.add_to_table(hit_table, items, year, league) - for items in data.get("Pitching Statistics", []): - self.add_to_table(pitch_table, items, year, league) - for items in data.get("Standings", []): - self.add_to_table(standing_table, items, year, league) - - return pd.DataFrame(hit_table), pd.DataFrame(pitch_table), pd.DataFrame(standing_table) - - def add_to_table(self, table: List[Dict[str, Any]], items: Dict[str, Any], year: int, league: str) -> None: - """ - Append a single stats row into an output table, adding Year/League context columns. - """ - if not items: - return - - stats = dict(items) - stats["Year"] = year - stats["League"] = league - table.append(stats) - - def convert_events_to_df(self, dictionary: Dict[int, Dict[str, List[str]]]) -> pd.DataFrame: - """ - Convert the events dictionary into a DataFrame for easier export/analysis. - - Current `events` shape is: - - year -> { "Special Events": [...], "Salary": [...] } - """ - rows: List[Dict[str, Any]] = [] - for year, event_groups in dictionary.items(): - for title, items in event_groups.items(): - rows.append({"Year": year, "Title": title, "Items": " | ".join(items)}) - return pd.DataFrame(rows) - - if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s | %(levelname)s | %(message)s", - ) - - parser = argparse.ArgumentParser(description="Scrape Baseball Almanac yearly stats.") - parser.add_argument( - "--limit", - type=int, - default=None, - help="Limit to the first N yearly links (useful for testing). Example: --limit 5", - ) - args = parser.parse_args() - - limit = args.limit - if limit is None: - # Interactive prompt for convenience when running manually. - try: - raw = input("How many years to scrape? (press Enter for all): ").strip() - if raw: - limit = int(raw) - except (EOFError, ValueError): - limit = None + # Backwards-compatible entrypoint. 
Prefer running: + # python -m diamond_data_scraper.cli + from diamond_data_scraper.cli import main - Scraper().scrape(limit_years=limit) \ No newline at end of file + raise SystemExit(main()) \ No newline at end of file From 81cd2c2e01801eec9670867b9aa827ca405fb301 Mon Sep 17 00:00:00 2001 From: Edward Bernal Date: Wed, 28 Jan 2026 23:07:55 -0800 Subject: [PATCH 3/4] Delete diamond_data_scraper/__pycache__/webdriver_factory.cpython-311.pyc asdf Delete diamond_data_scraper/__pycache__/constants.cpython-311.pyc Delete diamond_data_scraper/__pycache__/scraper.cpython-311.pyc --- .../__pycache__/constants.cpython-311.pyc | Bin 668 -> 0 bytes .../__pycache__/scraper.cpython-311.pyc | Bin 22754 -> 0 bytes .../webdriver_factory.cpython-311.pyc | Bin 1675 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 diamond_data_scraper/__pycache__/constants.cpython-311.pyc delete mode 100644 diamond_data_scraper/__pycache__/scraper.cpython-311.pyc delete mode 100644 diamond_data_scraper/__pycache__/webdriver_factory.cpython-311.pyc diff --git a/diamond_data_scraper/__pycache__/constants.cpython-311.pyc b/diamond_data_scraper/__pycache__/constants.cpython-311.pyc deleted file mode 100644 index 10862fffc12b2a2b813286e3153ef9c2be08a352..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 668 zcmZ9K&ui0Q7{}i zo$r2-k|DwHY=CViK)CuA`W?wir#%^0GSL?OA753DRipkw(JyqRQGcZ?nq*N;RbDpL zamDKH9Jq|R{@G)bLFQ8CI`rbiWlk@{xa=d&sAP4ee4Gn$Txh9^VIQk{({37Ce1{*X z?-CrZCR}UTT3tV~4K*&!*RCE#o6TwQbXGi_taA2n`%~`oO0;cFH#)P8 eP6SFEmbNbUuMVTFH`DdjY`qnMVp_xGQ~m|sMZd2A diff --git a/diamond_data_scraper/__pycache__/scraper.cpython-311.pyc b/diamond_data_scraper/__pycache__/scraper.cpython-311.pyc deleted file mode 100644 index 1c7d9a6c6eb959b43d3f5a2f29da8948dd0fbbc6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22754 zcmb_^dr%v9nqcdRgai^uyuoh3Jj5IPz{c?d1Ab!S7(elN6rmO%BP84{{LqM<>{Mn) z&T<#*W6Df6S5am%8D`_$d2fVOkI}y zWAFQZEw$7F5@+uk@pbp_{ruj){#ijmJ_Xl5>AvINdz7O7C%&XFz5Jm0;t@PNq!^0P z1gII!m_~!o+JJUUOYXWc9o%&R{fuGEK;rZP<4oRI9(gtdOf%*&^GyC&{)}bJGGiUH z&J>Il%oL6l&e+Cmsj}>2MNoz@P(0%pb3j-gV+uHDO2$fNO2Q3)2XnJ^^6C6l786Fg(cI=vsjiVAQ>*q%?5l> zhW6Oq-)pcoom4b>q5p#WLjSShA@`Y~tM2ndQpwe!{`2nPq5cyWhukNJ`Ui*3V{jn` z4_|c;pBgy>?@)|me9j+W+}zF>q`nqvU&m3fIuj|JQ5Ncy+@;4ooNwF{2+#qaXL8O*b1*j?Jpp~DkENsf12dkW z$J>!l3OMPXoN5pFZutWAL?Cp#j|8^U1ED~`=MAUJ4fuo89NpYM+&MDr>PV$I9%5;q z$2&z5(X*aOUmHM{^U<>b&s`te8TNT*=rB$%M|*+{?V-6DY+M~w7YuWC^mt&79aoAA zc*0>{koE_|A^IGo_i*%}C+s=SdS+4u`0mVx*f0(81EZJZd>14;=L`6P{<#_VEE}5e z2Ye&2R7_Jo4+Hgbk|h=DX8f#V41^{peJooFb1dn{L!kg$hL1Wf%(CV1v{9a3f!8pp zKRDSjOtM_};v&RL2mc27uKD5|gg&IAy4mt@Iv%2KD9_>4vVuVBeF*(PiHTBfEQwrx z3f`2IVM{ukqQch9^QQcH>9`lpVM^zSQa6;^^^rG^TZWYS># zBdr=iKtGh$zd#*2b)NtU;X*ff>R_12D#JB?D(DaUJpq5jM~A0iP0M?LtZg4zS6_G4>2l^cAKT8rx~2Ubv@yi`7`oXD+r-!D4TYxtJ}$L^ z(#6V45=%@kJz5T;Ct!m?rIReIW198h+Q$W{3`}IdGBgmLZh+irx;fp?Lk%44OL=SH zCV+RBOG~x@S^N=)YP6st4|8|KWtH+$J3ul|`oi)qkW6lO5VoV+O?Hc9ly@?VYgf`E z)|agE=64g+BjqCwNImQQLFiF2CN;{9bm=8M?B0n?V@Rco`HE5g;&tAALoiK?rfJ?Z{k+umc>l91zVxC{dQmLB zh$SYoGjs^1PSMoKr+;5SqhPRDUwDoUO8Svd(5IjwjKXn(&8hif5gr~=We+eThnEkO0NA9!1JqMt1({IPE{fG_sH-IAHg!jLmAb8Q z=_30F0wEv=xNDWInO>fpX98kEv==GD4Dbwop!amx54hgd5h=l(n-=wm%Rm+gy9-*w zHUNm#g#OT#uoIES65*(OY4@M0?>1?hPwulq337yy`bz201uwAvTW ztXiuR)@r_Hk6_&^TKDqSy;;G6b%$u(!KZ&;KDU*xQyOzo20-H2U~F(*2cchn`6XwA z;=i}Mm--L+{RZ79dJVu0O@~V>SpxnUSc^!sIVlgY4k1pa8DDVDJ;w&vZpgS%kVXgB zfFfRJdmv1ie4Hl4+GO%^(*F6AA6F(H%2Tdr&N7+TN40-?JqnD_G+9<`k7}uHxojB7 
zqoxiTvP9#!Y&Tt7EizRdTQ6tB<~pK!MjO?BUt6Kn0j;OvNV7LxQT^ua@M7%?R@B3G zd$F|XE%5^VF>cWw<+(bf?@@{+BAXLG2!gxw1VBp-y5J?WN+-I2+bB9B2W`eDQG;BR4G{P*i{#i)} zf)v|}A$icIxtSm*86hpS-(@3YmEDbbkUSB3BpKvt3o=6m1xvQSj*-803sqFIWLdS-2|LZ# zj`81`6zo%?eTuhFCCh7;1Iyo84v6Kgi-t9;{mrv)oLz1Zth8vQc`J<}S%rNW8&ti1 z`K5~S>=#VOMAI?ebS!!J*y`a+iNlxp%h$xqQ^H}tc-SwP4#o-=zsoltO!n+s?Kz$3 zIsI(UZ;uE)*TkM{z=Fmsi`V04){Ru*j^%mY+MX=f8Q;lUyORY~%h!0TD_KyJ3YXv4 zEk;LvEN|UT)$CfQbY>R;_l9F97sr#OH4lRig7HzI)Fqa>c*j2Yt(=tau|d`Fzx0Ip z&A>D5Z%v<^{q@=3jY6BRikCgY=(spKE?B*y)yrGGNo(Por`|ZV$iy3Brvz)WXl>@n zPc;I)f@!~K+Rvwd9Eu&^>n}dmM}5-T2k_Ir(gC|}!;EUDYq0vSd$aiGN3JA%UI`fSvh$WIt8s!t}sz3%C9#R>dxeFTJzdTYF-!B0)@T@ z+>bu4@Cr&n@WABA-?JJlpX6b93YLVLCh>V(#LdMZk0*Z2>}YEOZCnJ(7Nk^-V}1lVK?! zummIZ$3t_#<7^_)vmq|r4(J_v?aHQbvXG|(GA9)PIb__R;eqCb zITqPA$vQAP>K+{$9vZmt@_DHMsUav4S^xMP6okuIGIG!s@06qiAxbh#vZ1+I$>^D# z1x|}S1PNT`3}KVvL=2!plkP!OsmkNB$ou0Dx}usp=guL$V0OP^)<-0kQt1!};*YgCp^Mf}=%rwD69u zmGRi%bNdd_-VkR5d#h+~C8W@TM4E@YfH3erY^&3G8G3Bkih0T(``vURMvxN}|`hLq_m zs?EVMy06E^pz+k@?(DFF|3Ko>GTKKuDF&%U&2@puK!7=k0!E=DD3@=(Kq}Tpb%2$i zM=D8#x}iv@+emFWSQNCes9{K5xj0rs1=~SMV^7BuB@LZJVX}IcNyiwsZsApG;dVxn zjc3uJOy2k5@dM?Rj0?)4Mphfm<94X#VC%Z`wyt~OBURq5N7h908l!pd!$^Lhj3)MZ zYr3>WYeDvnnxHO)mPCyiJwi0^CFon!4DpK8xJCTe(tVJpVGXF(t0`Wn3|rQNx`paB zt;%oy=J_{`k`el)tf*TW)(hj7wSF?qQnxBQSJX&G=<7!ypNv43PDfozX3YN?{mR){ zi<*VMV=QR4j7%vc?-D5a!x{4xh`mUI34!+A0l6G(U$V?XtXQ2y&rKt}lkHRfAW-!g zRQ{vV8%>rhX!f%t+;TztpRiaUCWDpP5C)rd09XR%4mK?~O{hB<2jx19^2AGe>?DV1P9gWQMw#t{ zu#6ocEoDD|P_7l`?ltOHD@N8xZSkZU64a7Zv*dSw&sE z@9`^qMXylND^~QbQ#EE*^ErX{PQ?Zmo$Fd7(raf%fZX>)_vGT!LBW1Xw4dVbr%2>O z+XGu%FW4J^-t+c`=k}6C&wbO9X%YUueD0`Tr?dqX8IUTlK4GutcO89N|8(vdFbSig zeU!J40)1aJts5W%XtUQ%R5iU?)tRX36so$#s&0_)1{XmdE+eHa-y%0vd0_k)=NUgg z9TM!bqJ5UP&k~j)OVggJlD{w}Sg(mzkS?z!t9HI~=&eIPI`Z}rsN(qIV6tORY*4h; zEHk3DK7L%VwkF%UVuK4WFGFZ;yiu^aKs-)s@00dJNxLIyFTr6y{_xC$GxyIfofYhL zqP-4-1|Ob!aO(baM+sU3-OH zz2dIkMg5Xx83w#oH4lSNz0dak?z}8wkMrJ15VTh>&m=C-2$w_R<&bcGRy;o|*l&vV zo4ozzKdv9dxs_*_bHJkcq`2RBtU~)~Mbm)J@Hbiwz>Rw1|8PxZ187q$gx64F&6uR2 zjEqS-07&)4%mLZP>5MrTEJUz|GlJ|5K&SFGuL%|bOOZwRx3c?$719WXa7FBlYCyQm z5u@foP0s;SHA-cRoKfvIisT$sy_y)Ju)9sZ9Jc927#GGEfi9!qtcaVYs7?_#Nt=O* z(@&r`j5+d-LRJV`C6b?aj%7hmNy~WzKch+3$08H&wb2M&ucEOzNGeDb#)N!auqjN>ct9;lgNFndrDcblENWE9_-}Q91jL3w zP@C%hxpriPeHnTvi!}jwisYqa#fU9Ocw|Q*+GUX?F>AIEMr1Jd5=0}e1ZqJLH*yT@ z6$Doipp6YMu0IUkJO+Z0C8OMQ_BzI&Lx3&d5cd;ID+^T&yhh443snS3|21a#TL7TO zv+B${k|h;O*H%l~6D92*jz0?VM_v|6&WZTD7&{(2o;KF)Kx3T@l8&0izQsN?*6jfF z_D3UCPzSoCmC}{!$322|pJ?3|(BB~_}2pW1yS-9&*I9`Vc?0PpaIy{3jdvpEMOZ_8HBO5&5hA! 
zqlmB9tq>XO(!73>dZQTHYG8EpM)=PI4Ht0yFVI>A2{C##Y6D*13>vZ@&F2HBt60P` zeFk1XUyWvnL}Qt^Li#P`mo3v5prvbTv=hybf}Kt6Wkc$)&D*0EkeV!@XS2fH3Q|-d z+zaOo(E_*^yipvr|EV@=hj$y>d#{+?s~T0Cy4~4v_lnhVDScW=)U>sf6wN6MG-z*u zMEB=V=e?+j{XwjFLcrV8ZS;aUy2b1fiWaFyC_659^m65RkT$NMXA~g~*2v#0_*^CpsKOf$ z-BFloFu83X8M09CSpmc3bAXE8b!;O+f^ikBa$}EDy|H;+i>dvBc%D_G(7xJ z^N^YZk^TD4c}=#$CtKXTr_EKjt0(Nfc7YzHeold(Q!NxJ=_3AF@Cwv+bV)iG1IY|p zOXA4Hafr%j`ZYpxHu7J>k@ALSW)DYpWs@EhIXYenguq*cI}8u$RR0HZbFBcre2wDi zbMo&A_fzLjy(_NAqe9~WvGIUV+b7ocDbcd(3uQdXc#e2V{{ElflJplq-iYiP@XUg{ zPkMoDZ7huF=uX;IAPY_xp`;8cCK8rpV0`0qlSJf_Oc(l3xJUZW4oOvndXt%w-MGL& z3Hn*t(pN|#m4T5`@4i4l7PziqgY`E;{-Dbt)A6EBOHi`qEGEfHR)AZUAb}4c&c1~V zIQN2nI_z_!5fr?{FqkxnSRUOV8A@gxb6H^1V=8tY8vqr|%*w^d>A?sK0i@ZG35>Am zH(2_d#0H@a-njpK1WB&x|C472D3`{>Am01-Z zq{zsFX%SVxd^Ir7ur}81)ZwEV-c*r1cqnFDE2~=i&T3g#qO9xV;*WMdZ5PVUiDl{qW)*R)lj>d$e@u%#2-+4SJI1Y%81EAF{t6Hau3OWf`)Fr$2tahDC zbe-d0xgd016uT}8_Vz_vygJ!?aJBbzqWAQ(-9qonV(-g>ea~VcPwyc)1`=HZPY0fr z@#kL^x;$c+2XmSD%Jwx!9k5G9-O1Xn#nVX#=%h78t;roNizkwfCVc5jHXK+SNjjht znxf7m-LZHo>1gB~J;{dsiz90eXdQ&d&E)o>_i^*jXCBS`ihFwCum9wif0F1p%Xgej zI%`&)yAsY_@hgI}Lv(iV&W+@H30v=}AAmrg6X|n{1;BhQ zdzKExPbM5~yrV6-e~5QB#BV$PwM53gPFKJ8e8(4ffetLy{ zZ$xnJUXz9J+62gf&c39x;^FBBrx;%AI<1qr=prTB-?z25e22Lj0<2ckF<%V8;B5SOy|sU{xghHihkGtf*rhrMqCrm z;v6%%RJAL*TE?j6(6-iuGkI!skG4qGmaPqs=1}*bhB3|O1&cv^Rks`uR6xvCDyUDl zT_ubfLUmCCW6tm?p~^7I?5XR9A`GKWKRcbeA35y>UWqLMffel=n=U4Y=L9`}rg|#Z zp2?V}$@sD+b^Xv|%e?t3b6T@HdgF?II+GLZ%LVH3Rlj}B_@zf-FpR81>dKl|1Cup2>XJbYu(HkD&12NMb&r@F zk2rOiV68x<2<#o;c=HGsS?C!AkxXIP20?cK>S=TvBXkp7Xv4lq&_#C=c8{F)NGS)9 zZmM+JvVH)V3z_P}Z7vx88b-4@L!I4lz9S4yOF*BI?IJ&FnoaMAK|b=O~|!AcFOkQc%<5J#M-FiI&0nAUBS zBG6&*yGMP9ht7In_f)X}Hg_H?bYsFhiIuZq^f&9q-uJ zAY9f5p>bkEL%WJtg{zbdSI+fcI4LW4SWNjkg3Ac5B0$U`X<47F^g%XJ<~xm`5<)Y( ztO4SH;|sI6eSvRn#6-x?vHuc)ly19WZ{E{jCDw+LU z%EL@bCGJW;bW6q=H!`J?VaAPy0z^J(_KnBz^SuY#EbrmmQ(t&im= zE81d($+G=1>srlDu*+*vA4O&*&aF1>Pc-fSRq-#&pB@&P&WlawVS-AFLg=Gf~J1D7$#t41YtCQ6`SF76+)omZ1e{}81E}{B}Sbb!*`fQ^5>@&YmeMPLk z5<8o0@8in`f5*ygNKigeRS$c@Yy7P>GE%H2z#!Id#(!3G0`~|Gbb%hEUP(bEnT(NBtTO? zC0Lt8YZGs60=-)8uK0;}t?yY^ZYJn1p6*&~TCQDcS!{Vc9?N@9*MCK1kz=W1`K6UI zp{QFd>gJ0setiC;t52fOE(?1vihD1{j7bY|zdw+vdE%YGTY(=1-wv+s=t=D8dAwWL z(JSug6|DP3>wez4AKGKJ#ctnwY2l^i+JvQww^TiMHi*u)mCKJ`c{C!L1+Obe=+rGvGrK3?o(O=aHH=Ovr-0e~i3Z%qvJ_*vZWy3!TT+|? 
z^0rYA=IHs+>5Hw1=4G7SRE(ZUO+F{JU|5!o&{;X6hFcU{3NB(>D)~0MPApv5E^T%@ zwpIZ$Mqp22Y%>;=tyO<&RhhaMpkn)>VqY*tE-3^oWoAv2ZzTAs^PL{LZX8bI`q(<6 z!ybQ-Yd-1?csL;MO>#+1uIq$k97eh)>4!XE zJJnwXXKE>L08AS!*iiG3!+=sH<9XjCn870s`q}`9_S59>8Vg=sKK42xEnq=VND3Q= z0%d~Y#V0~bkXFhBg=iY`;J{nJ6J!js>c5?uUV+0SpZ{r7x7?U|DLucn!AN{|A((4>~u=hIqhU8`Vu{z1pi(7UI)$c)|$6 zM^WNdBfK2s1qjdcwFocQEc#6mo-T#(3|phOuSZ#5gNnxC>{meL=bK|vjcTV6MZoqA zm!&5NY$w}z@iIpZ+enJWsF~4c;UInhaEYBhZMT0I8{ABLH85|I4JhJl;AfPsS3Xn6n zolKGwAc&Ib@w;%?r5#Xk>RgI~&=?x9861MivgMxK4r!yl6ofMH#%*$;6tx+xr&B7qg4TfyR52p| z-rW=Lj(eWI^7Q7fFFfo2^vcsK%eBj|JcawyD?}gy-{NmVh1h?OgL4SMH~=`H1Nve( z6eG`6+QEnjG77qb^QJ_LtjE!o>5m>m*#AUIkmnQCMhXo|NrXRuuv{XP9)PRZ+1F$u zRBQxR+gY~iY)Cl4!KZl1l(dh5XK)Glu>gn-#fFf9w}VO)nDPt&SDob@yaTy%`G?$o zGrINwRVpFrvXZP-R>$>YX7L6Z#=qKxXcOmdN}Oy=9xFnEbHGkzhisLCX~Cxa+hFj5v?se z`K?(Tyk+N42Uhg&o_g=ppw*0Oa;TOz<6jJ}INEJAtGTpD7u5kyK02>Q zzh&?V)hvLA?lyVh6Dew3Oz;JqLY8jGDt`LsbxqPXS5o=5xny#b+kMl7C*4a5f0Dg!j=;7J1ja1Ks63XgbX zr;JC+Dwn1fgYml6lGa2?>&l=|(k+&BgO0=rOtI0tj{vYk*&NGzc-v0+p+{gv%{!HE zRmKNlUskk<6>axU$A%Vn17Ewmm$%XIi=TXK;CJnXU$UY3XIFoGb!ARy*ef>dU9>C@ zfo~refGT-YC1FBeZ)ls8;ij0dsc4`=A6ff0y1yux()4 zHeZ`^ER!60C^^Srwl`mlW|C5GRj8*D++nC~J_n!$r*{K5)&pDc0kQ_<0x7j>+Zv#s z->3%NH?9Fj@f+0uri-onu<)aWUxaZ7rLK$vQ?CAEn^ZYSbDLB-qQL|n8q-JaE3*CZ z)CZ6dNd?Kd=hV4mMp?tM^LZG~S^2V#DrdWez-eUpWHb76C`Gr^Jh8MvpjgOI8oV;2C3L6OgpCjAT}}PMT->J4h{= zp|z=07Sh1fOJTZE@;mVngwN3s&p(bh3$p(iYr`tHbkIHs5s@kd`)xk_`~Z<)KL9|2 zPO7yX#PGI`m?hcS9V=X`XY~^|8Ga!Q_ymuCSo|}?%)6WWiT=1^# zJsUa=VW@Qr;>jtMnK#F`_3FN=;t8ib2OnC&FVd{UKp`tjZ;yPg^a=Lykyf+xRJ=J+)DFV^f6 zYXI45im-7Avnvih_C2}qbf4fnEjmy0M{bFi-8 zpzEhsvH!x0PX5o_Hm0qxaIU_{LWL>7}NqEmt_3Ul-0WRo4 zh<=S)Bf4v|BWW#;9skVkT)cSSvSfk72j$fl@ZZIEPRr znySDLRsNAV-PlRyEQelZ922f@ShK01N+GR~y?=lf$$kkhOzzGQ;-w>}mLd1%YZJNa zRtThan;FN|q1lGC<#-A(+P8DiIgq)Yh2d?Gdd@jRSVy|Z=>Zwx5Oz~$<1)vD?#eja zlfmCaKj8ClesB$!zi`(HALt9*<@{U+`(L5|E*<+n;0E7BL@#Q|0OuuVxHQJamBhvn z{5J#=fD{YKB93M6Bfy0vS!Mo_oPp&C?)KZ9dJ=U#Lfsy*ZV!lQ<|}}S5W-1OK%#9)x^0#2Nzgq4y+@=W zg{A;gAOu8eSe#4AbMXt0hn^1o?deZXKbsQW|0DD2U&p^!? zh2t5vS`jpzrdnCxH_9IuLv;oD+!0}1{t^Kx?^juA z{|i3rizz1%c(CgP=jp5(AHqT%JvJ9Y{s@##RYEQOyu3JhC zUF+0#pv!0ggP|H!QjIO3(_goA7!Isc+ktTnZ2%pK8VqV040w`f6CBZ$7{Iiu23@^| zo^?tMSWUM9zL}v06Pii`Xg1Vfe=R0dgX0>90bJ(PpsmdCDzs@6>@i`J)WB&pfXCi8 zK%NI2c~=##fQdQuONFUM3eDCaco%?V0a}M23k5NYOMM9tKjTKu^1=6G?&A4G7AYBv zn1)471&aiN%|~ECUa@CqOcihM)o$lJ7Q)x zsa*wp$PXx`kU(3iliT9_iv9x0Y_V7f6iN=gskpuP(!8^)l~GHeqtUz1JM+%R`^+=* zW2ux!Q2t4MX}KwcewR*C)EFpdw*fpx1QEqXj?z*TiB(%|sqjtNDMxE*QJZF`olGkm z;k2zgxmFHvi4-4ctvpe8P<7!CIcZf13)pp;hdqnA;tu$^2eeJNb)Rx@vOARDw>lKy z(ycwt9J&>CeSlrOOL?g8QA}(qM5whHv%^AFc65M)NWA>-E*AnR&iIH`#+9@&=S!oMMj}*d<1M!Ui@v%$^Sw zpj@x}g7P}S9)=FSQB}h%G)TdRS=NiLM70pA4CLLz-d>o+Z4m+8!S^T(5nKZv!&02? zhw+fGhzT(<-}N2pdSNE+OoFC2ic2wt*@za~uBNy=PA=8R%V31mEzo@aQS%enl4uH> zb#2z%u{xB3ioq&FbUk*Udp~wAct18;9G^w9KRJuL=V))M_KG7HBm9&lj!) 
z3)e>rtHHwR;rrw4r3q5fwXuHbR9`vKSDt(}($@lgZK$t}wZf^ke4;Hsd2^&y1FbsL zs8+TMxALp|<|K@M=)F`mFrMua&n)l^a3j#%bl|N#*9KvJq4^M&-?* zyg4pknWR(2%%2F%U(!raDxEolEKoHQ)2aiGxs{iI7o;i5{z8~H&8`nG6g5pQIp^{+ zg}R_NbuHhC&-bmwZOr&=#OyVudg1cz;LCFC@+>A1^)xm*@`&f35nqwY4AN>P+Cx;C7zzoFb=^c$nfaK4Vw?csbKqj!e$b&^?BfS{L{Xjd-) DX8WT% From c5afcfcf492880904602e835111a86d216037e22 Mon Sep 17 00:00:00 2001 From: Edward Bernal Date: Wed, 28 Jan 2026 23:45:23 -0800 Subject: [PATCH 4/4] cleaned up older years --- .DS_Store | Bin 0 -> 6148 bytes diamond_data_scraper/cli.py | 34 ++++++++++++- diamond_data_scraper/scraper.py | 86 ++++++++++++++++++++++++++++++-- 3 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 argparse.ArgumentParser: choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Logging verbosity (default: INFO).", ) + parser.add_argument( + "--league", + choices=["AL", "NL", "BOTH"], + default=None, + help="Which league to scrape: AL, NL, or BOTH. If omitted and prompting is enabled, you'll be asked.", + ) return parser @@ -61,6 +67,31 @@ def parse_limit_with_optional_prompt(*, limit: Optional[int], prompt: bool) -> O return None +def parse_league_with_optional_prompt(*, league: Optional[str], prompt: bool) -> str: + """ + Returns one of: 'AL', 'NL', 'BOTH' + """ + if league is not None: + return league + + if not prompt: + return "BOTH" + + try: + raw = input("Which league to scrape? [AL/NL/BOTH] (press Enter for BOTH): ").strip().upper() + except EOFError: + return "BOTH" + + if raw in {"", "BOTH", "B"}: + return "BOTH" + if raw in {"AL", "A"}: + return "AL" + if raw in {"NL", "N"}: + return "NL" + + return "BOTH" + + def main(argv: Optional[Sequence[str]] = None) -> int: parser = build_parser() args = parser.parse_args(argv) @@ -71,8 +102,9 @@ def main(argv: Optional[Sequence[str]] = None) -> int: ) limit = parse_limit_with_optional_prompt(limit=args.limit, prompt=not args.no_prompt) + league = parse_league_with_optional_prompt(league=args.league, prompt=not args.no_prompt) scraper = Scraper(headless=args.headless, profile_dir=args.profile_dir) - scraper.scrape(menu_url=YEAR_MENU_URL, limit_years=limit, out_dir=args.out_dir) + scraper.scrape(menu_url=YEAR_MENU_URL, limit_years=limit, out_dir=args.out_dir, league=league) return 0 diff --git a/diamond_data_scraper/scraper.py b/diamond_data_scraper/scraper.py index a0cf073..bba022f 100644 --- a/diamond_data_scraper/scraper.py +++ b/diamond_data_scraper/scraper.py @@ -66,6 +66,7 @@ def scrape( menu_url: str, limit_years: Optional[int] = None, out_dir: str = ".", + league: str = "BOTH", ) -> None: """ Orchestrate the end-to-end scrape and write CSV outputs. @@ -73,10 +74,11 @@ def scrape( - **menu_url**: Year-menu URL to start from. - **limit_years**: If provided, only scrape the first N yearly links. - **out_dir**: Directory where CSVs are written. + - **league**: Which league to scrape: 'AL', 'NL', or 'BOTH' (default). 
""" try: self.logger.info("Scrape started") - links = self.get_year_links(menu_url) + links = self.get_year_links(menu_url, league=league) if limit_years is not None: if limit_years <= 0: @@ -109,13 +111,18 @@ def scrape( self.logger.info("Scrape finished successfully") # ---------- Navigation ---------- - def get_year_links(self, menu_url: str) -> List[str]: + def get_year_links(self, menu_url: str, *, league: str = "BOTH") -> List[str]: """ Load the year-menu page and return yearly AL/NL links. Filter: - keep all National League years - keep American League only for years >= 1901 + + league: + - 'AL': only American League + - 'NL': only National League + - 'BOTH': both leagues """ self.logger.info("Loading year menu: %s", menu_url) self.driver.get(menu_url) @@ -125,6 +132,12 @@ def get_year_links(self, menu_url: str) -> List[str]: "table.ba-sub > tbody > tr > td.datacolBox > a", ) + want: Optional[str] = None + if league == "AL": + want = "a" + elif league == "NL": + want = "n" + links: List[str] = [] for a in anchors: href = a.get_attribute("href") or "" @@ -135,6 +148,9 @@ def get_year_links(self, menu_url: str) -> List[str]: year = int(m.group("year")) league_code = m.group("league_code") + if want is not None and league_code != want: + continue + if league_code == "a" and year < 1901: continue @@ -420,9 +436,71 @@ def convert_stats_to_df( for items in data.get("Pitching Statistics", []): self.add_to_table(pitch_table, items, year, league) for items in data.get("Standings", []): - self.add_to_table(standing_table, items, year, league) + self.add_to_table(standing_table, self.normalize_standings_row(items), year, league) + + standing_df = pd.DataFrame(standing_table) + standing_df = self.reorder_standing_columns(standing_df) + + return pd.DataFrame(hit_table), pd.DataFrame(pitch_table), standing_df + + def normalize_standings_row(self, items: Dict[str, Any]) -> Dict[str, Any]: + """ + Normalize known Baseball Almanac standings header variants to a canonical schema. + + Some years use headers like: + - "Team [Click for roster]" instead of "Team | Roster" + - "Wins"/"Losses" instead of "W"/"L" + + We normalize to: + Team, Roster, W, L, WP, GB, ... + """ + if not items: + return items + + out = dict(items) + + # Team header variants + if "Team [Click for roster]" in out and "Team" not in out: + out["Team"] = out.pop("Team [Click for roster]") + + # Older layouts sometimes use a combined "Team | Roster" header. + # We treat this as "Team" and add an empty "Roster" column to keep schema consistent. + if "Team | Roster" in out: + value = out.pop("Team | Roster") + # Only overwrite Team if it doesn't already exist. + out.setdefault("Team", value) + out.setdefault("Roster", "") + + # W/L header variants + if "Wins" in out and "W" not in out: + out["W"] = out.pop("Wins") + if "Losses" in out and "L" not in out: + out["L"] = out.pop("Losses") + + return out + + def reorder_standing_columns(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Enforce a consistent column order for standings and drop unused variants. + + Target order (when present): + Team, W, L, WP, GB, T, Year, League + Any additional columns are appended after this sequence. + The "Roster" column is dropped (it's empty after normalization). + """ + if df.empty: + return df + + desired_order = ["Team", "W", "L", "WP", "GB", "T", "Year", "League"] + + # Drop Roster if it exists; the user doesn't want it in the output. 
+ if "Roster" in df.columns: + df = df.drop(columns=["Roster"]) + + ordered_cols = [c for c in desired_order if c in df.columns] + remaining_cols = [c for c in df.columns if c not in ordered_cols] - return pd.DataFrame(hit_table), pd.DataFrame(pitch_table), pd.DataFrame(standing_table) + return df[ordered_cols + remaining_cols] def add_to_table(self, table: List[Dict[str, Any]], items: Dict[str, Any], year: int, league: str) -> None: """Append a single stats row into an output table, adding Year/League context columns."""