bunniebytes · sloflo · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026 · Jan 29, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+# Outputs
+*.csv
+
+# Selenium Chrome profile/cache
+selenium_profile/
+
+# Python cache
+__pycache__/
+*.py[cod]
diff --git a/README.md b/README.md
@@ -1,2 +1,6 @@
 # Diamond_Data_Scraper
 Final Capstone project for Code The Dream Python 100 (Python Essentials)
+
+
+```
+python scraper_logic.py --limit 3
+```
+`--limit` is optional.
diff --git a/diamond_data_scraper/__init__.py b/diamond_data_scraper/__init__.py
@@ -0,0 +1,2 @@
+"""Diamond Data Scraper package."""
+
diff --git a/diamond_data_scraper/cli.py b/diamond_data_scraper/cli.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import argparse
+import logging
+from typing import Optional, Sequence
+
+from .constants import YEAR_MENU_URL
+from .scraper import Scraper
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the scraper command line.
+
+    Options are registered in the order listed below, which is also the
+    order they appear in ``--help`` output.
+
+    Returns:
+        A parser exposing ``--limit``, ``--no-prompt``,
+        ``--headless``/``--no-headless``, ``--profile-dir``, ``--out-dir``,
+        ``--log-level`` and ``--league``.
+    """
+    # One (flag, add_argument keyword arguments) pair per option.
+    option_specs = (
+        (
+            "--limit",
+            {
+                "type": int,
+                "default": None,
+                "help": "Limit to the first N yearly links (useful for testing). Example: --limit 5",
+            },
+        ),
+        (
+            "--no-prompt",
+            {
+                "action": "store_true",
+                "help": "Do not prompt for a limit if --limit is not provided.",
+            },
+        ),
+        (
+            "--headless",
+            {
+                # BooleanOptionalAction also registers the --no-headless form.
+                "action": argparse.BooleanOptionalAction,
+                "default": True,
+                "help": "Run Chrome headless (default: true).",
+            },
+        ),
+        (
+            "--profile-dir",
+            {
+                "default": "selenium_profile",
+                "help": "Chrome user-data directory (default: selenium_profile).",
+            },
+        ),
+        (
+            "--out-dir",
+            {
+                "default": ".",
+                "help": "Directory to write CSV outputs (default: current directory).",
+            },
+        ),
+        (
+            "--log-level",
+            {
+                "default": "INFO",
+                "choices": ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+                "help": "Logging verbosity (default: INFO).",
+            },
+        ),
+        (
+            "--league",
+            {
+                "choices": ["AL", "NL", "BOTH"],
+                "default": None,
+                "help": "Which league to scrape: AL, NL, or BOTH. If omitted and prompting is enabled, you'll be asked.",
+            },
+        ),
+    )
+
+    cli_parser = argparse.ArgumentParser(description="Scrape Baseball Almanac yearly stats.")
+    for flag, settings in option_specs:
+        cli_parser.add_argument(flag, **settings)
+    return cli_parser
+
+
+def parse_limit_with_optional_prompt(*, limit: Optional[int], prompt: bool) -> Optional[int]:
+    """Resolve the year limit, asking on stdin when it was not supplied.
+
+    Args:
+        limit: Value given on the command line, or None when omitted.
+        prompt: Whether the user may be asked interactively.
+
+    Returns:
+        ``limit`` unchanged when it is not None. Otherwise the integer the
+        user typed, or None when prompting is disabled, the answer is
+        blank, stdin is closed (EOF), or the answer is not an integer.
+    """
+    if limit is not None:
+        return limit
+    if not prompt:
+        return None
+
+    try:
+        raw = input("How many years to scrape? (press Enter for all): ").strip()
+    except EOFError:
+        # No answer can be read (stdin closed): behave like a blank answer.
+        return None
+
+    if not raw:
+        return None
+
+    try:
+        return int(raw)
+    except ValueError:
+        # Keep the lenient fallback, but say so: otherwise a typo silently
+        # becomes the same "all" choice as pressing Enter.
+        logging.getLogger(__name__).warning(
+            "Ignoring invalid year limit %r; no limit will be applied.", raw
+        )
+        return None
+
+
+def parse_league_with_optional_prompt(*, league: Optional[str], prompt: bool) -> str:
+    """Resolve the league selection, asking on stdin when it was not supplied.
+
+    Args:
+        league: Value given on the command line, or None when omitted. A
+            non-None value is returned as-is without validation.
+        prompt: Whether the user may be asked interactively.
+
+    Returns:
+        ``league`` when it is not None; otherwise 'AL', 'NL' or 'BOTH'.
+        'BOTH' is used when prompting is disabled, stdin is closed (EOF),
+        the answer is blank, or the answer is not recognized.
+    """
+    if league is not None:
+        return league
+
+    if not prompt:
+        return "BOTH"
+
+    try:
+        raw = input("Which league to scrape? [AL/NL/BOTH] (press Enter for BOTH): ").strip().upper()
+    except EOFError:
+        return "BOTH"
+
+    # Accepted spellings (already upper-cased) mapped to the canonical value.
+    aliases = {
+        "": "BOTH",
+        "BOTH": "BOTH",
+        "B": "BOTH",
+        "AL": "AL",
+        "A": "AL",
+        "NL": "NL",
+        "N": "NL",
+    }
+    choice = aliases.get(raw)
+    if choice is None:
+        # Keep the lenient fallback, but make it visible instead of silent.
+        logging.getLogger(__name__).warning(
+            "Unrecognized league %r; defaulting to BOTH.", raw
+        )
+        return "BOTH"
+    return choice
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Command-line entry point: parse options, set up logging, run the scraper.
+
+    Args:
+        argv: Argument list handed to ``parse_args``; None lets argparse
+            read ``sys.argv``.
+
+    Returns:
+        Always 0 once ``Scraper.scrape`` has returned.
+    """
+    args = build_parser().parse_args(argv)
+
+    # Logging is configured before any prompting or scraping happens.
+    log_level = getattr(logging, args.log_level)
+    logging.basicConfig(level=log_level, format="%(asctime)s | %(levelname)s | %(message)s")
+
+    # --no-prompt disables both interactive questions.
+    may_prompt = not args.no_prompt
+    limit = parse_limit_with_optional_prompt(limit=args.limit, prompt=may_prompt)
+    league = parse_league_with_optional_prompt(league=args.league, prompt=may_prompt)
+
+    scraper = Scraper(profile_dir=args.profile_dir, headless=args.headless)
+    scraper.scrape(
+        menu_url=YEAR_MENU_URL,
+        limit_years=limit,
+        out_dir=args.out_dir,
+        league=league,
+    )
+    return 0
+
diff --git a/diamond_data_scraper/constants.py b/diamond_data_scraper/constants.py
@@ -0,0 +1,13 @@
+import re
+
+# Baseball Almanac year-menu page; the CLI hands this to the scraper as its
+# starting URL.
+YEAR_MENU_URL = "https://www.baseball-almanac.com/yearmenu.shtml"
+
+# Matches the year-menu link format like ".../yearly/yr1970n.shtml" or ".../yearly/yr1934a.shtml".
+# Anchored at the end of the string only. Named groups: "year" (four digits)
+# and "league_code" (lowercase "a" or "n" -- presumably American / National;
+# confirm against the code that consumes it).
+YEARLY_LINK_RE = re.compile(r"/yearly/yr(?P<year>\d{4})(?P<league_code>[an])\.shtml$")
+
+# Matches the H1 header content on year pages.
+# Expects "<year> AMERICAN LEAGUE" or "<year> NATIONAL LEAGUE" in uppercase,
+# with exactly one whitespace character between the words. Named groups:
+# "year" and "league".
+YEAR_LEAGUE_HEADER_RE = re.compile(r"(?P<year>\d{4})\s(?P<league>AMERICAN|NATIONAL)\sLEAGUE")
+
+# Extracts canonical stat table keys from header text.
+# Group 1 is one of "Hitting Statistics", "Pitching Statistics" or
+# "Standings"; the match is case-sensitive and bounded by word boundaries.
+STAT_TABLE_KEY_RE = re.compile(r"\b(Hitting Statistics|Pitching Statistics|Standings)\b")
+