From 81dc06f89f3fcc69b6d9a8bd76a6355461476da4 Mon Sep 17 00:00:00 2001
From: Bob Nisco
Date: Tue, 13 Jan 2026 14:00:01 -0800
Subject: [PATCH] [feat] Add component directory scripts

---
 .gitattributes                                |   1 +
 directory/ranking_config.json                 |  10 +
 directory/scripts/__init__.py                 |   1 +
 directory/scripts/_enrichers/__init__.py      |  13 +
 directory/scripts/_enrichers/github.py        | 216 +++++++
 directory/scripts/_enrichers/pypi.py          | 143 +++++
 directory/scripts/_enrichers/pypistats.py     | 125 ++++
 directory/scripts/_utils/__init__.py          |   4 +
 directory/scripts/_utils/enrich.py            |  34 ++
 directory/scripts/_utils/enrichment_engine.py | 248 ++++++++
 directory/scripts/_utils/github.py            |  28 +
 directory/scripts/_utils/http.py              | 146 +++++
 directory/scripts/_utils/image_url_policy.py  |  31 +
 directory/scripts/_utils/io.py                |  18 +
 directory/scripts/_utils/metrics.py           |  22 +
 directory/scripts/_utils/pypi_helpers.py      |  35 ++
 directory/scripts/_utils/time.py              |  34 ++
 directory/scripts/build_catalog.py            | 455 ++++++++++++++
 directory/scripts/compute_ranking.py          | 284 +++++++++
 directory/scripts/enrich.py                   | 244 ++++++++
 directory/scripts/enrich_images.py            | 501 ++++++++++++++++
 directory/scripts/run_pipeline.py             | 273 +++++++++
 directory/scripts/validate.py                 | 554 ++++++++++++++++++
 requirements.txt                              |   2 +
 24 files changed, 3422 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 directory/ranking_config.json
 create mode 100644 directory/scripts/__init__.py
 create mode 100644 directory/scripts/_enrichers/__init__.py
 create mode 100644 directory/scripts/_enrichers/github.py
 create mode 100644 directory/scripts/_enrichers/pypi.py
 create mode 100644 directory/scripts/_enrichers/pypistats.py
 create mode 100644 directory/scripts/_utils/__init__.py
 create mode 100644 directory/scripts/_utils/enrich.py
 create mode 100644 directory/scripts/_utils/enrichment_engine.py
 create mode 100644 directory/scripts/_utils/github.py
 create mode 100644 directory/scripts/_utils/http.py
 create mode 100644 directory/scripts/_utils/image_url_policy.py
 create mode 100644 directory/scripts/_utils/io.py
 create mode 100644 directory/scripts/_utils/metrics.py
 create mode 100644 directory/scripts/_utils/pypi_helpers.py
 create mode 100644 directory/scripts/_utils/time.py
 create mode 100644 directory/scripts/build_catalog.py
 create mode 100644 directory/scripts/compute_ranking.py
 create mode 100644 directory/scripts/enrich.py
 create mode 100644 directory/scripts/enrich_images.py
 create mode 100644 directory/scripts/run_pipeline.py
 create mode 100644 directory/scripts/validate.py
 create mode 100644 requirements.txt

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..a0fea2f6
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+directory/compiled/** linguist-generated=true
diff --git a/directory/ranking_config.json b/directory/ranking_config.json
new file mode 100644
index 00000000..b67b0c49
--- /dev/null
+++ b/directory/ranking_config.json
@@ -0,0 +1,10 @@
+{
+  "schemaVersion": 1,
+  "halfLifeDays": 90.0,
+  "weights": {
+    "stars": 1.0,
+    "recency": 2.0,
+    "contributors": 0.5,
+    "downloads": 0.35
+  }
+}
diff --git a/directory/scripts/__init__.py b/directory/scripts/__init__.py
new file mode 100644
index 00000000..9d48db4f
--- /dev/null
+++ b/directory/scripts/__init__.py
@@ -0,0 +1 @@
+from __future__ import annotations
diff --git a/directory/scripts/_enrichers/__init__.py b/directory/scripts/_enrichers/__init__.py
new file mode 100644
index 00000000..15e02349
--- /dev/null
+++ b/directory/scripts/_enrichers/__init__.py
@@ -0,0 +1,13
@@ +from __future__ import annotations + +from .github import GitHubEnricher # type: ignore[import-not-found] +from .pypi import PyPiEnricher # type: ignore[import-not-found] +from .pypistats import PyPiStatsEnricher # type: ignore[import-not-found] + + +def get_default_enrichers(*, github_token_env: str = "GH_TOKEN") -> list: + return [ + GitHubEnricher(token_env=github_token_env), + PyPiEnricher(), + PyPiStatsEnricher(), + ] diff --git a/directory/scripts/_enrichers/github.py b/directory/scripts/_enrichers/github.py new file mode 100644 index 00000000..a56f7462 --- /dev/null +++ b/directory/scripts/_enrichers/github.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +import os +import re +from dataclasses import dataclass +from typing import Any +from urllib.parse import parse_qs, urlparse + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.github import parse_owner_repo +from _utils.http import RetryConfig, fetch_json +from _utils.time import utc_now_iso + +GITHUB_API_BASE = "https://api.github.com" + + +@dataclass(frozen=True) +class GitHubResult: + owner: str + repo: str + stars: int | None + forks: int | None + contributors_count: int | None + open_issues: int | None + pushed_at: str | None + + +def _github_repo_api_url(owner: str, repo: str) -> str: + return f"{GITHUB_API_BASE}/repos/{owner}/{repo}" + + +def _github_contributors_api_url(owner: str, repo: str) -> str: + return f"{GITHUB_API_BASE}/repos/{owner}/{repo}/contributors?per_page=1" + + +_LINK_LAST_RE = re.compile(r'<([^>]+)>;\s*rel="last"') + + +def _parse_last_page_from_link_header(link: str | None) -> int | None: + if not isinstance(link, str) or not link.strip(): + return None + m = _LINK_LAST_RE.search(link) + if not m: + return None + try: + last_url = m.group(1) + parsed = urlparse(last_url) + qs = parse_qs(parsed.query) + page_vals = qs.get("page") + if not page_vals: + return None + page = int(page_vals[0]) + return page if page >= 0 else None + except Exception: + return None + + +def _get_token(token_env: str) -> str | None: + token = os.environ.get(token_env) + if token: + return token.strip() or None + for k in ("GH_TOKEN", "GH_API_TOKEN", "GITHUB_TOKEN"): + token = os.environ.get(k) + if token: + return token.strip() or None + return None + + +class GitHubEnricher: + name = "github" + bucket = "github" + + def __init__(self, *, token_env: str = "GH_TOKEN") -> None: + self._token_env = token_env + self._token = _get_token(token_env) + self._retry_cfg = RetryConfig(retry_statuses=(403, 429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> tuple[str, str] | None: + gh_url = comp.get("gitHubUrl") + if not isinstance(gh_url, str) or not gh_url.strip(): + return None + try: + owner, repo = parse_owner_repo(gh_url) + except Exception: + return None + return (owner.lower(), repo.lower()) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") + gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None + existing_fetched_at = ( + gh_metrics.get("fetchedAt") if isinstance(gh_metrics, dict) else None + ) + stale = gh_metrics.get("isStale") if isinstance(gh_metrics, dict) else None + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def _headers(self) -> 
dict[str, str]: + headers = { + "Accept": "application/vnd.github+json", + "User-Agent": "component-gallery-enrich-github", + "X-GitHub-Api-Version": "2022-11-28", + } + if self._token: + headers["Authorization"] = f"Bearer {self._token}" + return headers + + def _fetch_contributors_count( + self, *, ctx, owner: str, repo: str + ) -> tuple[int | None, int, int | None, str | None]: + url = _github_contributors_api_url(owner, repo) + r = ctx.request_json( + url=url, + headers=self._headers(), + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, list): + return None, r.attempts, r.status, r.error + link = None + if isinstance(r.headers, dict): + link = r.headers.get("Link") or r.headers.get("link") + last_page = _parse_last_page_from_link_header(link) + if isinstance(last_page, int): + return last_page, r.attempts, r.status, None + return (1 if len(r.data) >= 1 else 0), r.attempts, r.status, None + + def fetch(self, key: tuple[str, str], ctx) -> FetchResult: + owner, repo = key + url = _github_repo_api_url(owner, repo) + r = ctx.request_json( + url=url, + headers=self._headers(), + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + attempts = int(r.attempts) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=attempts, + status=r.status, + ) + + data = r.data + stars = data.get("stargazers_count") + forks = data.get("forks_count") + open_issues = data.get("open_issues_count") + pushed_at = data.get("pushed_at") + + contributors_count, contrib_attempts, status, err = ( + self._fetch_contributors_count(ctx=ctx, owner=owner, repo=repo) + ) + attempts += int(contrib_attempts) + if err: + return FetchResult( + ok=False, + data=None, + error=err, + attempts=attempts, + status=status, + ) + + result = GitHubResult( + owner=owner, + repo=repo, + stars=int(stars) if isinstance(stars, int) else None, + forks=int(forks) if isinstance(forks, int) else None, + contributors_count=( + int(contributors_count) + if isinstance(contributors_count, int) and contributors_count >= 0 + else None + ), + open_issues=int(open_issues) if isinstance(open_issues, int) else None, + pushed_at=str(pushed_at) if isinstance(pushed_at, str) else None, + ) + return FetchResult( + ok=True, data=result, error=None, attempts=attempts, status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: GitHubResult, fetched_at: str + ) -> Patch: + metrics = comp.get("metrics") + gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None + prev_stars = gh_metrics.get("stars") if isinstance(gh_metrics, dict) else None + + updates: dict[str, Any] = {} + if isinstance(result.stars, int): + updates["stars"] = result.stars + if isinstance(result.forks, int): + updates["forks"] = result.forks + if isinstance(result.contributors_count, int): + updates["contributorsCount"] = result.contributors_count + if isinstance(result.open_issues, int): + updates["openIssues"] = result.open_issues + if isinstance(result.pushed_at, str): + updates["lastPushAt"] = result.pushed_at + updates["fetchedAt"] = fetched_at or utc_now_iso() + updates["isStale"] = False + + changed = isinstance(result.stars, int) and prev_stars != result.stars + return Patch(bucket=self.bucket, updates=updates, changed=changed) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git 
a/directory/scripts/_enrichers/pypi.py b/directory/scripts/_enrichers/pypi.py new file mode 100644 index 00000000..1f2e005c --- /dev/null +++ b/directory/scripts/_enrichers/pypi.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.http import RetryConfig, fetch_json +from _utils.pypi_helpers import infer_pypi_project_from_piplink +from _utils.time import utc_now_iso + +PYPI_BASE = "https://pypi.org/pypi" + + +@dataclass(frozen=True) +class PyPiResult: + project: str + latest_version: str | None + latest_release_at: str | None + + +def _get_project_for_component(comp: dict[str, Any]) -> str | None: + p = comp.get("pypi") + if isinstance(p, str) and p.strip(): + return p.strip() + return infer_pypi_project_from_piplink(comp.get("pipLink")) + + +def _pypi_api_url(project: str) -> str: + return f"{PYPI_BASE}/{project}/json" + + +def _max_upload_time_iso(release_files: Any) -> str | None: + if not isinstance(release_files, list): + return None + times: list[str] = [] + for f in release_files: + if not isinstance(f, dict): + continue + t = f.get("upload_time_iso_8601") or f.get("upload_time") + if isinstance(t, str) and t: + times.append(t) + return max(times) if times else None + + +class PyPiEnricher: + name = "pypi" + bucket = "pypi" + + def __init__(self) -> None: + self._retry_cfg = RetryConfig(retry_statuses=(429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> str | None: + return _get_project_for_component(comp) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") + pypi_metrics = metrics.get("pypi") if isinstance(metrics, dict) else None + existing_fetched_at = ( + pypi_metrics.get("fetchedAt") if isinstance(pypi_metrics, dict) else None + ) + stale = pypi_metrics.get("isStale") if isinstance(pypi_metrics, dict) else None + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def fetch(self, key: str, ctx) -> FetchResult: + url = _pypi_api_url(key) + headers = { + "Accept": "application/json", + "User-Agent": "component-gallery-enrich-pypi", + } + r = ctx.request_json( + url=url, + headers=headers, + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=int(r.attempts), + status=r.status, + ) + data = r.data + info = data.get("info") + releases = data.get("releases") + if not isinstance(info, dict) or not isinstance(releases, dict): + return FetchResult( + ok=False, + data=None, + error="Missing info/releases.", + attempts=int(r.attempts), + status=r.status, + ) + latest_version = info.get("version") + latest_version = ( + str(latest_version) + if isinstance(latest_version, str) and latest_version + else None + ) + + latest_release_at: str | None = None + if latest_version and latest_version in releases: + latest_release_at = _max_upload_time_iso(releases.get(latest_version)) + if latest_release_at is None: + best: str | None = None + for _, files in releases.items(): + t = _max_upload_time_iso(files) + if t and (best is None or t > best): + best = t + latest_release_at = best + + result 
= PyPiResult( + project=key, + latest_version=latest_version, + latest_release_at=latest_release_at, + ) + return FetchResult( + ok=True, data=result, error=None, attempts=int(r.attempts), status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: PyPiResult, fetched_at: str + ) -> Patch: + updates = { + "latestVersion": result.latest_version, + "latestReleaseAt": result.latest_release_at, + "fetchedAt": fetched_at or utc_now_iso(), + "isStale": False, + } + return Patch(bucket=self.bucket, updates=updates, changed=True) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git a/directory/scripts/_enrichers/pypistats.py b/directory/scripts/_enrichers/pypistats.py new file mode 100644 index 00000000..35f526ae --- /dev/null +++ b/directory/scripts/_enrichers/pypistats.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.http import RetryConfig, fetch_json +from _utils.pypi_helpers import infer_pypi_project_from_piplink +from _utils.time import utc_now_iso + +PYPISTATS_BASE = "https://pypistats.org/api/packages" + + +@dataclass(frozen=True) +class PyPiStatsResult: + project: str + last_day: int | None + last_week: int | None + last_month: int | None + + +def _get_project_for_component(comp: dict[str, Any]) -> str | None: + p = comp.get("pypi") + if isinstance(p, str) and p.strip(): + return p.strip() + return infer_pypi_project_from_piplink(comp.get("pipLink")) + + +def _pypistats_recent_url(project: str) -> str: + return f"{PYPISTATS_BASE}/{project}/recent" + + +class PyPiStatsEnricher: + name = "pypistats" + bucket = "pypistats" + + def __init__(self) -> None: + self._retry_cfg = RetryConfig(retry_statuses=(429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> str | None: + return _get_project_for_component(comp) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") if isinstance(comp.get("metrics"), dict) else None + pypistats_metrics = ( + metrics.get("pypistats") if isinstance(metrics, dict) else None + ) + existing_fetched_at = ( + pypistats_metrics.get("fetchedAt") + if isinstance(pypistats_metrics, dict) + else None + ) + stale = ( + pypistats_metrics.get("isStale") + if isinstance(pypistats_metrics, dict) + else None + ) + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def fetch(self, key: str, ctx) -> FetchResult: + url = _pypistats_recent_url(key) + headers = { + "Accept": "application/json", + "User-Agent": "component-gallery-enrich-pypistats", + } + r = ctx.request_json( + url=url, + headers=headers, + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=int(r.attempts), + status=r.status, + ) + data = r.data.get("data") if isinstance(r.data, dict) else None + if not isinstance(data, dict): + return FetchResult( + ok=False, + data=None, + error="Missing data payload.", + attempts=int(r.attempts), + status=r.status, + ) + + def _as_int(x: Any) -> 
int | None: + return int(x) if isinstance(x, int) and x >= 0 else None + + result = PyPiStatsResult( + project=key, + last_day=_as_int(data.get("last_day")), + last_week=_as_int(data.get("last_week")), + last_month=_as_int(data.get("last_month")), + ) + return FetchResult( + ok=True, data=result, error=None, attempts=int(r.attempts), status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: PyPiStatsResult, fetched_at: str + ) -> Patch: + updates = { + "lastDay": result.last_day, + "lastWeek": result.last_week, + "lastMonth": result.last_month, + "fetchedAt": fetched_at or utc_now_iso(), + "isStale": False, + } + return Patch(bucket=self.bucket, updates=updates, changed=True) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git a/directory/scripts/_utils/__init__.py b/directory/scripts/_utils/__init__.py new file mode 100644 index 00000000..36a57ceb --- /dev/null +++ b/directory/scripts/_utils/__init__.py @@ -0,0 +1,4 @@ +"""Shared utilities for the component-gallery scripts. + +These helpers intentionally avoid third-party dependencies. +""" diff --git a/directory/scripts/_utils/enrich.py b/directory/scripts/_utils/enrich.py new file mode 100644 index 00000000..e1f911ef --- /dev/null +++ b/directory/scripts/_utils/enrich.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +from .time import parse_iso8601 + + +def should_refetch( + *, + fetched_at: str | None, + is_stale: bool | None, + refresh_older_than_hours: float | None, +) -> bool: + """Return True if we should refetch a metric bucket. + + Rules: + - If refresh_older_than_hours is None or <= 0: always refetch + - If we have no parseable fetched_at: refetch + - If is_stale is True: refetch + - Otherwise, refetch only when fetched_at is older than refresh_older_than_hours + """ + if refresh_older_than_hours is None or refresh_older_than_hours <= 0: + return True + if is_stale is True: + return True + + dt = parse_iso8601(fetched_at) + if not dt: + return True + + age_h = (datetime.now(timezone.utc) - dt).total_seconds() / 3600.0 + return age_h >= refresh_older_than_hours + + diff --git a/directory/scripts/_utils/enrichment_engine.py b/directory/scripts/_utils/enrichment_engine.py new file mode 100644 index 00000000..5508f6d5 --- /dev/null +++ b/directory/scripts/_utils/enrichment_engine.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import threading +import time +from concurrent.futures import Future, ThreadPoolExecutor +from dataclasses import dataclass +from typing import Any, Callable, Iterable, Protocol, cast + +import requests + +from .metrics import Bucket, ensure_bucket + + +class Enricher(Protocol): + name: str + bucket: str + + def key_for_component(self, comp: dict[str, Any]) -> Any | None: ... + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: ... + + def fetch(self, key: Any, ctx: "EnricherContext") -> "FetchResult": ... + + def patch_success( + self, comp: dict[str, Any], result: Any, fetched_at: str + ) -> "Patch": ... + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> "Patch": ... 
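# Illustrative sketch only: a minimal, hypothetical enricher that satisfies the
# Enricher protocol above, using the FetchResult and Patch dataclasses defined
# just below. The "noop" service name and the "exampleCount" metric key are
# invented for illustration; the real implementations are GitHubEnricher,
# PyPiEnricher, and PyPiStatsEnricher under _enrichers/.
class NoopEnricher:
    name = "noop"
    bucket = "pypi"  # the engine writes Patch.updates into comp["metrics"][bucket]

    def key_for_component(self, comp):
        # Deduplication key for in-flight requests; None means "skip, no key".
        return comp.get("gitHubUrl")

    def needs_fetch(self, comp, refresh_older_than_hours):
        # Always refetch in this sketch; real enrichers call should_refetch().
        return True

    def fetch(self, key, ctx):
        # A real enricher would call ctx.request_json(...) here.
        return FetchResult(ok=True, data={"exampleCount": 1}, error=None, attempts=1, status=200)

    def patch_success(self, comp, result, fetched_at):
        return Patch(bucket=self.bucket, updates={**result, "fetchedAt": fetched_at, "isStale": False})

    def patch_failure(self, comp, error):
        return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False)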
+ + +@dataclass(frozen=True) +class FetchResult: + ok: bool + data: Any | None + error: str | None + attempts: int + status: int | None = None + + +@dataclass(frozen=True) +class Patch: + bucket: str + updates: dict[str, Any] + changed: bool = True + + +@dataclass(frozen=True) +class Failure: + key: str + status: int | None + error: str | None + + +@dataclass +class ServiceStats: + processed: int = 0 + requests: int = 0 + ok: int = 0 + failed: int = 0 + updated: int = 0 + skipped_fresh: int = 0 + skipped_no_key: int = 0 + cache_hits: int = 0 + + +class ServiceLimiter: + def __init__(self, min_interval_s: float) -> None: + self._min_interval_s = max(0.0, float(min_interval_s)) + self._lock = threading.Lock() + self._next_allowed = 0.0 + + def acquire(self) -> None: + if self._min_interval_s <= 0: + return + wait_s = 0.0 + with self._lock: + now = time.monotonic() + if now < self._next_allowed: + wait_s = self._next_allowed - now + self._next_allowed = max(now, self._next_allowed) + self._min_interval_s + if wait_s > 0: + time.sleep(wait_s) + + +class ThreadLocalSession: + def __init__(self) -> None: + self._tls = threading.local() + + def get(self) -> requests.Session: + sess = getattr(self._tls, "session", None) + if sess is None: + sess = requests.Session() + self._tls.session = sess + return sess + + +@dataclass +class EnricherContext: + name: str + limiter: ServiceLimiter + session_getter: Callable[[], requests.Session] + timeout_s: float + + def request_json( + self, + *, + url: str, + headers: dict[str, str] | None, + fetcher: Callable[..., Any], + retry_cfg: Any, + ) -> Any: + self.limiter.acquire() + session = self.session_getter() + return fetcher( + session=session, + url=url, + headers=headers, + timeout_s=self.timeout_s, + retry=retry_cfg, + ) + + +@dataclass +class EngineRunResult: + stats: dict[str, ServiceStats] + failures: dict[str, list[Failure]] + + +def run_enrichment_engine( + *, + components: list[dict[str, Any]], + enrichers: Iterable[Enricher], + refresh_older_than_hours: float | None, + timeout_s: float, + sleep_by_service: dict[str, float], + workers: int, + run_fetched_at: str, + progress_every: int | None = None, +) -> EngineRunResult: + enricher_list = list(enrichers) + stats: dict[str, ServiceStats] = {e.name: ServiceStats() for e in enricher_list} + failures: dict[str, list[Failure]] = {e.name: [] for e in enricher_list} + + inflight: dict[tuple[str, Any], tuple[Enricher, Any, Future[FetchResult]]] = {} + future_meta: dict[Future[FetchResult], tuple[str, Any]] = {} + comp_tasks: list[list[tuple[Enricher, Future[FetchResult]]]] = [ + [] for _ in range(len(components)) + ] + + limiter_by_service = { + e.name: ServiceLimiter(sleep_by_service.get(e.name, 0.0)) for e in enricher_list + } + session_by_service = {e.name: ThreadLocalSession() for e in enricher_list} + + def submit_fetch(enricher: Enricher, key: Any) -> Future[FetchResult]: + limiter = limiter_by_service[enricher.name] + session_factory = session_by_service[enricher.name].get + + def _run() -> FetchResult: + ctx = EnricherContext( + name=enricher.name, + limiter=limiter, + session_getter=session_factory, + timeout_s=timeout_s, + ) + return enricher.fetch(key, ctx) + + return executor.submit(_run) + + with ThreadPoolExecutor(max_workers=max(1, int(workers))) as executor: + for idx, comp in enumerate(components): + if not isinstance(comp, dict): + continue + for enricher in enricher_list: + stats[enricher.name].processed += 1 + if not enricher.needs_fetch(comp, refresh_older_than_hours): + 
stats[enricher.name].skipped_fresh += 1 + continue + key = enricher.key_for_component(comp) + if key is None: + stats[enricher.name].skipped_no_key += 1 + continue + inflight_key = (enricher.name, key) + if inflight_key in inflight: + stats[enricher.name].cache_hits += 1 + fut = inflight[inflight_key][2] + else: + fut = submit_fetch(enricher, key) + inflight[inflight_key] = (enricher, key, fut) + future_meta[fut] = (enricher.name, key) + comp_tasks[idx].append((enricher, fut)) + + # Apply patches deterministically by component index, resolving futures lazily + # so progress can be reported as components complete. + result_cache: dict[Future[FetchResult], FetchResult] = {} + counted: set[Future[FetchResult]] = set() + for idx, comp in enumerate(components): + if not isinstance(comp, dict): + continue + for enricher, fut in comp_tasks[idx]: + if fut in result_cache: + res = result_cache[fut] + else: + res = fut.result() + result_cache[fut] = res + if fut not in counted: + counted.add(fut) + meta = future_meta.get(fut) + if meta is not None: + service_name, key = meta + else: + service_name, key = enricher.name, "?" + stats[service_name].requests += int(res.attempts) + if res.ok: + stats[service_name].ok += 1 + else: + stats[service_name].failed += 1 + failures[service_name].append( + Failure(key=str(key), status=res.status, error=res.error) + ) + if res.ok: + patch = enricher.patch_success(comp, res.data, run_fetched_at) + else: + patch = enricher.patch_failure(comp, res.error) + bucket = ensure_bucket(comp, cast(Bucket, patch.bucket)) + for k, v in patch.updates.items(): + bucket[k] = v + if patch.changed: + stats[enricher.name].updated += 1 + + if ( + progress_every + and progress_every > 0 + and (idx + 1) % progress_every == 0 + ): + for enricher in enricher_list: + s = stats[enricher.name] + print( + f"[{enricher.name}] requests={s.requests} " + f"ok={s.ok} fail={s.failed} " + f"updated={s.updated} skipped_fresh={s.skipped_fresh} " + f"cache_hits={s.cache_hits} skipped_no_key={s.skipped_no_key}", + flush=True, + ) + + return EngineRunResult(stats=stats, failures=failures) diff --git a/directory/scripts/_utils/github.py b/directory/scripts/_utils/github.py new file mode 100644 index 00000000..30cd870d --- /dev/null +++ b/directory/scripts/_utils/github.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from urllib.parse import urlparse + + +def normalize_github_repo_url(url: str) -> str: + """Canonicalize `https://github.com//` (no trailing slash).""" + parsed = urlparse(url) + if parsed.scheme != "https" or parsed.netloc != "github.com": + raise ValueError(f"Not a GitHub HTTPS URL: {url}") + parts = [p for p in parsed.path.split("/") if p] + if len(parts) < 2: + raise ValueError(f"Not a GitHub repo URL: {url}") + owner, repo = parts[0], parts[1] + return f"https://github.com/{owner}/{repo}" + + +def parse_owner_repo(github_url: str) -> tuple[str, str]: + canonical = normalize_github_repo_url(github_url) + parsed = urlparse(canonical) + parts = [p for p in parsed.path.split("/") if p] + return parts[0], parts[1] + + +def repo_key(github_url: str) -> str: + """Stable identifier `owner/repo` (lowercased).""" + owner, repo = parse_owner_repo(github_url) + return f"{owner.lower()}/{repo.lower()}" diff --git a/directory/scripts/_utils/http.py b/directory/scripts/_utils/http.py new file mode 100644 index 00000000..79778d6c --- /dev/null +++ b/directory/scripts/_utils/http.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import random +import time +from dataclasses import 
dataclass +from typing import Any + +import requests + + +def _maybe_int(s: str | None) -> int | None: + if not isinstance(s, str) or not s.strip(): + return None + try: + return int(s.strip()) + except Exception: + return None + + +def _retry_after_seconds(headers: dict[str, str] | None) -> int | None: + if not headers: + return None + return _maybe_int(headers.get("Retry-After")) + + +@dataclass(frozen=True) +class RetryConfig: + max_attempts: int = 6 + backoff_base_s: float = 0.5 + backoff_cap_s: float = 60.0 + retry_statuses: tuple[int, ...] = (403, 429, 500, 502, 503, 504) + + +@dataclass(frozen=True) +class FetchJsonResult: + ok: bool + status: int | None + data: Any | None + headers: dict[str, str] | None + error: str | None + attempts: int + last_retry_after_s: int | None = None + + +def fetch_json( + *, + session: requests.Session, + url: str, + headers: dict[str, str] | None, + timeout_s: float, + retry: RetryConfig, +) -> FetchJsonResult: + """GET a URL and parse JSON with retry/backoff. + + Retries on retry_statuses and on request-level exceptions. + Honors integer Retry-After when present. + """ + last_retry_after: int | None = None + + for attempt in range(1, max(1, retry.max_attempts) + 1): + try: + resp = session.get(url, headers=headers, timeout=timeout_s) + except requests.RequestException as e: + # Retryable network error. + if attempt >= retry.max_attempts: + return FetchJsonResult( + ok=False, + status=None, + data=None, + headers=None, + error=f"{type(e).__name__}: {e}", + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + wait_s = min(retry.backoff_cap_s, (2**attempt) * retry.backoff_base_s) + wait_s = wait_s + random.random() * 0.25 + time.sleep(wait_s) + continue + + status = int(resp.status_code) + # Success path. + if 200 <= status < 300: + try: + return FetchJsonResult( + ok=True, + status=status, + data=resp.json(), + headers=dict(resp.headers) if resp.headers else None, + error=None, + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + except Exception as e: + return FetchJsonResult( + ok=False, + status=status, + data=None, + headers=dict(resp.headers) if resp.headers else None, + error=f"Invalid JSON payload: {e}", + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + + # Non-success: decide whether to retry. + body = None + try: + body = resp.text + except Exception: + body = None + + if status in retry.retry_statuses and attempt < retry.max_attempts: + ra = _retry_after_seconds(dict(resp.headers) if resp.headers else None) + if isinstance(ra, int): + last_retry_after = ra + wait_s = min(retry.backoff_cap_s, (2**attempt) * retry.backoff_base_s) + if isinstance(ra, int): + wait_s = max(wait_s, float(ra)) + wait_s = wait_s + random.random() * 0.25 + time.sleep(wait_s) + continue + + # Final failure. + msg = f"HTTP {status}" + if body: + msg = f"{msg}: {body[:5000]}" + ra = _retry_after_seconds(dict(resp.headers) if resp.headers else None) + return FetchJsonResult( + ok=False, + status=status, + data=None, + headers=dict(resp.headers) if resp.headers else None, + error=msg + (f" (Retry-After={ra}s)" if isinstance(ra, int) else ""), + attempts=attempt, + last_retry_after_s=ra if isinstance(ra, int) else last_retry_after, + ) + + # Unreachable. 
+    return FetchJsonResult(
+        ok=False,
+        status=None,
+        data=None,
+        headers=None,
+        error="Unknown error.",
+        attempts=retry.max_attempts,
+        last_retry_after_s=last_retry_after,
+    )
diff --git a/directory/scripts/_utils/image_url_policy.py b/directory/scripts/_utils/image_url_policy.py
new file mode 100644
index 00000000..c4c13fcd
--- /dev/null
+++ b/directory/scripts/_utils/image_url_policy.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+# Shared policy constants used by multiple scripts (`validate.py`, `enrich_images.py`).
+
+DISALLOWED_IMAGE_HOSTS = {
+    # GitHub's image proxy URLs are often brittle and not the canonical image source.
+    "camo.githubusercontent.com",
+}
+
+# Keys are compared case-insensitively.
+DISALLOWED_IMAGE_QUERY_KEYS = {
+    # AWS SigV4
+    "x-amz-algorithm",
+    "x-amz-credential",
+    "x-amz-date",
+    "x-amz-expires",
+    "x-amz-signature",
+    "x-amz-signedheaders",
+    # GCS signed URLs
+    "x-goog-algorithm",
+    "x-goog-credential",
+    "x-goog-date",
+    "x-goog-expires",
+    "x-goog-signature",
+    # CloudFront (common)
+    "expires",
+    "signature",
+    "key-pair-id",
+    "policy",
+}
+
diff --git a/directory/scripts/_utils/io.py b/directory/scripts/_utils/io.py
new file mode 100644
index 00000000..899230d6
--- /dev/null
+++ b/directory/scripts/_utils/io.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_json(path: Path) -> Any:
+    with path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def dump_json(path: Path, obj: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        # Use 2-space indentation for human-friendly diffs in GitHub PRs.
+        json.dump(obj, f, indent=2, ensure_ascii=False, sort_keys=True)
+        f.write("\n")
diff --git a/directory/scripts/_utils/metrics.py b/directory/scripts/_utils/metrics.py
new file mode 100644
index 00000000..fb40daa8
--- /dev/null
+++ b/directory/scripts/_utils/metrics.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import Any, Literal
+
+Bucket = Literal["github", "pypi", "pypistats"]
+
+
+def ensure_metrics(comp: dict[str, Any]) -> dict[str, Any]:
+    metrics = comp.get("metrics")
+    if not isinstance(metrics, dict):
+        metrics = {}
+        comp["metrics"] = metrics
+    return metrics
+
+
+def ensure_bucket(comp: dict[str, Any], bucket: Bucket) -> dict[str, Any]:
+    metrics = ensure_metrics(comp)
+    b = metrics.get(bucket)
+    if not isinstance(b, dict):
+        b = {}
+        metrics[bucket] = b
+    return b
diff --git a/directory/scripts/_utils/pypi_helpers.py b/directory/scripts/_utils/pypi_helpers.py
new file mode 100644
index 00000000..d4d6e014
--- /dev/null
+++ b/directory/scripts/_utils/pypi_helpers.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import re
+
+_PIP_INSTALL_RE = re.compile(
+    r"""(?ix)
+    ^\s*
+    pip(?:3)?          # pip / pip3
+    \s+install
+    \s+
+    (?P<spec>\S+)      # first argument to pip install
+    """
+)
+
+
+def infer_pypi_project_from_piplink(pip_link: str | None) -> str | None:
+    """Infer the PyPI project name from a compiled catalog `pipLink` string.
+
+    This is intentionally conservative: it refuses URL/git-based installs and
+    strips common version specifiers and extras.
+ """ + if not isinstance(pip_link, str) or not pip_link.strip(): + return None + m = _PIP_INSTALL_RE.match(pip_link) + if not m: + return None + spec = m.group("spec").strip().strip('"').strip("'") + if not spec: + return None + if "://" in spec or spec.startswith("git+"): + return None + base = spec.split("==", 1)[0].split(">=", 1)[0].split("<=", 1)[0].split("~=", 1)[0] + base = base.split("[", 1)[0] + base = base.strip() + return base or None diff --git a/directory/scripts/_utils/time.py b/directory/scripts/_utils/time.py new file mode 100644 index 00000000..9f385aeb --- /dev/null +++ b/directory/scripts/_utils/time.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from datetime import datetime, timezone + + +def utc_now_iso() -> str: + """UTC now in ISO8601 with Z suffix (e.g. 2025-12-19T00:00:00Z).""" + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + + +def parse_iso8601(dt: str | None) -> datetime | None: + """Parse a subset of ISO8601/RFC3339 strings used by GitHub/PyPI. + + Accepts timestamps like: + - 2025-11-30T12:33:58Z + - 2025-11-23T22:30:23.036058Z + - 2025-11-23T22:30:23+00:00 + + Returns timezone-aware UTC datetimes when possible. + """ + if not isinstance(dt, str) or not dt.strip(): + return None + s = dt.strip() + # `datetime.fromisoformat` doesn't accept "Z" suffix; normalize it. + if s.endswith("Z"): + s = s[:-1] + "+00:00" + try: + parsed = datetime.fromisoformat(s) + except ValueError: + return None + if parsed.tzinfo is None: + # Assume UTC if tzinfo is missing (shouldn't happen with our sources) + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) diff --git a/directory/scripts/build_catalog.py b/directory/scripts/build_catalog.py new file mode 100644 index 00000000..4ba8a7ac --- /dev/null +++ b/directory/scripts/build_catalog.py @@ -0,0 +1,455 @@ +""" +Build the compiled Component Gallery catalog artifact. + +This script compiles per-component submissions in `components/*.json` into a +single legacy-compatible artifact at `compiled/components.json` that the +Streamlit gallery app reads from local disk. + +It also supports carrying forward "last-known-good" computed fields (e.g. stars) +from a previous compiled artifact to avoid regressing metrics when enrichment is +not yet implemented. 
+ +Run from the repo root (recommended): + + python directory/scripts/build_catalog.py + +Common variants: + + # Write somewhere else + python directory/scripts/build_catalog.py --out dist/components.json + + # Skip invalid component JSON files (prints errors and continues) + python directory/scripts/build_catalog.py --skip-invalid + + # Explicitly choose the prior artifact used for carry-forward + python directory/scripts/build_catalog.py --previous compiled/components.json +""" + +from __future__ import annotations + +import argparse +import json +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + +from _utils.github import normalize_github_repo_url, repo_key +from _utils.io import dump_json, load_json +from _utils.time import utc_now_iso + + +@dataclass(frozen=True) +class ComponentBuildError: + file: Path + message: str + json_path: str | None = None + + +def _load_json(path: Path) -> Any: + return load_json(path) + + +def _load_schema(repo_root: Path) -> dict[str, Any]: + schema_path = repo_root / "schemas" / "component.schema.json" + obj = load_json(schema_path) + if not isinstance(obj, dict): + raise TypeError(f"Schema must be a JSON object: {schema_path}") + return obj + + +def _taxonomy_categories(repo_root: Path) -> list[str]: + """Return the fixed taxonomy categories, prefixed with 'All'.""" + schema = _load_schema(repo_root) + try: + enum = schema["properties"]["categories"]["items"]["enum"] + except Exception as e: # pragma: no cover + raise KeyError( + "Could not find taxonomy enum at " + "schemas/component.schema.json::properties.categories.items.enum" + ) from e + if not isinstance(enum, list) or not all(isinstance(x, str) for x in enum): + raise TypeError("Category enum must be a list of strings.") + return ["All", *enum] + + +def _format_json_path(parts: Iterable[Any]) -> str: + out: list[str] = [] + for p in parts: + if isinstance(p, int): + out.append(f"[{p}]") + else: + if out: + out.append(".") + out.append(str(p)) + return "".join(out) or "$" + + +def _validate_instance( + instance: Any, schema: dict[str, Any] +) -> list[ComponentBuildError]: + try: + from jsonschema import Draft202012Validator # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependency `jsonschema`.\n\n" + "Install it with:\n" + " pip install jsonschema\n" + "or add it to `component-gallery/requirements.txt`." 
+ ) from e + + validator = Draft202012Validator(schema) + errors: list[ComponentBuildError] = [] + for err in sorted(validator.iter_errors(instance), key=lambda x: list(x.path)): + errors.append( + ComponentBuildError( + file=Path(""), + message=err.message, + json_path=_format_json_path(err.path), + ) + ) + return errors + + +def _normalize_github_repo_url(url: str) -> str: + return normalize_github_repo_url(url) + + +def _component_key_from_github_url(url: str) -> str: + return repo_key(url) + + +def _load_previous_index(previous_path: Path | None) -> dict[str, dict[str, Any]]: + """Index previous compiled components by canonical github owner/repo.""" + if previous_path is None or not previous_path.is_file(): + return {} + obj = load_json(previous_path) + if not isinstance(obj, dict): + return {} + comps = obj.get("components", []) + if not isinstance(comps, list): + return {} + + out: dict[str, dict[str, Any]] = {} + for c in comps: + if not isinstance(c, dict): + continue + gh = c.get("gitHubUrl") + if not isinstance(gh, str) or not gh: + continue + try: + key = repo_key(gh) + except Exception: + continue + out[key] = c + return out + + +def _prev_int(prev: dict[str, Any], *path: str) -> int | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return int(cur) if isinstance(cur, int) else None + + +def _prev_str(prev: dict[str, Any], *path: str) -> str | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return str(cur) if isinstance(cur, str) else None + + +def _prev_bool(prev: dict[str, Any], *path: str) -> bool | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return bool(cur) if isinstance(cur, bool) else None + + +def _pip_cmd_from_submission( + links: dict[str, Any], install: dict[str, Any] | None +) -> str | None: + if isinstance(install, dict): + pip_cmd = install.get("pip") + if isinstance(pip_cmd, str) and pip_cmd.strip(): + return pip_cmd.strip() + + pkg = links.get("pypi") + if isinstance(pkg, str) and pkg.strip(): + return f"pip install {pkg.strip()}" + return None + + +def build_catalog( + *, + repo_root: Path, + out_path: Path, + components_dir: Path, + previous_path: Path | None, + skip_invalid: bool, +) -> tuple[dict[str, Any], list[ComponentBuildError]]: + schema = _load_schema(repo_root) + categories = _taxonomy_categories(repo_root) + prev_index = _load_previous_index(previous_path) + + errors: list[ComponentBuildError] = [] + compiled_components: list[dict[str, Any]] = [] + + if not components_dir.is_dir(): + raise FileNotFoundError(f"Missing components directory: {components_dir}") + + seen_keys: set[str] = set() + for json_file in sorted(components_dir.glob("*.json")): + try: + submission = _load_json(json_file) + except json.JSONDecodeError as e: + errors.append( + ComponentBuildError(file=json_file, message=str(e), json_path=None) + ) + continue + + if not isinstance(submission, dict): + errors.append( + ComponentBuildError( + file=json_file, + message="Submission JSON must be an object.", + json_path="$", + ) + ) + continue + + # Schema validation (so we can safely map fields) + for ve in _validate_instance(submission, schema): + errors.append( + ComponentBuildError( + file=json_file, + message=ve.message, + json_path=ve.json_path, + ) + ) + if any(e.file == json_file for e in errors) and not skip_invalid: + continue + if any(e.file == json_file for e in errors) and skip_invalid: + # Skip 
this component but keep going. + continue + + try: + author_obj = submission["author"] + links = submission["links"] + governance = submission["governance"] + title = submission["title"] + + author_github = author_obj["github"] + github_url = normalize_github_repo_url(links["github"]) + key = repo_key(github_url) + + if key in seen_keys: + errors.append( + ComponentBuildError( + file=json_file, + message=f"Duplicate component identity (same GitHub repo): {key}", + json_path="links.github", + ) + ) + continue + seen_keys.add(key) + + pip_cmd = _pip_cmd_from_submission(links, submission.get("install")) + pypi_project = links.get("pypi") + if not isinstance(pypi_project, str) or not pypi_project.strip(): + pypi_project = None + demo_url = links.get("demo") + app_url = demo_url if isinstance(demo_url, str) else None + + media = ( + submission.get("media") + if isinstance(submission.get("media"), dict) + else None + ) + image_url = media.get("image") if isinstance(media, dict) else None + if not isinstance(image_url, str): + image_url = None + + enabled = bool(governance.get("enabled", True)) + + # Compiled per-component categories should NOT include "All". + # "All" is an implied UI filter mode, not a real category assignment. + submitted_categories = submission.get("categories", []) + cat_list: list[str] = [] + if isinstance(submitted_categories, list): + for c in submitted_categories: + if isinstance(c, str) and c != "All" and c not in cat_list: + cat_list.append(c) + if not cat_list: + raise ValueError( + "Per-component categories must be non-empty (and must not be 'All')." + ) + + prev = prev_index.get(key, {}) + if not isinstance(prev, dict): + prev = {} + + # Prefer previous metrics.github stars if present, else (legacy) top-level stars. + stars_val: int | None = _prev_int(prev, "metrics", "github", "stars") + if stars_val is None: + stars_val = _prev_int(prev, "stars") + # Default to 0 to match the current gallery UI expectations. + if stars_val is None: + stars_val = 0 + + prev_forks = _prev_int(prev, "metrics", "github", "forks") + prev_open_issues = _prev_int(prev, "metrics", "github", "openIssues") + prev_contributors = _prev_int( + prev, "metrics", "github", "contributorsCount" + ) + prev_last_push_at = _prev_str(prev, "metrics", "github", "lastPushAt") + prev_fetched_at = _prev_str(prev, "metrics", "github", "fetchedAt") + prev_is_stale = _prev_bool(prev, "metrics", "github", "isStale") + prev_pypi = ( + prev.get("metrics", {}).get("pypi") + if isinstance(prev.get("metrics"), dict) + else None + ) + prev_pypistats = ( + prev.get("metrics", {}).get("pypistats") + if isinstance(prev.get("metrics"), dict) + else None + ) + + social_url = f"https://github.com/{author_github}" + + compiled_components.append( + { + "title": title, + "author": author_github, + "pipLink": pip_cmd, + "pypi": pypi_project, + "categories": cat_list, + "image": image_url, + "gitHubUrl": github_url, + "enabled": enabled, + "appUrl": app_url, + "socialUrl": social_url, + "metrics": { + "github": { + "stars": stars_val, + "forks": prev_forks, + "openIssues": prev_open_issues, + "contributorsCount": prev_contributors, + "lastPushAt": prev_last_push_at, + "fetchedAt": prev_fetched_at, + "isStale": prev_is_stale, + }, + "pypi": prev_pypi, + "pypistats": prev_pypistats, + }, + } + ) + except Exception as e: + errors.append( + ComponentBuildError(file=json_file, message=str(e), json_path=None) + ) + if not skip_invalid: + continue + + # Deterministic ordering for stable diffs. 
+ compiled_components.sort( + key=lambda c: (c.get("gitHubUrl") or "", c.get("title") or "") + ) + + compiled = { + "generatedAt": utc_now_iso(), + "schemaVersion": 1, + "categories": categories, + "components": compiled_components, + } + return compiled, errors + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Build the compiled components catalog." + ) + parser.add_argument( + "--out", + default=None, + help="Output path for the compiled JSON (default: compiled/components.json).", + ) + parser.add_argument( + "--components-dir", + default=None, + help="Directory containing per-component JSON submissions (default: components/).", + ) + parser.add_argument( + "--previous", + default=None, + help=( + "Path to a previous compiled artifact to carry forward metrics like stars. " + "Defaults to compiled/components.json if present." + ), + ) + parser.add_argument( + "--skip-invalid", + action="store_true", + help="Skip invalid component JSON files instead of failing the build.", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + + out_path = ( + Path(args.out) if args.out else (repo_root / "compiled" / "components.json") + ) + components_dir = ( + Path(args.components_dir) if args.components_dir else (repo_root / "components") + ) + + previous_path: Path | None + if args.previous: + previous_path = Path(args.previous) + else: + candidate = repo_root / "compiled" / "components.json" + previous_path = candidate if candidate.is_file() else None + + compiled, errors = build_catalog( + repo_root=repo_root, + out_path=out_path, + components_dir=components_dir, + previous_path=previous_path, + skip_invalid=args.skip_invalid, + ) + + if errors and not args.skip_invalid: + print( + "ERROR: build failed due to invalid component submissions:", file=sys.stderr + ) + for e in errors: + rel = e.file.relative_to(repo_root) if e.file.is_absolute() else e.file + jp = f"{e.json_path}: " if e.json_path else "" + print(f"- {rel}: {jp}{e.message}", file=sys.stderr) + return 1 + + dump_json(out_path, compiled) + + # Print a compact summary for CI logs + ts = utc_now_iso() + print( + f"Wrote {len(compiled.get('components', []))} component(s) to {out_path} at {ts}." + ) + if errors and args.skip_invalid: + print(f"NOTE: Skipped {len(errors)} validation error(s).", file=sys.stderr) + for e in errors: + rel = e.file.relative_to(repo_root) if e.file.is_absolute() else e.file + jp = f"{e.json_path}: " if e.json_path else "" + print(f"- {rel}: {jp}{e.message}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/compute_ranking.py b/directory/scripts/compute_ranking.py new file mode 100644 index 00000000..5b85695f --- /dev/null +++ b/directory/scripts/compute_ranking.py @@ -0,0 +1,284 @@ +""" +Compute and persist ranking signals for the compiled component catalog. + +This script reads `compiled/components.json` and writes a `ranking` block for each +component, following the tech spec's v1 proposal: + +- starsScore = log10(stars + 1) +- recencyScore = exp(-days_since_update / half_life_days) + - days_since_update = min(days_since_github_push, days_since_pypi_release) when both exist + +The final score is: + score = w_stars * starsScore + w_recency * recencyScore + +If recency data is missing, the score falls back to the stars-only term. 
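For illustration, with the default ranking_config.json (stars weight 1.0, recency
weight 2.0, halfLifeDays 90) and a component that has 1,200 stars and was last
updated 30 days ago, with no contributor or download signals available:

    starsScore   = log10(1200 + 1)              ≈ 3.08
    recencyScore = exp(-30 / 90)                ≈ 0.717
    score        = 1.0 * 3.08 + 2.0 * 0.717     ≈ 4.51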
+ +Run from the repo root (recommended): + + python directory/scripts/compute_ranking.py +""" + +from __future__ import annotations + +import argparse +import math +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from _utils.io import dump_json, load_json +from _utils.time import parse_iso8601, utc_now_iso + + +@dataclass(frozen=True) +class RankingConfig: + half_life_days: float + w_stars: float + w_recency: float + w_contributors: float + w_downloads: float + + +def _load_ranking_config(path: Path) -> RankingConfig: + obj = load_json(path) + if not isinstance(obj, dict): + raise TypeError(f"Ranking config must be a JSON object: {path}") + half_life = obj.get("halfLifeDays", 90.0) + weights = obj.get("weights", {}) + if not isinstance(weights, dict): + weights = {} + w_stars = weights.get("stars", 1.0) + w_recency = weights.get("recency", 2.0) + w_contributors = weights.get("contributors", 0.0) + w_downloads = weights.get("downloads", 0.0) + + try: + half_life_f = float(half_life) + w_stars_f = float(w_stars) + w_recency_f = float(w_recency) + w_contributors_f = float(w_contributors) + w_downloads_f = float(w_downloads) + except Exception as e: # pragma: no cover + raise TypeError("Ranking config values must be numeric.") from e + + if half_life_f <= 0: + raise ValueError("halfLifeDays must be > 0.") + + return RankingConfig( + half_life_days=half_life_f, + w_stars=w_stars_f, + w_recency=w_recency_f, + w_contributors=w_contributors_f, + w_downloads=w_downloads_f, + ) + + +def _days_since(dt: datetime, now: datetime) -> float: + delta_s = (now - dt).total_seconds() + # If clocks or sources are weird and dt is in the future, clamp to 0. + if delta_s < 0: + delta_s = 0.0 + return delta_s / 86400.0 + + +def _get_nested(comp: dict[str, Any], *path: str) -> Any: + cur: Any = comp + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return cur + + +def _stars_for_component(comp: dict[str, Any]) -> int: + # Prefer nested metrics.github.stars if available. 
+ s = _get_nested(comp, "metrics", "github", "stars") + if isinstance(s, int): + return max(0, s) + return 0 + + +def _contributors_for_component(comp: dict[str, Any]) -> int | None: + c = _get_nested(comp, "metrics", "github", "contributorsCount") + if isinstance(c, int): + return max(0, c) + return None + + +def _downloads_last_month(comp: dict[str, Any]) -> int | None: + d = _get_nested(comp, "metrics", "pypistats", "lastMonth") + if isinstance(d, int) and d >= 0: + return d + return None + + +def _recency_days( + comp: dict[str, Any], now: datetime +) -> tuple[float | None, float | None, float | None]: + gh_last_push = _get_nested(comp, "metrics", "github", "lastPushAt") + pypi_latest_release = _get_nested(comp, "metrics", "pypi", "latestReleaseAt") + + gh_dt = parse_iso8601(gh_last_push if isinstance(gh_last_push, str) else None) + pypi_dt = parse_iso8601( + pypi_latest_release if isinstance(pypi_latest_release, str) else None + ) + + gh_days = _days_since(gh_dt, now) if gh_dt else None + pypi_days = _days_since(pypi_dt, now) if pypi_dt else None + + days_since_update: float | None + if gh_days is not None and pypi_days is not None: + days_since_update = min(gh_days, pypi_days) + else: + days_since_update = gh_days if gh_days is not None else pypi_days + + return days_since_update, gh_days, pypi_days + + +def _compute_ranking( + comp: dict[str, Any], *, cfg: RankingConfig, now: datetime +) -> dict[str, Any]: + stars = _stars_for_component(comp) + stars_score = math.log10(stars + 1) + + contributors = _contributors_for_component(comp) + contributors_score: float | None = None + if contributors is not None: + contributors_score = math.log10(contributors + 1) + + downloads_last_month = _downloads_last_month(comp) + downloads_score: float | None = None + if downloads_last_month is not None: + downloads_score = math.log10(downloads_last_month + 1) + + days_since_update, gh_days, pypi_days = _recency_days(comp, now) + recency_score: float | None = None + if days_since_update is not None: + recency_score = math.exp(-days_since_update / cfg.half_life_days) + + score = cfg.w_stars * stars_score + if recency_score is not None: + score += cfg.w_recency * recency_score + if contributors_score is not None: + score += cfg.w_contributors * contributors_score + if downloads_score is not None: + score += cfg.w_downloads * downloads_score + + # Keep ranking explainable and stable. 
+ return { + "score": score, + "signals": { + "starsScore": stars_score, + "recencyScore": recency_score, + "contributorsScore": contributors_score, + "daysSinceUpdate": days_since_update, + "daysSinceGithubPush": gh_days, + "daysSincePypiRelease": pypi_days, + "downloadsScore": downloads_score, + }, + "computedAt": utc_now_iso(), + } + + +def compute_rankings( + *, + compiled_in: Path, + compiled_out: Path, + config_path: Path, + limit: int | None, +) -> int: + obj = load_json(compiled_in) + if not isinstance(obj, dict): + print( + f"ERROR: compiled catalog must be a JSON object: {compiled_in}", + file=sys.stderr, + ) + return 2 + + comps = obj.get("components") + if not isinstance(comps, list): + print( + f"ERROR: compiled catalog missing `components` array: {compiled_in}", + file=sys.stderr, + ) + return 2 + + cfg = _load_ranking_config(config_path) + now = datetime.now(timezone.utc) + + processed = 0 + for comp in comps: + if limit is not None and processed >= limit: + break + processed += 1 + if not isinstance(comp, dict): + continue + comp["ranking"] = _compute_ranking(comp, cfg=cfg, now=now) + + dump_json(compiled_out, obj) + print(f"Wrote rankings for {processed} component(s) to {compiled_out}.") + return 0 + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Compute ranking fields for compiled/components.json." + ) + parser.add_argument( + "--in", + dest="compiled_in", + default=None, + help="Input compiled catalog path (default: compiled/components.json).", + ) + parser.add_argument( + "--out", + dest="compiled_out", + default=None, + help="Output compiled catalog path (default: overwrite --in).", + ) + parser.add_argument( + "--config", + dest="config_path", + default=None, + help="Ranking config path (default: ranking_config.json at repo root).", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components (debug).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + compiled_in = ( + Path(args.compiled_in) + if args.compiled_in + else (repo_root / "compiled" / "components.json") + ) + compiled_out = Path(args.compiled_out) if args.compiled_out else compiled_in + config_path = ( + Path(args.config_path) + if args.config_path + else (repo_root / "ranking_config.json") + ) + + if not compiled_in.is_file(): + print(f"ERROR: Missing compiled catalog: {compiled_in}", file=sys.stderr) + return 2 + if not config_path.is_file(): + print(f"ERROR: Missing ranking config: {config_path}", file=sys.stderr) + return 2 + + return compute_rankings( + compiled_in=compiled_in, + compiled_out=compiled_out, + config_path=config_path, + limit=args.limit, + ) + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/enrich.py b/directory/scripts/enrich.py new file mode 100644 index 00000000..e02e22a0 --- /dev/null +++ b/directory/scripts/enrich.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from _enrichers import get_default_enrichers # type: ignore[import-not-found] +from _utils.enrichment_engine import ( + run_enrichment_engine, # type: ignore[import-not-found] +) +from _utils.io import dump_json, load_json # type: ignore[import-not-found] +from _utils.time import utc_now_iso # type: ignore[import-not-found] + + +def _parse_services(raw: list[str] | None) -> list[str]: + if not raw: + return ["github", "pypi", "pypistats"] + return 
[s.strip().lower() for s in raw if s.strip()] + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Enrich compiled/components.json using GitHub, PyPI, and pypistats." + ) + parser.add_argument( + "--in", + dest="compiled_in", + default=None, + help="Input compiled catalog path (default: compiled/components.json).", + ) + parser.add_argument( + "--out", + dest="compiled_out", + default=None, + help="Output compiled catalog path (default: overwrite --in).", + ) + parser.add_argument( + "--services", + nargs="*", + default=None, + help="Which enrichers to run (default: github pypi pypistats).", + ) + parser.add_argument( + "--token-env", + default="GH_TOKEN", + help="Environment variable name holding a GitHub token (default: GH_TOKEN).", + ) + parser.add_argument( + "--timeout", + type=float, + default=20.0, + help="HTTP timeout in seconds (default: 20).", + ) + parser.add_argument( + "--sleep-github", + type=float, + default=None, + help="Sleep between unique GitHub API requests in seconds.", + ) + parser.add_argument( + "--sleep-pypi", + type=float, + default=None, + help="Sleep between unique PyPI API requests in seconds.", + ) + parser.add_argument( + "--sleep-pypistats", + type=float, + default=None, + help="Sleep between unique pypistats API requests in seconds.", + ) + parser.add_argument( + "--refresh-older-than-hours", + type=float, + default=24.0, + help=( + "Only refetch metrics if existing fetchedAt values are older than this many " + "hours (default: 24). Use 0 to refetch everything." + ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components (debug).", + ) + parser.add_argument( + "--allow-failures", + action="store_true", + help="Do not fail the process if some enrichment fetches fail.", + ) + parser.add_argument( + "--workers", + type=int, + default=max(4, (os.cpu_count() or 4) * 4), + help="Max worker threads (default: 4 * CPU count).", + ) + parser.add_argument( + "--progress-every", + dest="progress_every", + type=int, + default=25, + help="Print progress every N processed components (default: 25). 
Use 0 to disable.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Print per-service failure details.", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + compiled_in = ( + Path(args.compiled_in) + if args.compiled_in + else (repo_root / "compiled" / "components.json") + ) + compiled_out = Path(args.compiled_out) if args.compiled_out else compiled_in + + if not compiled_in.is_file(): + print(f"ERROR: Missing compiled catalog: {compiled_in}", file=sys.stderr) + return 2 + + obj = load_json(compiled_in) + if not isinstance(obj, dict): + print( + f"ERROR: compiled catalog must be a JSON object: {compiled_in}", + file=sys.stderr, + ) + return 2 + comps = obj.get("components") + if not isinstance(comps, list): + print( + f"ERROR: compiled catalog missing `components` array: {compiled_in}", + file=sys.stderr, + ) + return 2 + + services = _parse_services(args.services) + enrichers = [ + e + for e in get_default_enrichers(github_token_env=args.token_env) + if e.name in services + ] + if not enrichers: + print(f"ERROR: No valid services selected: {services}", file=sys.stderr) + return 2 + + has_gh_token = bool( + os.environ.get(args.token_env) + or os.environ.get("GH_TOKEN") + or os.environ.get("GH_API_TOKEN") + or os.environ.get("GITHUB_TOKEN") + ) + github_sleep = ( + float(args.sleep_github) + if args.sleep_github is not None + else (0.2 if has_gh_token else 1.0) + ) + pypi_sleep = float(args.sleep_pypi) if args.sleep_pypi is not None else 0.3 + pypistats_sleep = ( + float(args.sleep_pypistats) if args.sleep_pypistats is not None else pypi_sleep + ) + + sleep_by_service = { + "github": github_sleep, + "pypi": pypi_sleep, + "pypistats": pypistats_sleep, + } + + run_fetched_at = utc_now_iso() + comps_for_run = comps if args.limit is None else comps[: int(args.limit)] + expected_counts: dict[str, int] = {e.name: 0 for e in enrichers} + for comp in comps_for_run: + if not isinstance(comp, dict): + continue + for enricher in enrichers: + if not enricher.needs_fetch(comp, args.refresh_older_than_hours): + continue + if enricher.key_for_component(comp) is None: + continue + expected_counts[enricher.name] += 1 + for enricher in enrichers: + print( + f"[{enricher.name}] will attempt {expected_counts[enricher.name]} component(s).", + flush=True, + ) + result = run_enrichment_engine( + components=comps_for_run, + enrichers=enrichers, + refresh_older_than_hours=args.refresh_older_than_hours, + timeout_s=float(args.timeout), + sleep_by_service=sleep_by_service, + workers=int(args.workers), + run_fetched_at=run_fetched_at, + progress_every=( + int(args.progress_every) if args.progress_every is not None else None + ), + ) + + dump_json(compiled_out, obj) + + ts = utc_now_iso() + print(f"Wrote {compiled_out} at {ts}.") + for enricher in enrichers: + s = result.stats[enricher.name] + print( + f"[{enricher.name}] summary: processed={s.processed} " + f"requests={s.requests} ok={s.ok} fail={s.failed} " + f"updated={s.updated} skipped_fresh={s.skipped_fresh} " + f"cache_hits={s.cache_hits} skipped_no_key={s.skipped_no_key}", + flush=True, + ) + + any_failures = False + for enricher in enrichers: + fails = result.failures[enricher.name] + if fails: + any_failures = True + print( + f"WARNING: {len(fails)} {enricher.name} fetch failure(s):", + file=sys.stderr, + ) + for f in fails[:50]: + code = f" (status {f.status})" if f.status is not None else "" + print(f"- {f.key}{code}: {f.error}", file=sys.stderr) + if len(fails) > 50: + print(f"... 
and {len(fails) - 50} more", file=sys.stderr) + if args.verbose: + for f in fails: + code = f" (status {f.status})" if f.status is not None else "" + print( + f"[{enricher.name}] FAIL {f.key}{code}: {f.error}", + file=sys.stderr, + ) + + if any_failures and not args.allow_failures: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/enrich_images.py b/directory/scripts/enrich_images.py new file mode 100644 index 00000000..05b8ccbf --- /dev/null +++ b/directory/scripts/enrich_images.py @@ -0,0 +1,501 @@ +""" +Validate Component Gallery preview images (`media.image`) for stability + accessibility. + +Rules enforced: +- `media.image` is optional (it may be missing, null, or an empty string). If present and non-empty: +- URL must be https:// +- must not be a brittle proxy (e.g. `camo.githubusercontent.com`) +- must not contain signed/expiring query params (X-Amz-*, X-Goog-*, Signature/Expires/etc) +- must be fetchable (HTTP 2xx) and plausibly an image (best-effort via Content-Type) + +Typical usage (from repo root): + python directory/scripts/enrich_images.py --check-only + +Notes: +- This script is intentionally check-only (no auto-fix, no caching). +- It requires outbound network access. +""" + +from __future__ import annotations + +import argparse +import os +import sys +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Any +from urllib.parse import parse_qsl, urlparse + +import requests +from _utils.image_url_policy import DISALLOWED_IMAGE_HOSTS, DISALLOWED_IMAGE_QUERY_KEYS +from _utils.io import load_json +from requests.adapters import HTTPAdapter + +DEFAULT_TIMEOUT_S = 15.0 +DEFAULT_WORKERS = min(32, max(4, (os.cpu_count() or 4) * 5)) + + +def _is_https_url(url: str) -> bool: + """Return True if a URL is a well-formed HTTPS URL. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if the URL uses the ``https`` scheme and has a non-empty network + location (host). + """ + p = urlparse(url) + return p.scheme == "https" and bool(p.netloc) + + +def _is_disallowed_host(url: str) -> bool: + """Return True if the URL host is disallowed for preview images. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if the URL host is in the disallowed host list. + """ + host = (urlparse(url).netloc or "").lower() + return host in DISALLOWED_IMAGE_HOSTS + + +def _has_disallowed_query_params(url: str) -> bool: + """Return True if the URL includes signed/expiring query parameters. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if any query parameter key matches a disallowed key (case-insensitive). + """ + for k, _ in parse_qsl(urlparse(url).query, keep_blank_values=True): + if k.strip().lower() in DISALLOWED_IMAGE_QUERY_KEYS: + return True + return False + + +def _is_imageish_content_type(ct: str | None) -> bool: + """Return True if an HTTP Content-Type is plausibly an image. + + Parameters + ---------- + ct + Content-Type header value (may be missing). + + Returns + ------- + bool + True if the value looks like an image MIME type (``image/*``), or if it + is missing/blank, or if it is ``application/octet-stream`` (some CDNs + mislabel images). This is intentionally permissive to avoid false negatives. 
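+
+    Examples
+    --------
+    Doctest-style illustrations of the rules above:
+
+    >>> _is_imageish_content_type("image/png")
+    True
+    >>> _is_imageish_content_type("application/octet-stream")
+    True
+    >>> _is_imageish_content_type(None)
+    True
+    >>> _is_imageish_content_type("text/html; charset=utf-8")
+    False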
+ """ + if not isinstance(ct, str) or not ct.strip(): + return True + base = ct.split(";", 1)[0].strip().lower() + if base.startswith("image/"): + return True + if base in {"application/octet-stream"}: + return True + return False + + +@dataclass(frozen=True) +class ImageCheck: + """Result of a best-effort remote image URL check. + + Attributes + ---------- + ok + True if the URL was fetchable (HTTP 2xx). + status + HTTP status code if available. + final_url + Final URL after redirects if available. + content_type + Response Content-Type header if available. + error + Human-readable error string for failures. + """ + + ok: bool + status: int | None + final_url: str | None + content_type: str | None + error: str | None = None + + +_tls = threading.local() + + +def _get_thread_session(*, pool_maxsize: int) -> requests.Session: + """Return a thread-local `requests.Session` for connection reuse. + + Parameters + ---------- + pool_maxsize + Max number of pooled connections to keep per host for the mounted adapters. + + Returns + ------- + requests.Session + A per-thread session instance. + + Notes + ----- + `requests.Session` is not guaranteed to be thread-safe. Using one session per + worker thread preserves connection pooling without cross-thread sharing. + """ + s = getattr(_tls, "session", None) + if s is None: + s = requests.Session() + adapter = HTTPAdapter(pool_connections=pool_maxsize, pool_maxsize=pool_maxsize) + s.mount("https://", adapter) + s.mount("http://", adapter) + _tls.session = s + return s + + +def _check_fetchable( + session: requests.Session, url: str, *, timeout_s: float +) -> ImageCheck: + """Fetch an image URL (HEAD then GET) and return an `ImageCheck`. + + Parameters + ---------- + session + `requests` session used to issue HTTP requests. + url + Image URL to fetch. + timeout_s + Per-request timeout in seconds. + + Returns + ------- + ImageCheck + Structured result containing status, final URL after redirects, and + Content-Type (best-effort). + + Notes + ----- + This function tries a ``HEAD`` request first (faster, less bandwidth). Some + servers block or mishandle ``HEAD``; in that case we fall back to a streaming + ``GET``. + """ + headers = {"User-Agent": "component-gallery-image-check"} + + # HEAD first; fall back to GET (some servers block HEAD). + try: + with session.head( + url, allow_redirects=True, timeout=timeout_s, headers=headers + ) as r: + status = int(r.status_code) + ct = r.headers.get("Content-Type") + if 200 <= status < 300: + return ImageCheck( + ok=True, + status=status, + final_url=str(r.url), + content_type=ct, + error=None, + ) + except requests.RequestException: + # Some servers reject or mishandle HEAD requests; ignore the error and + # fall back to a full GET request below. + pass + + try: + with session.get( + url, allow_redirects=True, timeout=timeout_s, headers=headers, stream=True + ) as r: + status = int(r.status_code) + ct = r.headers.get("Content-Type") + if 200 <= status < 300: + return ImageCheck( + ok=True, + status=status, + final_url=str(r.url), + content_type=ct, + error=None, + ) + return ImageCheck( + ok=False, + status=status, + final_url=str(r.url), + content_type=ct, + error=f"HTTP {status}", + ) + except requests.RequestException as e: + return ImageCheck( + ok=False, + status=None, + final_url=None, + content_type=None, + error=f"{type(e).__name__}: {e}", + ) + + +def _get_media_image(obj: dict[str, Any]) -> str | None: + """Extract a non-empty `media.image` string from a component JSON object. 
+ + Parameters + ---------- + obj + Parsed JSON object for a component. + + Returns + ------- + str | None + The stripped image URL if present and non-empty; otherwise ``None``. + """ + media = obj.get("media") + if not isinstance(media, dict): + return None + img = media.get("image") + return img.strip() if isinstance(img, str) and img.strip() else None + + +@dataclass(frozen=True) +class _FetchTask: + """Unit of work for a single network fetch.""" + + json_name: str + url: str + + +@dataclass(frozen=True) +class _FetchResult: + """Network fetch result for a single component JSON file.""" + + json_name: str + url: str + chk: ImageCheck + + +def _fetch_one( + task: _FetchTask, *, timeout_s: float, pool_maxsize: int +) -> _FetchResult: + """Fetch one URL for a `_FetchTask` and return a `_FetchResult`. + + Parameters + ---------- + task + Task describing which component file the URL came from. + timeout_s + Per-request timeout in seconds. + pool_maxsize + Per-thread connection pool size for the thread-local session. + + Returns + ------- + _FetchResult + Result record with the originating JSON file name and `ImageCheck`. + """ + session = _get_thread_session(pool_maxsize=pool_maxsize) + chk = _check_fetchable(session, task.url, timeout_s=timeout_s) + return _FetchResult(json_name=task.json_name, url=task.url, chk=chk) + + +def check_images( + *, + components_dir: Path, + timeout_s: float, + verbose: bool, + workers: int, +) -> int: + """Validate component preview image URLs under `components_dir`. + + Parameters + ---------- + components_dir + Directory containing `components/*.json` submission files. + timeout_s + Per-request timeout in seconds for image fetch checks. + verbose + If True, print an OK line for each successfully validated image. + workers + Maximum number of concurrent network fetches to run. + + Returns + ------- + int + Process-style return code: 0 if all checks pass, otherwise 1. + + Notes + ----- + The validation happens in two phases: + + 1. Local policy checks (HTTPS, disallowed hosts, disallowed query params). + 2. Network checks (fetchability + permissive Content-Type validation), run in + parallel because they are I/O-bound. + """ + failures = 0 + tasks: list[_FetchTask] = [] + local_failures: dict[str, str] = {} + + for json_file in sorted(components_dir.glob("*.json")): + try: + obj = load_json(json_file) + except Exception as e: + local_failures[json_file.name] = f"invalid JSON ({e})" + continue + if not isinstance(obj, dict): + continue + + img = _get_media_image(obj) + if not img: + # Optional: null/empty is allowed. + continue + + if not _is_https_url(img): + local_failures[json_file.name] = f"not https:// ({img})" + continue + + if _is_disallowed_host(img): + local_failures[json_file.name] = f"disallowed_host=camo ({img})" + continue + + if _has_disallowed_query_params(img): + local_failures[json_file.name] = f"signed/expiring_url ({img})" + continue + + tasks.append(_FetchTask(json_name=json_file.name, url=img)) + + # Report local (non-network) failures deterministically. + for json_name in sorted(local_failures.keys()): + print( + f"[images] FAIL {json_name}: {local_failures[json_name]}", file=sys.stderr + ) + failures += 1 + + # Network-bound checks: run in parallel (bounded). + results_by_json: dict[str, _FetchResult] = {} + if tasks: + # Ensure a sane lower bound; allow workers=1 for debugging. 
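+        # The pool size below is forwarded to each worker's thread-local session
+        # (see _get_thread_session); since a worker issues one request at a time,
+        # anything beyond a few pooled connections is just headroom.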
+ w = max(1, int(workers)) + pool_maxsize = max(8, w) + with ThreadPoolExecutor(max_workers=w) as ex: + futs = [ + ex.submit(_fetch_one, t, timeout_s=timeout_s, pool_maxsize=pool_maxsize) + for t in tasks + ] + for fut in as_completed(futs): + r = fut.result() + results_by_json[r.json_name] = r + + for json_name in sorted(results_by_json.keys()): + r = results_by_json[json_name] + chk = r.chk + img = r.url + + if not chk.ok: + print( + f"[images] FAIL {json_name}: unfetchable ({chk.error}) ({img})", + file=sys.stderr, + ) + failures += 1 + continue + + if not _is_imageish_content_type(chk.content_type): + print( + f"[images] FAIL {json_name}: non-image content-type ({chk.content_type}) ({img})", + file=sys.stderr, + ) + failures += 1 + continue + + if verbose: + print(f"[images] OK {json_name}: {chk.final_url or img}") + + print(f"[images] done: failures={failures}") + return 1 if failures else 0 + + +def main(argv: list[str]) -> int: + """CLI entrypoint. + + Parameters + ---------- + argv + Command line arguments excluding the program name (i.e., ``sys.argv[1:]``). + + Returns + ------- + int + Process exit code: + + - 0: all checks passed + - 1: one or more checks failed + - 2: configuration error (e.g., missing components dir, offline mode set) + """ + parser = argparse.ArgumentParser( + description="Validate `media.image` URLs for components." + ) + parser.add_argument( + "--components-dir", + default=None, + help="Directory containing components/*.json (default: components/).", + ) + parser.add_argument( + "--check-only", + action="store_true", + help="Compatibility flag; this script is always check-only.", + ) + parser.add_argument( + "--timeout", + type=float, + default=DEFAULT_TIMEOUT_S, + help=f"HTTP timeout in seconds (default: {DEFAULT_TIMEOUT_S}).", + ) + parser.add_argument( + "--workers", + type=int, + default=DEFAULT_WORKERS, + help=f"Max parallel fetches (default: {DEFAULT_WORKERS}).", + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output.") + args = parser.parse_args(argv) + + if os.environ.get("COMPONENT_GALLERY_OFFLINE") == "1": + print( + "ERROR: COMPONENT_GALLERY_OFFLINE=1 set; image checks require network.", + file=sys.stderr, + ) + return 2 + + project_root = Path(__file__).resolve().parents[1] + components_dir = ( + Path(args.components_dir) + if args.components_dir + else (project_root / "components") + ) + if not components_dir.is_dir(): + print(f"ERROR: components dir not found: {components_dir}", file=sys.stderr) + return 2 + + return check_images( + components_dir=components_dir, + timeout_s=float(args.timeout), + verbose=bool(args.verbose), + workers=int(args.workers), + ) + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/run_pipeline.py b/directory/scripts/run_pipeline.py new file mode 100644 index 00000000..01de5c09 --- /dev/null +++ b/directory/scripts/run_pipeline.py @@ -0,0 +1,273 @@ +""" +Run the full Component Gallery pipeline with a single entrypoint. 
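+
+In addition to the numbered steps below, the default run also checks `media.image`
+URLs right after submission validation (skip with --no-images) and enriches
+pypistats download counts alongside PyPI metadata (skip with --no-pypistats).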
+ +Default pipeline: + + 1) Validate `components/*.json` submissions + 2) Build `compiled/components.json` + 3) Validate `compiled/components.json` + 4) Enrich GitHub metrics + 5) Enrich PyPI metrics + 6) Compute ranking signals + 7) Validate `compiled/components.json` again + +Run from the repo root (recommended): + + python directory/scripts/run_pipeline.py + +Typical CI usage: + + # Build + validate only (no network) + python directory/scripts/run_pipeline.py --no-enrich +""" + +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from pathlib import Path + + +def _run(cmd: list[str]) -> int: + proc = subprocess.run(cmd) + return int(proc.returncode) + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Run validate -> build -> enrich pipeline for the component gallery." + ) + parser.add_argument( + "--no-validate", + action="store_true", + help="Skip validation steps (not recommended).", + ) + parser.add_argument( + "--no-build", + action="store_true", + help="Skip build step (assumes compiled/components.json already exists).", + ) + parser.add_argument( + "--no-github", + action="store_true", + help="Skip GitHub enrichment.", + ) + parser.add_argument( + "--no-pypi", + action="store_true", + help="Skip PyPI enrichment.", + ) + parser.add_argument( + "--no-pypistats", + action="store_true", + help="Skip PyPI download enrichment (pypistats).", + ) + parser.add_argument( + "--no-enrich", + action="store_true", + help="Skip all enrichment (equivalent to --no-github --no-pypi).", + ) + parser.add_argument( + "--no-ranking", + action="store_true", + help="Skip ranking computation (not recommended).", + ) + parser.add_argument( + "--no-images", + action="store_true", + help="Skip image URL checking (requires outbound network).", + ) + parser.add_argument( + "--allow-enrich-failures", + action="store_true", + help="Do not fail the pipeline if some enrichment fetches fail.", + ) + parser.add_argument( + "--refresh-older-than-hours", + type=float, + default=24.0, + help=( + "Only refetch enrichment metrics if existing fetchedAt values are older " + "than this many hours (default: 24). Use 0 to force refetching everything." + ), + ) + parser.add_argument( + "--enrich-progress-every", + type=int, + default=None, + help=( + "Forwarded to enrichers as --progress-every N. " + "Default: use each enricher's default." + ), + ) + parser.add_argument( + "--enrich-verbose", + action="store_true", + help="Forwarded to enrichers as --verbose (prints per-request failures as they happen).", + ) + parser.add_argument( + "--enrich-sleep-github", + type=float, + default=None, + help=( + "Sleep between unique GitHub API requests in seconds. " + "Default: 0.2 with GH_TOKEN set, else 1.0 (safer for large catalogs)." + ), + ) + parser.add_argument( + "--enrich-sleep-pypi", + type=float, + default=None, + help=( + "Sleep between unique PyPI API requests in seconds. " + "Default: 0.3 (safer for large catalogs)." + ), + ) + parser.add_argument( + "--enrich-sleep-pypistats", + type=float, + default=None, + help=( + "Sleep between unique pypistats API requests in seconds. " + "Default: reuse PyPI sleep (0.3 by default)." 
+ ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components for each enrichment step (debug).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + py = sys.executable + + if args.no_enrich: + args.no_github = True + args.no_pypi = True + args.no_pypistats = True + + # Choose conservative enrichment pacing defaults, especially for large catalogs. + has_gh_token = bool( + os.environ.get("GH_TOKEN") + or os.environ.get("GH_API_TOKEN") + or os.environ.get("GITHUB_TOKEN") + ) + github_sleep = ( + float(args.enrich_sleep_github) + if args.enrich_sleep_github is not None + else (0.2 if has_gh_token else 1.0) + ) + pypi_sleep = ( + float(args.enrich_sleep_pypi) if args.enrich_sleep_pypi is not None else 0.3 + ) + pypistats_sleep = ( + float(args.enrich_sleep_pypistats) + if args.enrich_sleep_pypistats is not None + else pypi_sleep + ) + + def run_step(name: str, cmd: list[str]) -> int: + # Flush so headers appear before subprocess output in buffered environments. + print(f"\n==> {name}\n$ {' '.join(cmd)}", flush=True) + return _run(cmd) + + # 1) Validate submissions + if not args.no_validate: + rc = run_step( + "Validate submissions", [py, str(repo_root / "scripts" / "validate.py")] + ) + if rc != 0: + return rc + + # 1b) Check image URLs (network). Keep this separate from schema validation so + # CI can enforce it while local/offline runs can skip it. + if not args.no_images: + rc = run_step( + "Check images", + [py, str(repo_root / "scripts" / "enrich_images.py"), "--check-only"], + ) + if rc != 0: + return rc + + # 2) Build compiled artifact + if not args.no_build: + rc = run_step( + "Build compiled catalog", + [py, str(repo_root / "scripts" / "build_catalog.py")], + ) + if rc != 0: + return rc + + # 3) Validate compiled artifact + if not args.no_validate: + rc = run_step( + "Validate compiled catalog", + [py, str(repo_root / "scripts" / "validate.py"), "--compiled"], + ) + if rc != 0: + return rc + + # 4) Enrich (GitHub/PyPI/pypistats) + services: list[str] = [] + if not args.no_github: + services.append("github") + if not args.no_pypi: + services.append("pypi") + if not args.no_pypistats: + services.append("pypistats") + + if services: + cmd = [ + py, + str(repo_root / "scripts" / "enrich.py"), + "--services", + *services, + "--sleep-github", + str(github_sleep), + "--sleep-pypi", + str(pypi_sleep), + "--sleep-pypistats", + str(pypistats_sleep), + "--refresh-older-than-hours", + str(args.refresh_older_than_hours), + ] + if args.enrich_progress_every is not None: + cmd += ["--progress-every", str(args.enrich_progress_every)] + if args.enrich_verbose: + cmd += ["--verbose"] + if args.limit is not None: + cmd += ["--limit", str(args.limit)] + if args.allow_enrich_failures: + cmd += ["--allow-failures"] + rc = run_step("Enrich catalog", cmd) + if rc != 0: + return rc + + # 6) Compute ranking + if not args.no_ranking: + cmd = [py, str(repo_root / "scripts" / "compute_ranking.py")] + if args.limit is not None: + cmd += ["--limit", str(args.limit)] + rc = run_step("Compute ranking", cmd) + if rc != 0: + return rc + + # 7) Final validate compiled artifact + if not args.no_validate: + rc = run_step( + "Final validate compiled catalog", + [py, str(repo_root / "scripts" / "validate.py"), "--compiled"], + ) + if rc != 0: + return rc + + print("\nOK: pipeline completed successfully.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/validate.py 
b/directory/scripts/validate.py new file mode 100644 index 00000000..1cdc2b57 --- /dev/null +++ b/directory/scripts/validate.py @@ -0,0 +1,554 @@ +""" +Validate Component Gallery JSON files. + +This script validates: + +- Source-of-truth component submissions: `components/*.json` + against `schemas/component.schema.json`. +- Optionally, the compiled artifact: `compiled/components.json` + against `schemas/compiled.schema.json` (use `--compiled`). + +Run from the repo root (recommended): + + python directory/scripts/validate.py + python directory/scripts/validate.py --compiled +""" + +from __future__ import annotations + +import argparse +import sys +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable +from urllib.parse import parse_qsl, urlparse + +from _utils.github import normalize_github_repo_url +from _utils.image_url_policy import DISALLOWED_IMAGE_HOSTS, DISALLOWED_IMAGE_QUERY_KEYS +from _utils.io import load_json + + +@dataclass(frozen=True) +class ValidationIssue: + """A single schema validation issue tied to a specific JSON file.""" + + file: Path + schema: Path + message: str + json_path: str | None = None + + +def _format_json_path(parts: Iterable[Any]) -> str: + """Format a jsonschema error path into a compact JSONPath-ish string. + + Parameters + ---------- + parts + Iterable of path parts (strings for object keys, ints for array indices), + typically from `jsonschema.ValidationError.path`. + + Returns + ------- + str + A compact, human-readable path (e.g. ``$``, ``author.github``, + ``components[0].title``). + """ + out: list[str] = [] + for p in parts: + if isinstance(p, int): + out.append(f"[{p}]") + else: + if out: + out.append(".") + out.append(str(p)) + return "".join(out) or "$" + + +def _load_json(path: Path) -> Any: + """Load a JSON file from disk. + + Parameters + ---------- + path + Path to a JSON file. + + Returns + ------- + Any + Parsed JSON data. + """ + return load_json(path) + + +def _load_schema(path: Path) -> dict[str, Any]: + """Load and sanity-check a JSON Schema from disk. + + Parameters + ---------- + path + Path to a JSON Schema file. + + Returns + ------- + dict[str, Any] + Parsed schema object. + + Raises + ------ + TypeError + If the schema file does not contain a JSON object. + """ + obj = _load_json(path) + if not isinstance(obj, dict): + raise TypeError(f"Schema must be a JSON object: {path}") + return obj + + +def _missing_required_fields(err: Any) -> list[str] | None: + """Compute missing required field names for a jsonschema "required" error. + + jsonschema "required" errors can be noisy; this extracts the specific fields + missing at the failing location so output stays readable. + + Parameters + ---------- + err + A `jsonschema.ValidationError` instance (typed as `Any` to keep this + script dependency-light). + + Returns + ------- + list[str] | None + List of missing field names if applicable; otherwise ``None``. + """ + if err.validator != "required" or not isinstance(err.validator_value, list): + return None + if not isinstance(err.instance, dict): + return None + # validator_value is the list of required fields for the schema at this path. + required: list[str] = [str(x) for x in err.validator_value] + return [k for k in required if k not in err.instance] + + +def _validate_one(instance_path: Path, schema_path: Path) -> list[ValidationIssue]: + """Validate one JSON instance file against a JSON Schema. 
+ + Parameters + ---------- + instance_path + Path to the JSON file to validate. + schema_path + Path to the JSON Schema file to validate against. + + Returns + ------- + list[ValidationIssue] + A (de-duplicated) list of validation issues for this file. Empty means + the file is valid. + + Raises + ------ + RuntimeError + If the `jsonschema` dependency is not installed. + TypeError + If the schema file is not a JSON object. + json.JSONDecodeError + If either the schema or instance JSON cannot be parsed. + """ + try: + from jsonschema import Draft202012Validator # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependency `jsonschema`.\n\n" + "Install it with:\n" + " pip install jsonschema\n" + "or add it to `component-gallery/requirements.txt`." + ) from e + + schema = _load_schema(schema_path) + instance = _load_json(instance_path) + + validator = Draft202012Validator(schema) + issues: list[ValidationIssue] = [] + + for err in sorted(validator.iter_errors(instance), key=lambda x: list(x.path)): + # jsonschema gives a path deque; make it readable + json_path = _format_json_path(err.path) + message = err.message + + missing = _missing_required_fields(err) + if missing: + message = f"Missing required field(s): {', '.join(missing)}" + + issues.append( + ValidationIssue( + file=instance_path, + schema=schema_path, + message=message, + json_path=json_path, + ) + ) + + # De-dupe identical messages (common when multiple schemas report the same root-level issue) + deduped: list[ValidationIssue] = [] + seen: set[tuple[str, str]] = set() + for issue in issues: + key = (issue.json_path or "$", issue.message) + if key in seen: + continue + seen.add(key) + deduped.append(issue) + return deduped + + +def validate_components(repo_root: Path) -> list[ValidationIssue]: + """Validate all source component submissions under `components/`. + + Parameters + ---------- + repo_root + Path to the component-gallery repo root. + + Returns + ------- + list[ValidationIssue] + Validation issues across all `components/*.json` files. + """ + schema_path = repo_root / "schemas" / "component.schema.json" + components_dir = repo_root / "components" + + issues: list[ValidationIssue] = [] + for json_file in sorted(components_dir.glob("*.json")): + issues.extend(_validate_one(json_file, schema_path)) + return issues + + +def _is_https_url(url: str) -> bool: + parsed = urlparse(url) + return parsed.scheme == "https" and bool(parsed.netloc) + + +def _is_disallowed_url(url: str) -> bool: + """Reject obvious XSS / unsafe schemes even if schema is relaxed.""" + parsed = urlparse(url) + return parsed.scheme in {"javascript", "data", "file"} + + +# --- Image URL hardening ----------------------------------------------------- +# +# We want preview images to remain stable over time. In practice, the most common +# sources of broken images are: +# - Signed / expiring URLs (S3/GCS/CloudFront style query params) +# - Proxy URLs like `camo.githubusercontent.com` (can change/expire and is not the +# canonical image source) +# +# We enforce these constraints only for `media.image` (not general links). 
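+#
+# Illustrative examples (hosts/keys assumed to be covered by the shared policy in
+# `_utils/image_url_policy.py`):
+#   https://camo.githubusercontent.com/abc123                     -> rejected (proxy host)
+#   https://bucket.s3.amazonaws.com/shot.png?X-Amz-Expires=3600   -> rejected (signed/expiring)
+#   https://raw.githubusercontent.com/owner/repo/main/shot.png    -> allowed by these checks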
+ + +def _has_disallowed_image_query_params(url: str) -> bool: + parsed = urlparse(url) + for k, _ in parse_qsl(parsed.query, keep_blank_values=True): + if k.strip().lower() in DISALLOWED_IMAGE_QUERY_KEYS: + return True + return False + + +def _is_disallowed_image_host(url: str) -> bool: + parsed = urlparse(url) + host = (parsed.netloc or "").lower() + return host in DISALLOWED_IMAGE_HOSTS + + +def validate_policies( + repo_root: Path, *, max_component_bytes: int = 50_000 +) -> list[ValidationIssue]: + """Policy/lint checks beyond JSON Schema for `components/*.json`. + + This matches the tech spec's CI expectations: + - Unique component identity (unique GitHub owner/repo across submissions) + - HTTPS-only URLs + - Basic abuse guardrails (file size) + """ + schema_path = repo_root / "schemas" / "component.schema.json" + components_dir = repo_root / "components" + + issues: list[ValidationIssue] = [] + first_by_repo: dict[str, Path] = {} + + for json_file in sorted(components_dir.glob("*.json")): + # File size abuse guardrail + try: + size = json_file.stat().st_size + except OSError as e: # pragma: no cover + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=f"Could not stat file: {e}", + json_path=None, + ) + ) + continue + if size > max_component_bytes: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + f"File too large ({size} bytes). " + f"Max allowed is {max_component_bytes} bytes." + ), + json_path=None, + ) + ) + + # Best-effort JSON load for lint checks (schema validation handled separately) + try: + obj = _load_json(json_file) + except Exception: + continue + if not isinstance(obj, dict): + continue + + links = obj.get("links") + if not isinstance(links, dict): + continue + + gh = links.get("github") + if isinstance(gh, str) and gh: + # Extra HTTPS enforcement (schema already restricts, but keep as policy) + if _is_disallowed_url(gh) or not _is_https_url(gh): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path="links.github", + ) + ) + else: + try: + canonical = normalize_github_repo_url(gh) + key = urlparse(canonical).path.lower().strip("/") + if key in first_by_repo: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + f"Duplicate component identity: links.github repo `{key}` " + f"already submitted in `{first_by_repo[key].name}`." + ), + json_path="links.github", + ) + ) + else: + first_by_repo[key] = json_file + except Exception as e: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=str(e), + json_path="links.github", + ) + ) + + # Enforce HTTPS for other URL fields we accept + for path, val in ( + ("links.demo", links.get("demo")), + ("links.docs", links.get("docs")), + ): + if val is None: + continue + if isinstance(val, str): + if _is_disallowed_url(val) or not _is_https_url(val): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path=path, + ) + ) + + media = obj.get("media") + if isinstance(media, dict): + img = media.get("image") + if img is None: + # Image is optional; null is allowed. 
+ pass + elif isinstance(img, str): + if _is_disallowed_url(img) or not _is_https_url(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path="media.image", + ) + ) + elif _is_disallowed_image_host(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + "Image host is not allowed for `media.image` " + "(brittle proxy). Use a stable upstream URL instead." + ), + json_path="media.image", + ) + ) + elif _has_disallowed_image_query_params(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + "Signed/expiring image URLs are not allowed for `media.image` " + "(disallowed query parameters detected)." + ), + json_path="media.image", + ) + ) + else: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="`media.image` must be a string URL or null.", + json_path="media.image", + ) + ) + + return issues + + +def validate_compiled(repo_root: Path) -> list[ValidationIssue]: + """Validate the compiled catalog artifact `compiled/components.json`. + + Parameters + ---------- + repo_root + Path to the component-gallery repo root. + + Returns + ------- + list[ValidationIssue] + Validation issues for the compiled artifact. If the artifact is missing, + returns a single issue indicating it was skipped. + """ + schema_path = repo_root / "schemas" / "compiled.schema.json" + compiled_path = repo_root / "compiled" / "components.json" + if not compiled_path.is_file(): + return [ + ValidationIssue( + file=compiled_path, + schema=schema_path, + message="Compiled artifact not found (skipping).", + json_path=None, + ) + ] + return _validate_one(compiled_path, schema_path) + + +def main(argv: list[str]) -> int: + """CLI entrypoint. + + Parameters + ---------- + argv + CLI arguments excluding the program name (i.e., ``sys.argv[1:]``). + + Returns + ------- + int + Process exit code: + + - 0: success + - 1: validation failed + - 2: configuration error (missing required files/dirs) + """ + parser = argparse.ArgumentParser( + description="Validate Component Gallery JSON files." + ) + parser.add_argument( + "--compiled", + action="store_true", + help="Also validate compiled/components.json against schemas/compiled.schema.json.", + ) + parser.add_argument( + "--no-policy", + action="store_true", + help="Disable policy/lint checks beyond schema validation.", + ) + parser.add_argument( + "--max-component-bytes", + type=int, + default=50_000, + help="Max allowed size for each components/*.json file (default: 50000).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + + all_issues: list[ValidationIssue] = [] + + # Guardrails for common mistakes + if not (repo_root / "schemas" / "component.schema.json").is_file(): + print("ERROR: Missing schema: schemas/component.schema.json", file=sys.stderr) + return 2 + if not (repo_root / "components").is_dir(): + print("ERROR: Missing directory: components/", file=sys.stderr) + return 2 + + all_issues.extend(validate_components(repo_root)) + if not args.no_policy: + all_issues.extend( + validate_policies(repo_root, max_component_bytes=args.max_component_bytes) + ) + if args.compiled: + all_issues.extend(validate_compiled(repo_root)) + + hard_errors = [i for i in all_issues if "skipping" not in i.message.lower()] + + if hard_errors: + # Group and compress output by file for readability. 
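+        # Illustrative output shape (hypothetical file name):
+        #   - components/example.json (1 error(s))
+        #     schema: schemas/component.schema.json
+        #     - links.github: URL must be https:// and must not use a disallowed scheme.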
+ by_file: dict[Path, list[ValidationIssue]] = defaultdict(list) + for issue in hard_errors: + by_file[issue.file].append(issue) + + total_files = len(by_file) + print( + f"Found {len(hard_errors)} validation error(s) across {total_files} file(s):", + file=sys.stderr, + ) + + for file_path in sorted(by_file.keys()): + issues = by_file[file_path] + # All issues for a given file share the same schema in our usage. + schema_path = issues[0].schema + rel = ( + file_path.relative_to(repo_root) + if file_path.is_absolute() + else file_path + ) + print(f"\n- {rel} ({len(issues)} error(s))", file=sys.stderr) + print(f" schema: {schema_path.relative_to(repo_root)}", file=sys.stderr) + for issue in sorted(issues, key=lambda i: (i.json_path or "$", i.message)): + jp = issue.json_path or "$" + print(f" - {jp}: {issue.message}", file=sys.stderr) + return 1 + + print("OK: all validated files passed.") + # If the only issues are "compiled missing (skipping)", be explicit. + skipped = [i for i in all_issues if "skipping" in i.message.lower()] + for s in skipped: + print(f"NOTE: {s.file} - {s.message}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..9d09f36a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +jsonschema>=4.25.1 +requests>=2.32.0