From 81dc06f89f3fcc69b6d9a8bd76a6355461476da4 Mon Sep 17 00:00:00 2001
From: Bob Nisco
Date: Tue, 13 Jan 2026 14:00:01 -0800
Subject: [PATCH] [feat] Add component directory scripts

---
 .gitattributes                                |   1 +
 directory/ranking_config.json                 |  10 +
 directory/scripts/__init__.py                 |   1 +
 directory/scripts/_enrichers/__init__.py      |  13 +
 directory/scripts/_enrichers/github.py        | 216 +++++++
 directory/scripts/_enrichers/pypi.py          | 143 +++++
 directory/scripts/_enrichers/pypistats.py     | 125 ++++
 directory/scripts/_utils/__init__.py          |   4 +
 directory/scripts/_utils/enrich.py            |  34 ++
 directory/scripts/_utils/enrichment_engine.py | 248 ++++++++
 directory/scripts/_utils/github.py            |  28 +
 directory/scripts/_utils/http.py              | 146 +++++
 directory/scripts/_utils/image_url_policy.py  |  31 +
 directory/scripts/_utils/io.py                |  18 +
 directory/scripts/_utils/metrics.py           |  22 +
 directory/scripts/_utils/pypi_helpers.py      |  35 ++
 directory/scripts/_utils/time.py              |  34 ++
 directory/scripts/build_catalog.py            | 455 ++++++++++++++
 directory/scripts/compute_ranking.py          | 284 +++++++++
 directory/scripts/enrich.py                   | 244 ++++++++
 directory/scripts/enrich_images.py            | 501 ++++++++++++++++
 directory/scripts/run_pipeline.py             | 273 +++++++++
 directory/scripts/validate.py                 | 554 ++++++++++++++++++
 requirements.txt                              |   2 +
 24 files changed, 3422 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 directory/ranking_config.json
 create mode 100644 directory/scripts/__init__.py
 create mode 100644 directory/scripts/_enrichers/__init__.py
 create mode 100644 directory/scripts/_enrichers/github.py
 create mode 100644 directory/scripts/_enrichers/pypi.py
 create mode 100644 directory/scripts/_enrichers/pypistats.py
 create mode 100644 directory/scripts/_utils/__init__.py
 create mode 100644 directory/scripts/_utils/enrich.py
 create mode 100644 directory/scripts/_utils/enrichment_engine.py
 create mode 100644 directory/scripts/_utils/github.py
 create mode 100644 directory/scripts/_utils/http.py
 create mode 100644 directory/scripts/_utils/image_url_policy.py
 create mode 100644 directory/scripts/_utils/io.py
 create mode 100644 directory/scripts/_utils/metrics.py
 create mode 100644 directory/scripts/_utils/pypi_helpers.py
 create mode 100644 directory/scripts/_utils/time.py
 create mode 100644 directory/scripts/build_catalog.py
 create mode 100644 directory/scripts/compute_ranking.py
 create mode 100644 directory/scripts/enrich.py
 create mode 100644 directory/scripts/enrich_images.py
 create mode 100644 directory/scripts/run_pipeline.py
 create mode 100644 directory/scripts/validate.py
 create mode 100644 requirements.txt

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..a0fea2f6
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+directory/compiled/** linguist-generated=true
diff --git a/directory/ranking_config.json b/directory/ranking_config.json
new file mode 100644
index 00000000..b67b0c49
--- /dev/null
+++ b/directory/ranking_config.json
@@ -0,0 +1,10 @@
+{
+  "schemaVersion": 1,
+  "halfLifeDays": 90.0,
+  "weights": {
+    "stars": 1.0,
+    "recency": 2.0,
+    "contributors": 0.5,
+    "downloads": 0.35
+  }
+}
diff --git a/directory/scripts/__init__.py b/directory/scripts/__init__.py
new file mode 100644
index 00000000..9d48db4f
--- /dev/null
+++ b/directory/scripts/__init__.py
@@ -0,0 +1 @@
+from __future__ import annotations
diff --git a/directory/scripts/_enrichers/__init__.py b/directory/scripts/_enrichers/__init__.py
new file mode 100644
index 00000000..15e02349
--- /dev/null
+++ b/directory/scripts/_enrichers/__init__.py
@@ -0,0 +1,13
@@ +from __future__ import annotations + +from .github import GitHubEnricher # type: ignore[import-not-found] +from .pypi import PyPiEnricher # type: ignore[import-not-found] +from .pypistats import PyPiStatsEnricher # type: ignore[import-not-found] + + +def get_default_enrichers(*, github_token_env: str = "GH_TOKEN") -> list: + return [ + GitHubEnricher(token_env=github_token_env), + PyPiEnricher(), + PyPiStatsEnricher(), + ] diff --git a/directory/scripts/_enrichers/github.py b/directory/scripts/_enrichers/github.py new file mode 100644 index 00000000..a56f7462 --- /dev/null +++ b/directory/scripts/_enrichers/github.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +import os +import re +from dataclasses import dataclass +from typing import Any +from urllib.parse import parse_qs, urlparse + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.github import parse_owner_repo +from _utils.http import RetryConfig, fetch_json +from _utils.time import utc_now_iso + +GITHUB_API_BASE = "https://api.github.com" + + +@dataclass(frozen=True) +class GitHubResult: + owner: str + repo: str + stars: int | None + forks: int | None + contributors_count: int | None + open_issues: int | None + pushed_at: str | None + + +def _github_repo_api_url(owner: str, repo: str) -> str: + return f"{GITHUB_API_BASE}/repos/{owner}/{repo}" + + +def _github_contributors_api_url(owner: str, repo: str) -> str: + return f"{GITHUB_API_BASE}/repos/{owner}/{repo}/contributors?per_page=1" + + +_LINK_LAST_RE = re.compile(r'<([^>]+)>;\s*rel="last"') + + +def _parse_last_page_from_link_header(link: str | None) -> int | None: + if not isinstance(link, str) or not link.strip(): + return None + m = _LINK_LAST_RE.search(link) + if not m: + return None + try: + last_url = m.group(1) + parsed = urlparse(last_url) + qs = parse_qs(parsed.query) + page_vals = qs.get("page") + if not page_vals: + return None + page = int(page_vals[0]) + return page if page >= 0 else None + except Exception: + return None + + +def _get_token(token_env: str) -> str | None: + token = os.environ.get(token_env) + if token: + return token.strip() or None + for k in ("GH_TOKEN", "GH_API_TOKEN", "GITHUB_TOKEN"): + token = os.environ.get(k) + if token: + return token.strip() or None + return None + + +class GitHubEnricher: + name = "github" + bucket = "github" + + def __init__(self, *, token_env: str = "GH_TOKEN") -> None: + self._token_env = token_env + self._token = _get_token(token_env) + self._retry_cfg = RetryConfig(retry_statuses=(403, 429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> tuple[str, str] | None: + gh_url = comp.get("gitHubUrl") + if not isinstance(gh_url, str) or not gh_url.strip(): + return None + try: + owner, repo = parse_owner_repo(gh_url) + except Exception: + return None + return (owner.lower(), repo.lower()) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") + gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None + existing_fetched_at = ( + gh_metrics.get("fetchedAt") if isinstance(gh_metrics, dict) else None + ) + stale = gh_metrics.get("isStale") if isinstance(gh_metrics, dict) else None + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def _headers(self) -> 
dict[str, str]: + headers = { + "Accept": "application/vnd.github+json", + "User-Agent": "component-gallery-enrich-github", + "X-GitHub-Api-Version": "2022-11-28", + } + if self._token: + headers["Authorization"] = f"Bearer {self._token}" + return headers + + def _fetch_contributors_count( + self, *, ctx, owner: str, repo: str + ) -> tuple[int | None, int, int | None, str | None]: + url = _github_contributors_api_url(owner, repo) + r = ctx.request_json( + url=url, + headers=self._headers(), + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, list): + return None, r.attempts, r.status, r.error + link = None + if isinstance(r.headers, dict): + link = r.headers.get("Link") or r.headers.get("link") + last_page = _parse_last_page_from_link_header(link) + if isinstance(last_page, int): + return last_page, r.attempts, r.status, None + return (1 if len(r.data) >= 1 else 0), r.attempts, r.status, None + + def fetch(self, key: tuple[str, str], ctx) -> FetchResult: + owner, repo = key + url = _github_repo_api_url(owner, repo) + r = ctx.request_json( + url=url, + headers=self._headers(), + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + attempts = int(r.attempts) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=attempts, + status=r.status, + ) + + data = r.data + stars = data.get("stargazers_count") + forks = data.get("forks_count") + open_issues = data.get("open_issues_count") + pushed_at = data.get("pushed_at") + + contributors_count, contrib_attempts, status, err = ( + self._fetch_contributors_count(ctx=ctx, owner=owner, repo=repo) + ) + attempts += int(contrib_attempts) + if err: + return FetchResult( + ok=False, + data=None, + error=err, + attempts=attempts, + status=status, + ) + + result = GitHubResult( + owner=owner, + repo=repo, + stars=int(stars) if isinstance(stars, int) else None, + forks=int(forks) if isinstance(forks, int) else None, + contributors_count=( + int(contributors_count) + if isinstance(contributors_count, int) and contributors_count >= 0 + else None + ), + open_issues=int(open_issues) if isinstance(open_issues, int) else None, + pushed_at=str(pushed_at) if isinstance(pushed_at, str) else None, + ) + return FetchResult( + ok=True, data=result, error=None, attempts=attempts, status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: GitHubResult, fetched_at: str + ) -> Patch: + metrics = comp.get("metrics") + gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None + prev_stars = gh_metrics.get("stars") if isinstance(gh_metrics, dict) else None + + updates: dict[str, Any] = {} + if isinstance(result.stars, int): + updates["stars"] = result.stars + if isinstance(result.forks, int): + updates["forks"] = result.forks + if isinstance(result.contributors_count, int): + updates["contributorsCount"] = result.contributors_count + if isinstance(result.open_issues, int): + updates["openIssues"] = result.open_issues + if isinstance(result.pushed_at, str): + updates["lastPushAt"] = result.pushed_at + updates["fetchedAt"] = fetched_at or utc_now_iso() + updates["isStale"] = False + + changed = isinstance(result.stars, int) and prev_stars != result.stars + return Patch(bucket=self.bucket, updates=updates, changed=changed) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git 
a/directory/scripts/_enrichers/pypi.py b/directory/scripts/_enrichers/pypi.py new file mode 100644 index 00000000..1f2e005c --- /dev/null +++ b/directory/scripts/_enrichers/pypi.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.http import RetryConfig, fetch_json +from _utils.pypi_helpers import infer_pypi_project_from_piplink +from _utils.time import utc_now_iso + +PYPI_BASE = "https://pypi.org/pypi" + + +@dataclass(frozen=True) +class PyPiResult: + project: str + latest_version: str | None + latest_release_at: str | None + + +def _get_project_for_component(comp: dict[str, Any]) -> str | None: + p = comp.get("pypi") + if isinstance(p, str) and p.strip(): + return p.strip() + return infer_pypi_project_from_piplink(comp.get("pipLink")) + + +def _pypi_api_url(project: str) -> str: + return f"{PYPI_BASE}/{project}/json" + + +def _max_upload_time_iso(release_files: Any) -> str | None: + if not isinstance(release_files, list): + return None + times: list[str] = [] + for f in release_files: + if not isinstance(f, dict): + continue + t = f.get("upload_time_iso_8601") or f.get("upload_time") + if isinstance(t, str) and t: + times.append(t) + return max(times) if times else None + + +class PyPiEnricher: + name = "pypi" + bucket = "pypi" + + def __init__(self) -> None: + self._retry_cfg = RetryConfig(retry_statuses=(429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> str | None: + return _get_project_for_component(comp) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") + pypi_metrics = metrics.get("pypi") if isinstance(metrics, dict) else None + existing_fetched_at = ( + pypi_metrics.get("fetchedAt") if isinstance(pypi_metrics, dict) else None + ) + stale = pypi_metrics.get("isStale") if isinstance(pypi_metrics, dict) else None + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def fetch(self, key: str, ctx) -> FetchResult: + url = _pypi_api_url(key) + headers = { + "Accept": "application/json", + "User-Agent": "component-gallery-enrich-pypi", + } + r = ctx.request_json( + url=url, + headers=headers, + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=int(r.attempts), + status=r.status, + ) + data = r.data + info = data.get("info") + releases = data.get("releases") + if not isinstance(info, dict) or not isinstance(releases, dict): + return FetchResult( + ok=False, + data=None, + error="Missing info/releases.", + attempts=int(r.attempts), + status=r.status, + ) + latest_version = info.get("version") + latest_version = ( + str(latest_version) + if isinstance(latest_version, str) and latest_version + else None + ) + + latest_release_at: str | None = None + if latest_version and latest_version in releases: + latest_release_at = _max_upload_time_iso(releases.get(latest_version)) + if latest_release_at is None: + best: str | None = None + for _, files in releases.items(): + t = _max_upload_time_iso(files) + if t and (best is None or t > best): + best = t + latest_release_at = best + + result 
= PyPiResult( + project=key, + latest_version=latest_version, + latest_release_at=latest_release_at, + ) + return FetchResult( + ok=True, data=result, error=None, attempts=int(r.attempts), status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: PyPiResult, fetched_at: str + ) -> Patch: + updates = { + "latestVersion": result.latest_version, + "latestReleaseAt": result.latest_release_at, + "fetchedAt": fetched_at or utc_now_iso(), + "isStale": False, + } + return Patch(bucket=self.bucket, updates=updates, changed=True) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git a/directory/scripts/_enrichers/pypistats.py b/directory/scripts/_enrichers/pypistats.py new file mode 100644 index 00000000..35f526ae --- /dev/null +++ b/directory/scripts/_enrichers/pypistats.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from _utils.enrich import should_refetch +from _utils.enrichment_engine import FetchResult, Patch +from _utils.http import RetryConfig, fetch_json +from _utils.pypi_helpers import infer_pypi_project_from_piplink +from _utils.time import utc_now_iso + +PYPISTATS_BASE = "https://pypistats.org/api/packages" + + +@dataclass(frozen=True) +class PyPiStatsResult: + project: str + last_day: int | None + last_week: int | None + last_month: int | None + + +def _get_project_for_component(comp: dict[str, Any]) -> str | None: + p = comp.get("pypi") + if isinstance(p, str) and p.strip(): + return p.strip() + return infer_pypi_project_from_piplink(comp.get("pipLink")) + + +def _pypistats_recent_url(project: str) -> str: + return f"{PYPISTATS_BASE}/{project}/recent" + + +class PyPiStatsEnricher: + name = "pypistats" + bucket = "pypistats" + + def __init__(self) -> None: + self._retry_cfg = RetryConfig(retry_statuses=(429, 500, 502, 503, 504)) + + def key_for_component(self, comp: dict[str, Any]) -> str | None: + return _get_project_for_component(comp) + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: + metrics = comp.get("metrics") if isinstance(comp.get("metrics"), dict) else None + pypistats_metrics = ( + metrics.get("pypistats") if isinstance(metrics, dict) else None + ) + existing_fetched_at = ( + pypistats_metrics.get("fetchedAt") + if isinstance(pypistats_metrics, dict) + else None + ) + stale = ( + pypistats_metrics.get("isStale") + if isinstance(pypistats_metrics, dict) + else None + ) + return should_refetch( + fetched_at=( + existing_fetched_at if isinstance(existing_fetched_at, str) else None + ), + is_stale=stale if isinstance(stale, bool) else None, + refresh_older_than_hours=refresh_older_than_hours, + ) + + def fetch(self, key: str, ctx) -> FetchResult: + url = _pypistats_recent_url(key) + headers = { + "Accept": "application/json", + "User-Agent": "component-gallery-enrich-pypistats", + } + r = ctx.request_json( + url=url, + headers=headers, + fetcher=fetch_json, + retry_cfg=self._retry_cfg, + ) + if not r.ok or not isinstance(r.data, dict): + return FetchResult( + ok=False, + data=None, + error=r.error or "Request failed.", + attempts=int(r.attempts), + status=r.status, + ) + data = r.data.get("data") if isinstance(r.data, dict) else None + if not isinstance(data, dict): + return FetchResult( + ok=False, + data=None, + error="Missing data payload.", + attempts=int(r.attempts), + status=r.status, + ) + + def _as_int(x: Any) -> 
int | None: + return int(x) if isinstance(x, int) and x >= 0 else None + + result = PyPiStatsResult( + project=key, + last_day=_as_int(data.get("last_day")), + last_week=_as_int(data.get("last_week")), + last_month=_as_int(data.get("last_month")), + ) + return FetchResult( + ok=True, data=result, error=None, attempts=int(r.attempts), status=r.status + ) + + def patch_success( + self, comp: dict[str, Any], result: PyPiStatsResult, fetched_at: str + ) -> Patch: + updates = { + "lastDay": result.last_day, + "lastWeek": result.last_week, + "lastMonth": result.last_month, + "fetchedAt": fetched_at or utc_now_iso(), + "isStale": False, + } + return Patch(bucket=self.bucket, updates=updates, changed=True) + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch: + return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False) diff --git a/directory/scripts/_utils/__init__.py b/directory/scripts/_utils/__init__.py new file mode 100644 index 00000000..36a57ceb --- /dev/null +++ b/directory/scripts/_utils/__init__.py @@ -0,0 +1,4 @@ +"""Shared utilities for the component-gallery scripts. + +These helpers intentionally avoid third-party dependencies. +""" diff --git a/directory/scripts/_utils/enrich.py b/directory/scripts/_utils/enrich.py new file mode 100644 index 00000000..e1f911ef --- /dev/null +++ b/directory/scripts/_utils/enrich.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from datetime import datetime, timezone + +from .time import parse_iso8601 + + +def should_refetch( + *, + fetched_at: str | None, + is_stale: bool | None, + refresh_older_than_hours: float | None, +) -> bool: + """Return True if we should refetch a metric bucket. + + Rules: + - If refresh_older_than_hours is None or <= 0: always refetch + - If we have no parseable fetched_at: refetch + - If is_stale is True: refetch + - Otherwise, refetch only when fetched_at is older than refresh_older_than_hours + """ + if refresh_older_than_hours is None or refresh_older_than_hours <= 0: + return True + if is_stale is True: + return True + + dt = parse_iso8601(fetched_at) + if not dt: + return True + + age_h = (datetime.now(timezone.utc) - dt).total_seconds() / 3600.0 + return age_h >= refresh_older_than_hours + + diff --git a/directory/scripts/_utils/enrichment_engine.py b/directory/scripts/_utils/enrichment_engine.py new file mode 100644 index 00000000..5508f6d5 --- /dev/null +++ b/directory/scripts/_utils/enrichment_engine.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +import threading +import time +from concurrent.futures import Future, ThreadPoolExecutor +from dataclasses import dataclass +from typing import Any, Callable, Iterable, Protocol, cast + +import requests + +from .metrics import Bucket, ensure_bucket + + +class Enricher(Protocol): + name: str + bucket: str + + def key_for_component(self, comp: dict[str, Any]) -> Any | None: ... + + def needs_fetch( + self, comp: dict[str, Any], refresh_older_than_hours: float | None + ) -> bool: ... + + def fetch(self, key: Any, ctx: "EnricherContext") -> "FetchResult": ... + + def patch_success( + self, comp: dict[str, Any], result: Any, fetched_at: str + ) -> "Patch": ... + + def patch_failure(self, comp: dict[str, Any], error: str | None) -> "Patch": ... 
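# Illustrative sketch only: a minimal, hypothetical enricher that satisfies the
# Enricher protocol above, using the FetchResult and Patch dataclasses defined
# just below. The "noop" service name and the "exampleCount" metric key are
# invented for illustration; the real implementations are GitHubEnricher,
# PyPiEnricher, and PyPiStatsEnricher under _enrichers/.
class NoopEnricher:
    name = "noop"
    bucket = "pypi"  # the engine writes Patch.updates into comp["metrics"][bucket]

    def key_for_component(self, comp):
        # Deduplication key for in-flight requests; None means "skip, no key".
        return comp.get("gitHubUrl")

    def needs_fetch(self, comp, refresh_older_than_hours):
        # Always refetch in this sketch; real enrichers call should_refetch().
        return True

    def fetch(self, key, ctx):
        # A real enricher would call ctx.request_json(...) here.
        return FetchResult(ok=True, data={"exampleCount": 1}, error=None, attempts=1, status=200)

    def patch_success(self, comp, result, fetched_at):
        return Patch(bucket=self.bucket, updates={**result, "fetchedAt": fetched_at, "isStale": False})

    def patch_failure(self, comp, error):
        return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False)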
+ + +@dataclass(frozen=True) +class FetchResult: + ok: bool + data: Any | None + error: str | None + attempts: int + status: int | None = None + + +@dataclass(frozen=True) +class Patch: + bucket: str + updates: dict[str, Any] + changed: bool = True + + +@dataclass(frozen=True) +class Failure: + key: str + status: int | None + error: str | None + + +@dataclass +class ServiceStats: + processed: int = 0 + requests: int = 0 + ok: int = 0 + failed: int = 0 + updated: int = 0 + skipped_fresh: int = 0 + skipped_no_key: int = 0 + cache_hits: int = 0 + + +class ServiceLimiter: + def __init__(self, min_interval_s: float) -> None: + self._min_interval_s = max(0.0, float(min_interval_s)) + self._lock = threading.Lock() + self._next_allowed = 0.0 + + def acquire(self) -> None: + if self._min_interval_s <= 0: + return + wait_s = 0.0 + with self._lock: + now = time.monotonic() + if now < self._next_allowed: + wait_s = self._next_allowed - now + self._next_allowed = max(now, self._next_allowed) + self._min_interval_s + if wait_s > 0: + time.sleep(wait_s) + + +class ThreadLocalSession: + def __init__(self) -> None: + self._tls = threading.local() + + def get(self) -> requests.Session: + sess = getattr(self._tls, "session", None) + if sess is None: + sess = requests.Session() + self._tls.session = sess + return sess + + +@dataclass +class EnricherContext: + name: str + limiter: ServiceLimiter + session_getter: Callable[[], requests.Session] + timeout_s: float + + def request_json( + self, + *, + url: str, + headers: dict[str, str] | None, + fetcher: Callable[..., Any], + retry_cfg: Any, + ) -> Any: + self.limiter.acquire() + session = self.session_getter() + return fetcher( + session=session, + url=url, + headers=headers, + timeout_s=self.timeout_s, + retry=retry_cfg, + ) + + +@dataclass +class EngineRunResult: + stats: dict[str, ServiceStats] + failures: dict[str, list[Failure]] + + +def run_enrichment_engine( + *, + components: list[dict[str, Any]], + enrichers: Iterable[Enricher], + refresh_older_than_hours: float | None, + timeout_s: float, + sleep_by_service: dict[str, float], + workers: int, + run_fetched_at: str, + progress_every: int | None = None, +) -> EngineRunResult: + enricher_list = list(enrichers) + stats: dict[str, ServiceStats] = {e.name: ServiceStats() for e in enricher_list} + failures: dict[str, list[Failure]] = {e.name: [] for e in enricher_list} + + inflight: dict[tuple[str, Any], tuple[Enricher, Any, Future[FetchResult]]] = {} + future_meta: dict[Future[FetchResult], tuple[str, Any]] = {} + comp_tasks: list[list[tuple[Enricher, Future[FetchResult]]]] = [ + [] for _ in range(len(components)) + ] + + limiter_by_service = { + e.name: ServiceLimiter(sleep_by_service.get(e.name, 0.0)) for e in enricher_list + } + session_by_service = {e.name: ThreadLocalSession() for e in enricher_list} + + def submit_fetch(enricher: Enricher, key: Any) -> Future[FetchResult]: + limiter = limiter_by_service[enricher.name] + session_factory = session_by_service[enricher.name].get + + def _run() -> FetchResult: + ctx = EnricherContext( + name=enricher.name, + limiter=limiter, + session_getter=session_factory, + timeout_s=timeout_s, + ) + return enricher.fetch(key, ctx) + + return executor.submit(_run) + + with ThreadPoolExecutor(max_workers=max(1, int(workers))) as executor: + for idx, comp in enumerate(components): + if not isinstance(comp, dict): + continue + for enricher in enricher_list: + stats[enricher.name].processed += 1 + if not enricher.needs_fetch(comp, refresh_older_than_hours): + 
stats[enricher.name].skipped_fresh += 1 + continue + key = enricher.key_for_component(comp) + if key is None: + stats[enricher.name].skipped_no_key += 1 + continue + inflight_key = (enricher.name, key) + if inflight_key in inflight: + stats[enricher.name].cache_hits += 1 + fut = inflight[inflight_key][2] + else: + fut = submit_fetch(enricher, key) + inflight[inflight_key] = (enricher, key, fut) + future_meta[fut] = (enricher.name, key) + comp_tasks[idx].append((enricher, fut)) + + # Apply patches deterministically by component index, resolving futures lazily + # so progress can be reported as components complete. + result_cache: dict[Future[FetchResult], FetchResult] = {} + counted: set[Future[FetchResult]] = set() + for idx, comp in enumerate(components): + if not isinstance(comp, dict): + continue + for enricher, fut in comp_tasks[idx]: + if fut in result_cache: + res = result_cache[fut] + else: + res = fut.result() + result_cache[fut] = res + if fut not in counted: + counted.add(fut) + meta = future_meta.get(fut) + if meta is not None: + service_name, key = meta + else: + service_name, key = enricher.name, "?" + stats[service_name].requests += int(res.attempts) + if res.ok: + stats[service_name].ok += 1 + else: + stats[service_name].failed += 1 + failures[service_name].append( + Failure(key=str(key), status=res.status, error=res.error) + ) + if res.ok: + patch = enricher.patch_success(comp, res.data, run_fetched_at) + else: + patch = enricher.patch_failure(comp, res.error) + bucket = ensure_bucket(comp, cast(Bucket, patch.bucket)) + for k, v in patch.updates.items(): + bucket[k] = v + if patch.changed: + stats[enricher.name].updated += 1 + + if ( + progress_every + and progress_every > 0 + and (idx + 1) % progress_every == 0 + ): + for enricher in enricher_list: + s = stats[enricher.name] + print( + f"[{enricher.name}] requests={s.requests} " + f"ok={s.ok} fail={s.failed} " + f"updated={s.updated} skipped_fresh={s.skipped_fresh} " + f"cache_hits={s.cache_hits} skipped_no_key={s.skipped_no_key}", + flush=True, + ) + + return EngineRunResult(stats=stats, failures=failures) diff --git a/directory/scripts/_utils/github.py b/directory/scripts/_utils/github.py new file mode 100644 index 00000000..30cd870d --- /dev/null +++ b/directory/scripts/_utils/github.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from urllib.parse import urlparse + + +def normalize_github_repo_url(url: str) -> str: + """Canonicalize `https://github.com//` (no trailing slash).""" + parsed = urlparse(url) + if parsed.scheme != "https" or parsed.netloc != "github.com": + raise ValueError(f"Not a GitHub HTTPS URL: {url}") + parts = [p for p in parsed.path.split("/") if p] + if len(parts) < 2: + raise ValueError(f"Not a GitHub repo URL: {url}") + owner, repo = parts[0], parts[1] + return f"https://github.com/{owner}/{repo}" + + +def parse_owner_repo(github_url: str) -> tuple[str, str]: + canonical = normalize_github_repo_url(github_url) + parsed = urlparse(canonical) + parts = [p for p in parsed.path.split("/") if p] + return parts[0], parts[1] + + +def repo_key(github_url: str) -> str: + """Stable identifier `owner/repo` (lowercased).""" + owner, repo = parse_owner_repo(github_url) + return f"{owner.lower()}/{repo.lower()}" diff --git a/directory/scripts/_utils/http.py b/directory/scripts/_utils/http.py new file mode 100644 index 00000000..79778d6c --- /dev/null +++ b/directory/scripts/_utils/http.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import random +import time +from dataclasses import 
dataclass +from typing import Any + +import requests + + +def _maybe_int(s: str | None) -> int | None: + if not isinstance(s, str) or not s.strip(): + return None + try: + return int(s.strip()) + except Exception: + return None + + +def _retry_after_seconds(headers: dict[str, str] | None) -> int | None: + if not headers: + return None + return _maybe_int(headers.get("Retry-After")) + + +@dataclass(frozen=True) +class RetryConfig: + max_attempts: int = 6 + backoff_base_s: float = 0.5 + backoff_cap_s: float = 60.0 + retry_statuses: tuple[int, ...] = (403, 429, 500, 502, 503, 504) + + +@dataclass(frozen=True) +class FetchJsonResult: + ok: bool + status: int | None + data: Any | None + headers: dict[str, str] | None + error: str | None + attempts: int + last_retry_after_s: int | None = None + + +def fetch_json( + *, + session: requests.Session, + url: str, + headers: dict[str, str] | None, + timeout_s: float, + retry: RetryConfig, +) -> FetchJsonResult: + """GET a URL and parse JSON with retry/backoff. + + Retries on retry_statuses and on request-level exceptions. + Honors integer Retry-After when present. + """ + last_retry_after: int | None = None + + for attempt in range(1, max(1, retry.max_attempts) + 1): + try: + resp = session.get(url, headers=headers, timeout=timeout_s) + except requests.RequestException as e: + # Retryable network error. + if attempt >= retry.max_attempts: + return FetchJsonResult( + ok=False, + status=None, + data=None, + headers=None, + error=f"{type(e).__name__}: {e}", + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + wait_s = min(retry.backoff_cap_s, (2**attempt) * retry.backoff_base_s) + wait_s = wait_s + random.random() * 0.25 + time.sleep(wait_s) + continue + + status = int(resp.status_code) + # Success path. + if 200 <= status < 300: + try: + return FetchJsonResult( + ok=True, + status=status, + data=resp.json(), + headers=dict(resp.headers) if resp.headers else None, + error=None, + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + except Exception as e: + return FetchJsonResult( + ok=False, + status=status, + data=None, + headers=dict(resp.headers) if resp.headers else None, + error=f"Invalid JSON payload: {e}", + attempts=attempt, + last_retry_after_s=last_retry_after, + ) + + # Non-success: decide whether to retry. + body = None + try: + body = resp.text + except Exception: + body = None + + if status in retry.retry_statuses and attempt < retry.max_attempts: + ra = _retry_after_seconds(dict(resp.headers) if resp.headers else None) + if isinstance(ra, int): + last_retry_after = ra + wait_s = min(retry.backoff_cap_s, (2**attempt) * retry.backoff_base_s) + if isinstance(ra, int): + wait_s = max(wait_s, float(ra)) + wait_s = wait_s + random.random() * 0.25 + time.sleep(wait_s) + continue + + # Final failure. + msg = f"HTTP {status}" + if body: + msg = f"{msg}: {body[:5000]}" + ra = _retry_after_seconds(dict(resp.headers) if resp.headers else None) + return FetchJsonResult( + ok=False, + status=status, + data=None, + headers=dict(resp.headers) if resp.headers else None, + error=msg + (f" (Retry-After={ra}s)" if isinstance(ra, int) else ""), + attempts=attempt, + last_retry_after_s=ra if isinstance(ra, int) else last_retry_after, + ) + + # Unreachable. 
+    return FetchJsonResult(
+        ok=False,
+        status=None,
+        data=None,
+        headers=None,
+        error="Unknown error.",
+        attempts=retry.max_attempts,
+        last_retry_after_s=last_retry_after,
+    )
diff --git a/directory/scripts/_utils/image_url_policy.py b/directory/scripts/_utils/image_url_policy.py
new file mode 100644
index 00000000..c4c13fcd
--- /dev/null
+++ b/directory/scripts/_utils/image_url_policy.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+# Shared policy constants used by multiple scripts (`validate.py`, `enrich_images.py`).
+
+DISALLOWED_IMAGE_HOSTS = {
+    # GitHub's image proxy URLs are often brittle and not the canonical image source.
+    "camo.githubusercontent.com",
+}
+
+# Keys are compared case-insensitively.
+DISALLOWED_IMAGE_QUERY_KEYS = {
+    # AWS SigV4
+    "x-amz-algorithm",
+    "x-amz-credential",
+    "x-amz-date",
+    "x-amz-expires",
+    "x-amz-signature",
+    "x-amz-signedheaders",
+    # GCS signed URLs
+    "x-goog-algorithm",
+    "x-goog-credential",
+    "x-goog-date",
+    "x-goog-expires",
+    "x-goog-signature",
+    # CloudFront (common)
+    "expires",
+    "signature",
+    "key-pair-id",
+    "policy",
+}
+
diff --git a/directory/scripts/_utils/io.py b/directory/scripts/_utils/io.py
new file mode 100644
index 00000000..899230d6
--- /dev/null
+++ b/directory/scripts/_utils/io.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_json(path: Path) -> Any:
+    with path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def dump_json(path: Path, obj: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        # Use 2-space indentation for human-friendly diffs in GitHub PRs.
+        json.dump(obj, f, indent=2, ensure_ascii=False, sort_keys=True)
+        f.write("\n")
diff --git a/directory/scripts/_utils/metrics.py b/directory/scripts/_utils/metrics.py
new file mode 100644
index 00000000..fb40daa8
--- /dev/null
+++ b/directory/scripts/_utils/metrics.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+from typing import Any, Literal
+
+Bucket = Literal["github", "pypi", "pypistats"]
+
+
+def ensure_metrics(comp: dict[str, Any]) -> dict[str, Any]:
+    metrics = comp.get("metrics")
+    if not isinstance(metrics, dict):
+        metrics = {}
+        comp["metrics"] = metrics
+    return metrics
+
+
+def ensure_bucket(comp: dict[str, Any], bucket: Bucket) -> dict[str, Any]:
+    metrics = ensure_metrics(comp)
+    b = metrics.get(bucket)
+    if not isinstance(b, dict):
+        b = {}
+        metrics[bucket] = b
+    return b
diff --git a/directory/scripts/_utils/pypi_helpers.py b/directory/scripts/_utils/pypi_helpers.py
new file mode 100644
index 00000000..d4d6e014
--- /dev/null
+++ b/directory/scripts/_utils/pypi_helpers.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import re
+
+_PIP_INSTALL_RE = re.compile(
+    r"""(?ix)
+    ^\s*
+    pip(?:3)?          # pip / pip3
+    \s+install
+    \s+
+    (?P<spec>\S+)      # first argument to pip install
+    """
+)
+
+
+def infer_pypi_project_from_piplink(pip_link: str | None) -> str | None:
+    """Infer the PyPI project name from a compiled catalog `pipLink` string.
+
+    This is intentionally conservative: it refuses URL/git-based installs and
+    strips common version specifiers and extras.
+ """ + if not isinstance(pip_link, str) or not pip_link.strip(): + return None + m = _PIP_INSTALL_RE.match(pip_link) + if not m: + return None + spec = m.group("spec").strip().strip('"').strip("'") + if not spec: + return None + if "://" in spec or spec.startswith("git+"): + return None + base = spec.split("==", 1)[0].split(">=", 1)[0].split("<=", 1)[0].split("~=", 1)[0] + base = base.split("[", 1)[0] + base = base.strip() + return base or None diff --git a/directory/scripts/_utils/time.py b/directory/scripts/_utils/time.py new file mode 100644 index 00000000..9f385aeb --- /dev/null +++ b/directory/scripts/_utils/time.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from datetime import datetime, timezone + + +def utc_now_iso() -> str: + """UTC now in ISO8601 with Z suffix (e.g. 2025-12-19T00:00:00Z).""" + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + + +def parse_iso8601(dt: str | None) -> datetime | None: + """Parse a subset of ISO8601/RFC3339 strings used by GitHub/PyPI. + + Accepts timestamps like: + - 2025-11-30T12:33:58Z + - 2025-11-23T22:30:23.036058Z + - 2025-11-23T22:30:23+00:00 + + Returns timezone-aware UTC datetimes when possible. + """ + if not isinstance(dt, str) or not dt.strip(): + return None + s = dt.strip() + # `datetime.fromisoformat` doesn't accept "Z" suffix; normalize it. + if s.endswith("Z"): + s = s[:-1] + "+00:00" + try: + parsed = datetime.fromisoformat(s) + except ValueError: + return None + if parsed.tzinfo is None: + # Assume UTC if tzinfo is missing (shouldn't happen with our sources) + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) diff --git a/directory/scripts/build_catalog.py b/directory/scripts/build_catalog.py new file mode 100644 index 00000000..4ba8a7ac --- /dev/null +++ b/directory/scripts/build_catalog.py @@ -0,0 +1,455 @@ +""" +Build the compiled Component Gallery catalog artifact. + +This script compiles per-component submissions in `components/*.json` into a +single legacy-compatible artifact at `compiled/components.json` that the +Streamlit gallery app reads from local disk. + +It also supports carrying forward "last-known-good" computed fields (e.g. stars) +from a previous compiled artifact to avoid regressing metrics when enrichment is +not yet implemented. 
+ +Run from the repo root (recommended): + + python directory/scripts/build_catalog.py + +Common variants: + + # Write somewhere else + python directory/scripts/build_catalog.py --out dist/components.json + + # Skip invalid component JSON files (prints errors and continues) + python directory/scripts/build_catalog.py --skip-invalid + + # Explicitly choose the prior artifact used for carry-forward + python directory/scripts/build_catalog.py --previous compiled/components.json +""" + +from __future__ import annotations + +import argparse +import json +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + +from _utils.github import normalize_github_repo_url, repo_key +from _utils.io import dump_json, load_json +from _utils.time import utc_now_iso + + +@dataclass(frozen=True) +class ComponentBuildError: + file: Path + message: str + json_path: str | None = None + + +def _load_json(path: Path) -> Any: + return load_json(path) + + +def _load_schema(repo_root: Path) -> dict[str, Any]: + schema_path = repo_root / "schemas" / "component.schema.json" + obj = load_json(schema_path) + if not isinstance(obj, dict): + raise TypeError(f"Schema must be a JSON object: {schema_path}") + return obj + + +def _taxonomy_categories(repo_root: Path) -> list[str]: + """Return the fixed taxonomy categories, prefixed with 'All'.""" + schema = _load_schema(repo_root) + try: + enum = schema["properties"]["categories"]["items"]["enum"] + except Exception as e: # pragma: no cover + raise KeyError( + "Could not find taxonomy enum at " + "schemas/component.schema.json::properties.categories.items.enum" + ) from e + if not isinstance(enum, list) or not all(isinstance(x, str) for x in enum): + raise TypeError("Category enum must be a list of strings.") + return ["All", *enum] + + +def _format_json_path(parts: Iterable[Any]) -> str: + out: list[str] = [] + for p in parts: + if isinstance(p, int): + out.append(f"[{p}]") + else: + if out: + out.append(".") + out.append(str(p)) + return "".join(out) or "$" + + +def _validate_instance( + instance: Any, schema: dict[str, Any] +) -> list[ComponentBuildError]: + try: + from jsonschema import Draft202012Validator # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependency `jsonschema`.\n\n" + "Install it with:\n" + " pip install jsonschema\n" + "or add it to `component-gallery/requirements.txt`." 
+ ) from e + + validator = Draft202012Validator(schema) + errors: list[ComponentBuildError] = [] + for err in sorted(validator.iter_errors(instance), key=lambda x: list(x.path)): + errors.append( + ComponentBuildError( + file=Path(""), + message=err.message, + json_path=_format_json_path(err.path), + ) + ) + return errors + + +def _normalize_github_repo_url(url: str) -> str: + return normalize_github_repo_url(url) + + +def _component_key_from_github_url(url: str) -> str: + return repo_key(url) + + +def _load_previous_index(previous_path: Path | None) -> dict[str, dict[str, Any]]: + """Index previous compiled components by canonical github owner/repo.""" + if previous_path is None or not previous_path.is_file(): + return {} + obj = load_json(previous_path) + if not isinstance(obj, dict): + return {} + comps = obj.get("components", []) + if not isinstance(comps, list): + return {} + + out: dict[str, dict[str, Any]] = {} + for c in comps: + if not isinstance(c, dict): + continue + gh = c.get("gitHubUrl") + if not isinstance(gh, str) or not gh: + continue + try: + key = repo_key(gh) + except Exception: + continue + out[key] = c + return out + + +def _prev_int(prev: dict[str, Any], *path: str) -> int | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return int(cur) if isinstance(cur, int) else None + + +def _prev_str(prev: dict[str, Any], *path: str) -> str | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return str(cur) if isinstance(cur, str) else None + + +def _prev_bool(prev: dict[str, Any], *path: str) -> bool | None: + cur: Any = prev + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return bool(cur) if isinstance(cur, bool) else None + + +def _pip_cmd_from_submission( + links: dict[str, Any], install: dict[str, Any] | None +) -> str | None: + if isinstance(install, dict): + pip_cmd = install.get("pip") + if isinstance(pip_cmd, str) and pip_cmd.strip(): + return pip_cmd.strip() + + pkg = links.get("pypi") + if isinstance(pkg, str) and pkg.strip(): + return f"pip install {pkg.strip()}" + return None + + +def build_catalog( + *, + repo_root: Path, + out_path: Path, + components_dir: Path, + previous_path: Path | None, + skip_invalid: bool, +) -> tuple[dict[str, Any], list[ComponentBuildError]]: + schema = _load_schema(repo_root) + categories = _taxonomy_categories(repo_root) + prev_index = _load_previous_index(previous_path) + + errors: list[ComponentBuildError] = [] + compiled_components: list[dict[str, Any]] = [] + + if not components_dir.is_dir(): + raise FileNotFoundError(f"Missing components directory: {components_dir}") + + seen_keys: set[str] = set() + for json_file in sorted(components_dir.glob("*.json")): + try: + submission = _load_json(json_file) + except json.JSONDecodeError as e: + errors.append( + ComponentBuildError(file=json_file, message=str(e), json_path=None) + ) + continue + + if not isinstance(submission, dict): + errors.append( + ComponentBuildError( + file=json_file, + message="Submission JSON must be an object.", + json_path="$", + ) + ) + continue + + # Schema validation (so we can safely map fields) + for ve in _validate_instance(submission, schema): + errors.append( + ComponentBuildError( + file=json_file, + message=ve.message, + json_path=ve.json_path, + ) + ) + if any(e.file == json_file for e in errors) and not skip_invalid: + continue + if any(e.file == json_file for e in errors) and skip_invalid: + # Skip 
this component but keep going. + continue + + try: + author_obj = submission["author"] + links = submission["links"] + governance = submission["governance"] + title = submission["title"] + + author_github = author_obj["github"] + github_url = normalize_github_repo_url(links["github"]) + key = repo_key(github_url) + + if key in seen_keys: + errors.append( + ComponentBuildError( + file=json_file, + message=f"Duplicate component identity (same GitHub repo): {key}", + json_path="links.github", + ) + ) + continue + seen_keys.add(key) + + pip_cmd = _pip_cmd_from_submission(links, submission.get("install")) + pypi_project = links.get("pypi") + if not isinstance(pypi_project, str) or not pypi_project.strip(): + pypi_project = None + demo_url = links.get("demo") + app_url = demo_url if isinstance(demo_url, str) else None + + media = ( + submission.get("media") + if isinstance(submission.get("media"), dict) + else None + ) + image_url = media.get("image") if isinstance(media, dict) else None + if not isinstance(image_url, str): + image_url = None + + enabled = bool(governance.get("enabled", True)) + + # Compiled per-component categories should NOT include "All". + # "All" is an implied UI filter mode, not a real category assignment. + submitted_categories = submission.get("categories", []) + cat_list: list[str] = [] + if isinstance(submitted_categories, list): + for c in submitted_categories: + if isinstance(c, str) and c != "All" and c not in cat_list: + cat_list.append(c) + if not cat_list: + raise ValueError( + "Per-component categories must be non-empty (and must not be 'All')." + ) + + prev = prev_index.get(key, {}) + if not isinstance(prev, dict): + prev = {} + + # Prefer previous metrics.github stars if present, else (legacy) top-level stars. + stars_val: int | None = _prev_int(prev, "metrics", "github", "stars") + if stars_val is None: + stars_val = _prev_int(prev, "stars") + # Default to 0 to match the current gallery UI expectations. + if stars_val is None: + stars_val = 0 + + prev_forks = _prev_int(prev, "metrics", "github", "forks") + prev_open_issues = _prev_int(prev, "metrics", "github", "openIssues") + prev_contributors = _prev_int( + prev, "metrics", "github", "contributorsCount" + ) + prev_last_push_at = _prev_str(prev, "metrics", "github", "lastPushAt") + prev_fetched_at = _prev_str(prev, "metrics", "github", "fetchedAt") + prev_is_stale = _prev_bool(prev, "metrics", "github", "isStale") + prev_pypi = ( + prev.get("metrics", {}).get("pypi") + if isinstance(prev.get("metrics"), dict) + else None + ) + prev_pypistats = ( + prev.get("metrics", {}).get("pypistats") + if isinstance(prev.get("metrics"), dict) + else None + ) + + social_url = f"https://github.com/{author_github}" + + compiled_components.append( + { + "title": title, + "author": author_github, + "pipLink": pip_cmd, + "pypi": pypi_project, + "categories": cat_list, + "image": image_url, + "gitHubUrl": github_url, + "enabled": enabled, + "appUrl": app_url, + "socialUrl": social_url, + "metrics": { + "github": { + "stars": stars_val, + "forks": prev_forks, + "openIssues": prev_open_issues, + "contributorsCount": prev_contributors, + "lastPushAt": prev_last_push_at, + "fetchedAt": prev_fetched_at, + "isStale": prev_is_stale, + }, + "pypi": prev_pypi, + "pypistats": prev_pypistats, + }, + } + ) + except Exception as e: + errors.append( + ComponentBuildError(file=json_file, message=str(e), json_path=None) + ) + if not skip_invalid: + continue + + # Deterministic ordering for stable diffs. 
+ compiled_components.sort( + key=lambda c: (c.get("gitHubUrl") or "", c.get("title") or "") + ) + + compiled = { + "generatedAt": utc_now_iso(), + "schemaVersion": 1, + "categories": categories, + "components": compiled_components, + } + return compiled, errors + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Build the compiled components catalog." + ) + parser.add_argument( + "--out", + default=None, + help="Output path for the compiled JSON (default: compiled/components.json).", + ) + parser.add_argument( + "--components-dir", + default=None, + help="Directory containing per-component JSON submissions (default: components/).", + ) + parser.add_argument( + "--previous", + default=None, + help=( + "Path to a previous compiled artifact to carry forward metrics like stars. " + "Defaults to compiled/components.json if present." + ), + ) + parser.add_argument( + "--skip-invalid", + action="store_true", + help="Skip invalid component JSON files instead of failing the build.", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + + out_path = ( + Path(args.out) if args.out else (repo_root / "compiled" / "components.json") + ) + components_dir = ( + Path(args.components_dir) if args.components_dir else (repo_root / "components") + ) + + previous_path: Path | None + if args.previous: + previous_path = Path(args.previous) + else: + candidate = repo_root / "compiled" / "components.json" + previous_path = candidate if candidate.is_file() else None + + compiled, errors = build_catalog( + repo_root=repo_root, + out_path=out_path, + components_dir=components_dir, + previous_path=previous_path, + skip_invalid=args.skip_invalid, + ) + + if errors and not args.skip_invalid: + print( + "ERROR: build failed due to invalid component submissions:", file=sys.stderr + ) + for e in errors: + rel = e.file.relative_to(repo_root) if e.file.is_absolute() else e.file + jp = f"{e.json_path}: " if e.json_path else "" + print(f"- {rel}: {jp}{e.message}", file=sys.stderr) + return 1 + + dump_json(out_path, compiled) + + # Print a compact summary for CI logs + ts = utc_now_iso() + print( + f"Wrote {len(compiled.get('components', []))} component(s) to {out_path} at {ts}." + ) + if errors and args.skip_invalid: + print(f"NOTE: Skipped {len(errors)} validation error(s).", file=sys.stderr) + for e in errors: + rel = e.file.relative_to(repo_root) if e.file.is_absolute() else e.file + jp = f"{e.json_path}: " if e.json_path else "" + print(f"- {rel}: {jp}{e.message}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/compute_ranking.py b/directory/scripts/compute_ranking.py new file mode 100644 index 00000000..5b85695f --- /dev/null +++ b/directory/scripts/compute_ranking.py @@ -0,0 +1,284 @@ +""" +Compute and persist ranking signals for the compiled component catalog. + +This script reads `compiled/components.json` and writes a `ranking` block for each +component, following the tech spec's v1 proposal: + +- starsScore = log10(stars + 1) +- recencyScore = exp(-days_since_update / half_life_days) + - days_since_update = min(days_since_github_push, days_since_pypi_release) when both exist + +The final score is: + score = w_stars * starsScore + w_recency * recencyScore + +If recency data is missing, the score falls back to the stars-only term. 
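For illustration, with the default ranking_config.json (stars weight 1.0, recency
weight 2.0, halfLifeDays 90) and a component that has 1,200 stars and was last
updated 30 days ago, with no contributor or download signals available:

    starsScore   = log10(1200 + 1)              ≈ 3.08
    recencyScore = exp(-30 / 90)                ≈ 0.717
    score        = 1.0 * 3.08 + 2.0 * 0.717     ≈ 4.51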
+ +Run from the repo root (recommended): + + python directory/scripts/compute_ranking.py +""" + +from __future__ import annotations + +import argparse +import math +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from _utils.io import dump_json, load_json +from _utils.time import parse_iso8601, utc_now_iso + + +@dataclass(frozen=True) +class RankingConfig: + half_life_days: float + w_stars: float + w_recency: float + w_contributors: float + w_downloads: float + + +def _load_ranking_config(path: Path) -> RankingConfig: + obj = load_json(path) + if not isinstance(obj, dict): + raise TypeError(f"Ranking config must be a JSON object: {path}") + half_life = obj.get("halfLifeDays", 90.0) + weights = obj.get("weights", {}) + if not isinstance(weights, dict): + weights = {} + w_stars = weights.get("stars", 1.0) + w_recency = weights.get("recency", 2.0) + w_contributors = weights.get("contributors", 0.0) + w_downloads = weights.get("downloads", 0.0) + + try: + half_life_f = float(half_life) + w_stars_f = float(w_stars) + w_recency_f = float(w_recency) + w_contributors_f = float(w_contributors) + w_downloads_f = float(w_downloads) + except Exception as e: # pragma: no cover + raise TypeError("Ranking config values must be numeric.") from e + + if half_life_f <= 0: + raise ValueError("halfLifeDays must be > 0.") + + return RankingConfig( + half_life_days=half_life_f, + w_stars=w_stars_f, + w_recency=w_recency_f, + w_contributors=w_contributors_f, + w_downloads=w_downloads_f, + ) + + +def _days_since(dt: datetime, now: datetime) -> float: + delta_s = (now - dt).total_seconds() + # If clocks or sources are weird and dt is in the future, clamp to 0. + if delta_s < 0: + delta_s = 0.0 + return delta_s / 86400.0 + + +def _get_nested(comp: dict[str, Any], *path: str) -> Any: + cur: Any = comp + for p in path: + if not isinstance(cur, dict): + return None + cur = cur.get(p) + return cur + + +def _stars_for_component(comp: dict[str, Any]) -> int: + # Prefer nested metrics.github.stars if available. 
+ s = _get_nested(comp, "metrics", "github", "stars") + if isinstance(s, int): + return max(0, s) + return 0 + + +def _contributors_for_component(comp: dict[str, Any]) -> int | None: + c = _get_nested(comp, "metrics", "github", "contributorsCount") + if isinstance(c, int): + return max(0, c) + return None + + +def _downloads_last_month(comp: dict[str, Any]) -> int | None: + d = _get_nested(comp, "metrics", "pypistats", "lastMonth") + if isinstance(d, int) and d >= 0: + return d + return None + + +def _recency_days( + comp: dict[str, Any], now: datetime +) -> tuple[float | None, float | None, float | None]: + gh_last_push = _get_nested(comp, "metrics", "github", "lastPushAt") + pypi_latest_release = _get_nested(comp, "metrics", "pypi", "latestReleaseAt") + + gh_dt = parse_iso8601(gh_last_push if isinstance(gh_last_push, str) else None) + pypi_dt = parse_iso8601( + pypi_latest_release if isinstance(pypi_latest_release, str) else None + ) + + gh_days = _days_since(gh_dt, now) if gh_dt else None + pypi_days = _days_since(pypi_dt, now) if pypi_dt else None + + days_since_update: float | None + if gh_days is not None and pypi_days is not None: + days_since_update = min(gh_days, pypi_days) + else: + days_since_update = gh_days if gh_days is not None else pypi_days + + return days_since_update, gh_days, pypi_days + + +def _compute_ranking( + comp: dict[str, Any], *, cfg: RankingConfig, now: datetime +) -> dict[str, Any]: + stars = _stars_for_component(comp) + stars_score = math.log10(stars + 1) + + contributors = _contributors_for_component(comp) + contributors_score: float | None = None + if contributors is not None: + contributors_score = math.log10(contributors + 1) + + downloads_last_month = _downloads_last_month(comp) + downloads_score: float | None = None + if downloads_last_month is not None: + downloads_score = math.log10(downloads_last_month + 1) + + days_since_update, gh_days, pypi_days = _recency_days(comp, now) + recency_score: float | None = None + if days_since_update is not None: + recency_score = math.exp(-days_since_update / cfg.half_life_days) + + score = cfg.w_stars * stars_score + if recency_score is not None: + score += cfg.w_recency * recency_score + if contributors_score is not None: + score += cfg.w_contributors * contributors_score + if downloads_score is not None: + score += cfg.w_downloads * downloads_score + + # Keep ranking explainable and stable. 
+ return { + "score": score, + "signals": { + "starsScore": stars_score, + "recencyScore": recency_score, + "contributorsScore": contributors_score, + "daysSinceUpdate": days_since_update, + "daysSinceGithubPush": gh_days, + "daysSincePypiRelease": pypi_days, + "downloadsScore": downloads_score, + }, + "computedAt": utc_now_iso(), + } + + +def compute_rankings( + *, + compiled_in: Path, + compiled_out: Path, + config_path: Path, + limit: int | None, +) -> int: + obj = load_json(compiled_in) + if not isinstance(obj, dict): + print( + f"ERROR: compiled catalog must be a JSON object: {compiled_in}", + file=sys.stderr, + ) + return 2 + + comps = obj.get("components") + if not isinstance(comps, list): + print( + f"ERROR: compiled catalog missing `components` array: {compiled_in}", + file=sys.stderr, + ) + return 2 + + cfg = _load_ranking_config(config_path) + now = datetime.now(timezone.utc) + + processed = 0 + for comp in comps: + if limit is not None and processed >= limit: + break + processed += 1 + if not isinstance(comp, dict): + continue + comp["ranking"] = _compute_ranking(comp, cfg=cfg, now=now) + + dump_json(compiled_out, obj) + print(f"Wrote rankings for {processed} component(s) to {compiled_out}.") + return 0 + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Compute ranking fields for compiled/components.json." + ) + parser.add_argument( + "--in", + dest="compiled_in", + default=None, + help="Input compiled catalog path (default: compiled/components.json).", + ) + parser.add_argument( + "--out", + dest="compiled_out", + default=None, + help="Output compiled catalog path (default: overwrite --in).", + ) + parser.add_argument( + "--config", + dest="config_path", + default=None, + help="Ranking config path (default: ranking_config.json at repo root).", + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components (debug).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + compiled_in = ( + Path(args.compiled_in) + if args.compiled_in + else (repo_root / "compiled" / "components.json") + ) + compiled_out = Path(args.compiled_out) if args.compiled_out else compiled_in + config_path = ( + Path(args.config_path) + if args.config_path + else (repo_root / "ranking_config.json") + ) + + if not compiled_in.is_file(): + print(f"ERROR: Missing compiled catalog: {compiled_in}", file=sys.stderr) + return 2 + if not config_path.is_file(): + print(f"ERROR: Missing ranking config: {config_path}", file=sys.stderr) + return 2 + + return compute_rankings( + compiled_in=compiled_in, + compiled_out=compiled_out, + config_path=config_path, + limit=args.limit, + ) + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/enrich.py b/directory/scripts/enrich.py new file mode 100644 index 00000000..e02e22a0 --- /dev/null +++ b/directory/scripts/enrich.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +from _enrichers import get_default_enrichers # type: ignore[import-not-found] +from _utils.enrichment_engine import ( + run_enrichment_engine, # type: ignore[import-not-found] +) +from _utils.io import dump_json, load_json # type: ignore[import-not-found] +from _utils.time import utc_now_iso # type: ignore[import-not-found] + + +def _parse_services(raw: list[str] | None) -> list[str]: + if not raw: + return ["github", "pypi", "pypistats"] + return 
[s.strip().lower() for s in raw if s.strip()] + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Enrich compiled/components.json using GitHub, PyPI, and pypistats." + ) + parser.add_argument( + "--in", + dest="compiled_in", + default=None, + help="Input compiled catalog path (default: compiled/components.json).", + ) + parser.add_argument( + "--out", + dest="compiled_out", + default=None, + help="Output compiled catalog path (default: overwrite --in).", + ) + parser.add_argument( + "--services", + nargs="*", + default=None, + help="Which enrichers to run (default: github pypi pypistats).", + ) + parser.add_argument( + "--token-env", + default="GH_TOKEN", + help="Environment variable name holding a GitHub token (default: GH_TOKEN).", + ) + parser.add_argument( + "--timeout", + type=float, + default=20.0, + help="HTTP timeout in seconds (default: 20).", + ) + parser.add_argument( + "--sleep-github", + type=float, + default=None, + help="Sleep between unique GitHub API requests in seconds.", + ) + parser.add_argument( + "--sleep-pypi", + type=float, + default=None, + help="Sleep between unique PyPI API requests in seconds.", + ) + parser.add_argument( + "--sleep-pypistats", + type=float, + default=None, + help="Sleep between unique pypistats API requests in seconds.", + ) + parser.add_argument( + "--refresh-older-than-hours", + type=float, + default=24.0, + help=( + "Only refetch metrics if existing fetchedAt values are older than this many " + "hours (default: 24). Use 0 to refetch everything." + ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components (debug).", + ) + parser.add_argument( + "--allow-failures", + action="store_true", + help="Do not fail the process if some enrichment fetches fail.", + ) + parser.add_argument( + "--workers", + type=int, + default=max(4, (os.cpu_count() or 4) * 4), + help="Max worker threads (default: 4 * CPU count).", + ) + parser.add_argument( + "--progress-every", + dest="progress_every", + type=int, + default=25, + help="Print progress every N processed components (default: 25). 
Use 0 to disable.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Print per-service failure details.", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + compiled_in = ( + Path(args.compiled_in) + if args.compiled_in + else (repo_root / "compiled" / "components.json") + ) + compiled_out = Path(args.compiled_out) if args.compiled_out else compiled_in + + if not compiled_in.is_file(): + print(f"ERROR: Missing compiled catalog: {compiled_in}", file=sys.stderr) + return 2 + + obj = load_json(compiled_in) + if not isinstance(obj, dict): + print( + f"ERROR: compiled catalog must be a JSON object: {compiled_in}", + file=sys.stderr, + ) + return 2 + comps = obj.get("components") + if not isinstance(comps, list): + print( + f"ERROR: compiled catalog missing `components` array: {compiled_in}", + file=sys.stderr, + ) + return 2 + + services = _parse_services(args.services) + enrichers = [ + e + for e in get_default_enrichers(github_token_env=args.token_env) + if e.name in services + ] + if not enrichers: + print(f"ERROR: No valid services selected: {services}", file=sys.stderr) + return 2 + + has_gh_token = bool( + os.environ.get(args.token_env) + or os.environ.get("GH_TOKEN") + or os.environ.get("GH_API_TOKEN") + or os.environ.get("GITHUB_TOKEN") + ) + github_sleep = ( + float(args.sleep_github) + if args.sleep_github is not None + else (0.2 if has_gh_token else 1.0) + ) + pypi_sleep = float(args.sleep_pypi) if args.sleep_pypi is not None else 0.3 + pypistats_sleep = ( + float(args.sleep_pypistats) if args.sleep_pypistats is not None else pypi_sleep + ) + + sleep_by_service = { + "github": github_sleep, + "pypi": pypi_sleep, + "pypistats": pypistats_sleep, + } + + run_fetched_at = utc_now_iso() + comps_for_run = comps if args.limit is None else comps[: int(args.limit)] + expected_counts: dict[str, int] = {e.name: 0 for e in enrichers} + for comp in comps_for_run: + if not isinstance(comp, dict): + continue + for enricher in enrichers: + if not enricher.needs_fetch(comp, args.refresh_older_than_hours): + continue + if enricher.key_for_component(comp) is None: + continue + expected_counts[enricher.name] += 1 + for enricher in enrichers: + print( + f"[{enricher.name}] will attempt {expected_counts[enricher.name]} component(s).", + flush=True, + ) + result = run_enrichment_engine( + components=comps_for_run, + enrichers=enrichers, + refresh_older_than_hours=args.refresh_older_than_hours, + timeout_s=float(args.timeout), + sleep_by_service=sleep_by_service, + workers=int(args.workers), + run_fetched_at=run_fetched_at, + progress_every=( + int(args.progress_every) if args.progress_every is not None else None + ), + ) + + dump_json(compiled_out, obj) + + ts = utc_now_iso() + print(f"Wrote {compiled_out} at {ts}.") + for enricher in enrichers: + s = result.stats[enricher.name] + print( + f"[{enricher.name}] summary: processed={s.processed} " + f"requests={s.requests} ok={s.ok} fail={s.failed} " + f"updated={s.updated} skipped_fresh={s.skipped_fresh} " + f"cache_hits={s.cache_hits} skipped_no_key={s.skipped_no_key}", + flush=True, + ) + + any_failures = False + for enricher in enrichers: + fails = result.failures[enricher.name] + if fails: + any_failures = True + print( + f"WARNING: {len(fails)} {enricher.name} fetch failure(s):", + file=sys.stderr, + ) + for f in fails[:50]: + code = f" (status {f.status})" if f.status is not None else "" + print(f"- {f.key}{code}: {f.error}", file=sys.stderr) + if len(fails) > 50: + print(f"... 
and {len(fails) - 50} more", file=sys.stderr) + if args.verbose: + for f in fails: + code = f" (status {f.status})" if f.status is not None else "" + print( + f"[{enricher.name}] FAIL {f.key}{code}: {f.error}", + file=sys.stderr, + ) + + if any_failures and not args.allow_failures: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/enrich_images.py b/directory/scripts/enrich_images.py new file mode 100644 index 00000000..05b8ccbf --- /dev/null +++ b/directory/scripts/enrich_images.py @@ -0,0 +1,501 @@ +""" +Validate Component Gallery preview images (`media.image`) for stability + accessibility. + +Rules enforced: +- `media.image` is optional (it may be missing, null, or an empty string). If present and non-empty: +- URL must be https:// +- must not be a brittle proxy (e.g. `camo.githubusercontent.com`) +- must not contain signed/expiring query params (X-Amz-*, X-Goog-*, Signature/Expires/etc) +- must be fetchable (HTTP 2xx) and plausibly an image (best-effort via Content-Type) + +Typical usage (from repo root): + python directory/scripts/enrich_images.py --check-only + +Notes: +- This script is intentionally check-only (no auto-fix, no caching). +- It requires outbound network access. +""" + +from __future__ import annotations + +import argparse +import os +import sys +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Any +from urllib.parse import parse_qsl, urlparse + +import requests +from _utils.image_url_policy import DISALLOWED_IMAGE_HOSTS, DISALLOWED_IMAGE_QUERY_KEYS +from _utils.io import load_json +from requests.adapters import HTTPAdapter + +DEFAULT_TIMEOUT_S = 15.0 +DEFAULT_WORKERS = min(32, max(4, (os.cpu_count() or 4) * 5)) + + +def _is_https_url(url: str) -> bool: + """Return True if a URL is a well-formed HTTPS URL. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if the URL uses the ``https`` scheme and has a non-empty network + location (host). + """ + p = urlparse(url) + return p.scheme == "https" and bool(p.netloc) + + +def _is_disallowed_host(url: str) -> bool: + """Return True if the URL host is disallowed for preview images. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if the URL host is in the disallowed host list. + """ + host = (urlparse(url).netloc or "").lower() + return host in DISALLOWED_IMAGE_HOSTS + + +def _has_disallowed_query_params(url: str) -> bool: + """Return True if the URL includes signed/expiring query parameters. + + Parameters + ---------- + url + URL to check. + + Returns + ------- + bool + True if any query parameter key matches a disallowed key (case-insensitive). + """ + for k, _ in parse_qsl(urlparse(url).query, keep_blank_values=True): + if k.strip().lower() in DISALLOWED_IMAGE_QUERY_KEYS: + return True + return False + + +def _is_imageish_content_type(ct: str | None) -> bool: + """Return True if an HTTP Content-Type is plausibly an image. + + Parameters + ---------- + ct + Content-Type header value (may be missing). + + Returns + ------- + bool + True if the value looks like an image MIME type (``image/*``), or if it + is missing/blank, or if it is ``application/octet-stream`` (some CDNs + mislabel images). This is intentionally permissive to avoid false negatives. 
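+
+    Examples
+    --------
+    Doctest-style illustrations of the rules above:
+
+    >>> _is_imageish_content_type("image/png")
+    True
+    >>> _is_imageish_content_type("application/octet-stream")
+    True
+    >>> _is_imageish_content_type(None)
+    True
+    >>> _is_imageish_content_type("text/html; charset=utf-8")
+    False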
+ """ + if not isinstance(ct, str) or not ct.strip(): + return True + base = ct.split(";", 1)[0].strip().lower() + if base.startswith("image/"): + return True + if base in {"application/octet-stream"}: + return True + return False + + +@dataclass(frozen=True) +class ImageCheck: + """Result of a best-effort remote image URL check. + + Attributes + ---------- + ok + True if the URL was fetchable (HTTP 2xx). + status + HTTP status code if available. + final_url + Final URL after redirects if available. + content_type + Response Content-Type header if available. + error + Human-readable error string for failures. + """ + + ok: bool + status: int | None + final_url: str | None + content_type: str | None + error: str | None = None + + +_tls = threading.local() + + +def _get_thread_session(*, pool_maxsize: int) -> requests.Session: + """Return a thread-local `requests.Session` for connection reuse. + + Parameters + ---------- + pool_maxsize + Max number of pooled connections to keep per host for the mounted adapters. + + Returns + ------- + requests.Session + A per-thread session instance. + + Notes + ----- + `requests.Session` is not guaranteed to be thread-safe. Using one session per + worker thread preserves connection pooling without cross-thread sharing. + """ + s = getattr(_tls, "session", None) + if s is None: + s = requests.Session() + adapter = HTTPAdapter(pool_connections=pool_maxsize, pool_maxsize=pool_maxsize) + s.mount("https://", adapter) + s.mount("http://", adapter) + _tls.session = s + return s + + +def _check_fetchable( + session: requests.Session, url: str, *, timeout_s: float +) -> ImageCheck: + """Fetch an image URL (HEAD then GET) and return an `ImageCheck`. + + Parameters + ---------- + session + `requests` session used to issue HTTP requests. + url + Image URL to fetch. + timeout_s + Per-request timeout in seconds. + + Returns + ------- + ImageCheck + Structured result containing status, final URL after redirects, and + Content-Type (best-effort). + + Notes + ----- + This function tries a ``HEAD`` request first (faster, less bandwidth). Some + servers block or mishandle ``HEAD``; in that case we fall back to a streaming + ``GET``. + """ + headers = {"User-Agent": "component-gallery-image-check"} + + # HEAD first; fall back to GET (some servers block HEAD). + try: + with session.head( + url, allow_redirects=True, timeout=timeout_s, headers=headers + ) as r: + status = int(r.status_code) + ct = r.headers.get("Content-Type") + if 200 <= status < 300: + return ImageCheck( + ok=True, + status=status, + final_url=str(r.url), + content_type=ct, + error=None, + ) + except requests.RequestException: + # Some servers reject or mishandle HEAD requests; ignore the error and + # fall back to a full GET request below. + pass + + try: + with session.get( + url, allow_redirects=True, timeout=timeout_s, headers=headers, stream=True + ) as r: + status = int(r.status_code) + ct = r.headers.get("Content-Type") + if 200 <= status < 300: + return ImageCheck( + ok=True, + status=status, + final_url=str(r.url), + content_type=ct, + error=None, + ) + return ImageCheck( + ok=False, + status=status, + final_url=str(r.url), + content_type=ct, + error=f"HTTP {status}", + ) + except requests.RequestException as e: + return ImageCheck( + ok=False, + status=None, + final_url=None, + content_type=None, + error=f"{type(e).__name__}: {e}", + ) + + +def _get_media_image(obj: dict[str, Any]) -> str | None: + """Extract a non-empty `media.image` string from a component JSON object. 
+ + Parameters + ---------- + obj + Parsed JSON object for a component. + + Returns + ------- + str | None + The stripped image URL if present and non-empty; otherwise ``None``. + """ + media = obj.get("media") + if not isinstance(media, dict): + return None + img = media.get("image") + return img.strip() if isinstance(img, str) and img.strip() else None + + +@dataclass(frozen=True) +class _FetchTask: + """Unit of work for a single network fetch.""" + + json_name: str + url: str + + +@dataclass(frozen=True) +class _FetchResult: + """Network fetch result for a single component JSON file.""" + + json_name: str + url: str + chk: ImageCheck + + +def _fetch_one( + task: _FetchTask, *, timeout_s: float, pool_maxsize: int +) -> _FetchResult: + """Fetch one URL for a `_FetchTask` and return a `_FetchResult`. + + Parameters + ---------- + task + Task describing which component file the URL came from. + timeout_s + Per-request timeout in seconds. + pool_maxsize + Per-thread connection pool size for the thread-local session. + + Returns + ------- + _FetchResult + Result record with the originating JSON file name and `ImageCheck`. + """ + session = _get_thread_session(pool_maxsize=pool_maxsize) + chk = _check_fetchable(session, task.url, timeout_s=timeout_s) + return _FetchResult(json_name=task.json_name, url=task.url, chk=chk) + + +def check_images( + *, + components_dir: Path, + timeout_s: float, + verbose: bool, + workers: int, +) -> int: + """Validate component preview image URLs under `components_dir`. + + Parameters + ---------- + components_dir + Directory containing `components/*.json` submission files. + timeout_s + Per-request timeout in seconds for image fetch checks. + verbose + If True, print an OK line for each successfully validated image. + workers + Maximum number of concurrent network fetches to run. + + Returns + ------- + int + Process-style return code: 0 if all checks pass, otherwise 1. + + Notes + ----- + The validation happens in two phases: + + 1. Local policy checks (HTTPS, disallowed hosts, disallowed query params). + 2. Network checks (fetchability + permissive Content-Type validation), run in + parallel because they are I/O-bound. + """ + failures = 0 + tasks: list[_FetchTask] = [] + local_failures: dict[str, str] = {} + + for json_file in sorted(components_dir.glob("*.json")): + try: + obj = load_json(json_file) + except Exception as e: + local_failures[json_file.name] = f"invalid JSON ({e})" + continue + if not isinstance(obj, dict): + continue + + img = _get_media_image(obj) + if not img: + # Optional: null/empty is allowed. + continue + + if not _is_https_url(img): + local_failures[json_file.name] = f"not https:// ({img})" + continue + + if _is_disallowed_host(img): + local_failures[json_file.name] = f"disallowed_host=camo ({img})" + continue + + if _has_disallowed_query_params(img): + local_failures[json_file.name] = f"signed/expiring_url ({img})" + continue + + tasks.append(_FetchTask(json_name=json_file.name, url=img)) + + # Report local (non-network) failures deterministically. + for json_name in sorted(local_failures.keys()): + print( + f"[images] FAIL {json_name}: {local_failures[json_name]}", file=sys.stderr + ) + failures += 1 + + # Network-bound checks: run in parallel (bounded). + results_by_json: dict[str, _FetchResult] = {} + if tasks: + # Ensure a sane lower bound; allow workers=1 for debugging. 
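+        # The pool size below is forwarded to each worker's thread-local session
+        # (see _get_thread_session); since a worker issues one request at a time,
+        # anything beyond a few pooled connections is just headroom.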
+ w = max(1, int(workers)) + pool_maxsize = max(8, w) + with ThreadPoolExecutor(max_workers=w) as ex: + futs = [ + ex.submit(_fetch_one, t, timeout_s=timeout_s, pool_maxsize=pool_maxsize) + for t in tasks + ] + for fut in as_completed(futs): + r = fut.result() + results_by_json[r.json_name] = r + + for json_name in sorted(results_by_json.keys()): + r = results_by_json[json_name] + chk = r.chk + img = r.url + + if not chk.ok: + print( + f"[images] FAIL {json_name}: unfetchable ({chk.error}) ({img})", + file=sys.stderr, + ) + failures += 1 + continue + + if not _is_imageish_content_type(chk.content_type): + print( + f"[images] FAIL {json_name}: non-image content-type ({chk.content_type}) ({img})", + file=sys.stderr, + ) + failures += 1 + continue + + if verbose: + print(f"[images] OK {json_name}: {chk.final_url or img}") + + print(f"[images] done: failures={failures}") + return 1 if failures else 0 + + +def main(argv: list[str]) -> int: + """CLI entrypoint. + + Parameters + ---------- + argv + Command line arguments excluding the program name (i.e., ``sys.argv[1:]``). + + Returns + ------- + int + Process exit code: + + - 0: all checks passed + - 1: one or more checks failed + - 2: configuration error (e.g., missing components dir, offline mode set) + """ + parser = argparse.ArgumentParser( + description="Validate `media.image` URLs for components." + ) + parser.add_argument( + "--components-dir", + default=None, + help="Directory containing components/*.json (default: components/).", + ) + parser.add_argument( + "--check-only", + action="store_true", + help="Compatibility flag; this script is always check-only.", + ) + parser.add_argument( + "--timeout", + type=float, + default=DEFAULT_TIMEOUT_S, + help=f"HTTP timeout in seconds (default: {DEFAULT_TIMEOUT_S}).", + ) + parser.add_argument( + "--workers", + type=int, + default=DEFAULT_WORKERS, + help=f"Max parallel fetches (default: {DEFAULT_WORKERS}).", + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output.") + args = parser.parse_args(argv) + + if os.environ.get("COMPONENT_GALLERY_OFFLINE") == "1": + print( + "ERROR: COMPONENT_GALLERY_OFFLINE=1 set; image checks require network.", + file=sys.stderr, + ) + return 2 + + project_root = Path(__file__).resolve().parents[1] + components_dir = ( + Path(args.components_dir) + if args.components_dir + else (project_root / "components") + ) + if not components_dir.is_dir(): + print(f"ERROR: components dir not found: {components_dir}", file=sys.stderr) + return 2 + + return check_images( + components_dir=components_dir, + timeout_s=float(args.timeout), + verbose=bool(args.verbose), + workers=int(args.workers), + ) + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/run_pipeline.py b/directory/scripts/run_pipeline.py new file mode 100644 index 00000000..01de5c09 --- /dev/null +++ b/directory/scripts/run_pipeline.py @@ -0,0 +1,273 @@ +""" +Run the full Component Gallery pipeline with a single entrypoint. 
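+
+In addition to the numbered steps below, the default run also checks `media.image`
+URLs right after submission validation (skip with --no-images) and enriches
+pypistats download counts alongside PyPI metadata (skip with --no-pypistats).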
+ +Default pipeline: + + 1) Validate `components/*.json` submissions + 2) Build `compiled/components.json` + 3) Validate `compiled/components.json` + 4) Enrich GitHub metrics + 5) Enrich PyPI metrics + 6) Compute ranking signals + 7) Validate `compiled/components.json` again + +Run from the repo root (recommended): + + python directory/scripts/run_pipeline.py + +Typical CI usage: + + # Build + validate only (no network) + python directory/scripts/run_pipeline.py --no-enrich +""" + +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from pathlib import Path + + +def _run(cmd: list[str]) -> int: + proc = subprocess.run(cmd) + return int(proc.returncode) + + +def main(argv: list[str]) -> int: + parser = argparse.ArgumentParser( + description="Run validate -> build -> enrich pipeline for the component gallery." + ) + parser.add_argument( + "--no-validate", + action="store_true", + help="Skip validation steps (not recommended).", + ) + parser.add_argument( + "--no-build", + action="store_true", + help="Skip build step (assumes compiled/components.json already exists).", + ) + parser.add_argument( + "--no-github", + action="store_true", + help="Skip GitHub enrichment.", + ) + parser.add_argument( + "--no-pypi", + action="store_true", + help="Skip PyPI enrichment.", + ) + parser.add_argument( + "--no-pypistats", + action="store_true", + help="Skip PyPI download enrichment (pypistats).", + ) + parser.add_argument( + "--no-enrich", + action="store_true", + help="Skip all enrichment (equivalent to --no-github --no-pypi).", + ) + parser.add_argument( + "--no-ranking", + action="store_true", + help="Skip ranking computation (not recommended).", + ) + parser.add_argument( + "--no-images", + action="store_true", + help="Skip image URL checking (requires outbound network).", + ) + parser.add_argument( + "--allow-enrich-failures", + action="store_true", + help="Do not fail the pipeline if some enrichment fetches fail.", + ) + parser.add_argument( + "--refresh-older-than-hours", + type=float, + default=24.0, + help=( + "Only refetch enrichment metrics if existing fetchedAt values are older " + "than this many hours (default: 24). Use 0 to force refetching everything." + ), + ) + parser.add_argument( + "--enrich-progress-every", + type=int, + default=None, + help=( + "Forwarded to enrichers as --progress-every N. " + "Default: use each enricher's default." + ), + ) + parser.add_argument( + "--enrich-verbose", + action="store_true", + help="Forwarded to enrichers as --verbose (prints per-request failures as they happen).", + ) + parser.add_argument( + "--enrich-sleep-github", + type=float, + default=None, + help=( + "Sleep between unique GitHub API requests in seconds. " + "Default: 0.2 with GH_TOKEN set, else 1.0 (safer for large catalogs)." + ), + ) + parser.add_argument( + "--enrich-sleep-pypi", + type=float, + default=None, + help=( + "Sleep between unique PyPI API requests in seconds. " + "Default: 0.3 (safer for large catalogs)." + ), + ) + parser.add_argument( + "--enrich-sleep-pypistats", + type=float, + default=None, + help=( + "Sleep between unique pypistats API requests in seconds. " + "Default: reuse PyPI sleep (0.3 by default)." 
+ ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Only process the first N components for each enrichment step (debug).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + py = sys.executable + + if args.no_enrich: + args.no_github = True + args.no_pypi = True + args.no_pypistats = True + + # Choose conservative enrichment pacing defaults, especially for large catalogs. + has_gh_token = bool( + os.environ.get("GH_TOKEN") + or os.environ.get("GH_API_TOKEN") + or os.environ.get("GITHUB_TOKEN") + ) + github_sleep = ( + float(args.enrich_sleep_github) + if args.enrich_sleep_github is not None + else (0.2 if has_gh_token else 1.0) + ) + pypi_sleep = ( + float(args.enrich_sleep_pypi) if args.enrich_sleep_pypi is not None else 0.3 + ) + pypistats_sleep = ( + float(args.enrich_sleep_pypistats) + if args.enrich_sleep_pypistats is not None + else pypi_sleep + ) + + def run_step(name: str, cmd: list[str]) -> int: + # Flush so headers appear before subprocess output in buffered environments. + print(f"\n==> {name}\n$ {' '.join(cmd)}", flush=True) + return _run(cmd) + + # 1) Validate submissions + if not args.no_validate: + rc = run_step( + "Validate submissions", [py, str(repo_root / "scripts" / "validate.py")] + ) + if rc != 0: + return rc + + # 1b) Check image URLs (network). Keep this separate from schema validation so + # CI can enforce it while local/offline runs can skip it. + if not args.no_images: + rc = run_step( + "Check images", + [py, str(repo_root / "scripts" / "enrich_images.py"), "--check-only"], + ) + if rc != 0: + return rc + + # 2) Build compiled artifact + if not args.no_build: + rc = run_step( + "Build compiled catalog", + [py, str(repo_root / "scripts" / "build_catalog.py")], + ) + if rc != 0: + return rc + + # 3) Validate compiled artifact + if not args.no_validate: + rc = run_step( + "Validate compiled catalog", + [py, str(repo_root / "scripts" / "validate.py"), "--compiled"], + ) + if rc != 0: + return rc + + # 4) Enrich (GitHub/PyPI/pypistats) + services: list[str] = [] + if not args.no_github: + services.append("github") + if not args.no_pypi: + services.append("pypi") + if not args.no_pypistats: + services.append("pypistats") + + if services: + cmd = [ + py, + str(repo_root / "scripts" / "enrich.py"), + "--services", + *services, + "--sleep-github", + str(github_sleep), + "--sleep-pypi", + str(pypi_sleep), + "--sleep-pypistats", + str(pypistats_sleep), + "--refresh-older-than-hours", + str(args.refresh_older_than_hours), + ] + if args.enrich_progress_every is not None: + cmd += ["--progress-every", str(args.enrich_progress_every)] + if args.enrich_verbose: + cmd += ["--verbose"] + if args.limit is not None: + cmd += ["--limit", str(args.limit)] + if args.allow_enrich_failures: + cmd += ["--allow-failures"] + rc = run_step("Enrich catalog", cmd) + if rc != 0: + return rc + + # 6) Compute ranking + if not args.no_ranking: + cmd = [py, str(repo_root / "scripts" / "compute_ranking.py")] + if args.limit is not None: + cmd += ["--limit", str(args.limit)] + rc = run_step("Compute ranking", cmd) + if rc != 0: + return rc + + # 7) Final validate compiled artifact + if not args.no_validate: + rc = run_step( + "Final validate compiled catalog", + [py, str(repo_root / "scripts" / "validate.py"), "--compiled"], + ) + if rc != 0: + return rc + + print("\nOK: pipeline completed successfully.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/directory/scripts/validate.py 
b/directory/scripts/validate.py new file mode 100644 index 00000000..1cdc2b57 --- /dev/null +++ b/directory/scripts/validate.py @@ -0,0 +1,554 @@ +""" +Validate Component Gallery JSON files. + +This script validates: + +- Source-of-truth component submissions: `components/*.json` + against `schemas/component.schema.json`. +- Optionally, the compiled artifact: `compiled/components.json` + against `schemas/compiled.schema.json` (use `--compiled`). + +Run from the repo root (recommended): + + python directory/scripts/validate.py + python directory/scripts/validate.py --compiled +""" + +from __future__ import annotations + +import argparse +import sys +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable +from urllib.parse import parse_qsl, urlparse + +from _utils.github import normalize_github_repo_url +from _utils.image_url_policy import DISALLOWED_IMAGE_HOSTS, DISALLOWED_IMAGE_QUERY_KEYS +from _utils.io import load_json + + +@dataclass(frozen=True) +class ValidationIssue: + """A single schema validation issue tied to a specific JSON file.""" + + file: Path + schema: Path + message: str + json_path: str | None = None + + +def _format_json_path(parts: Iterable[Any]) -> str: + """Format a jsonschema error path into a compact JSONPath-ish string. + + Parameters + ---------- + parts + Iterable of path parts (strings for object keys, ints for array indices), + typically from `jsonschema.ValidationError.path`. + + Returns + ------- + str + A compact, human-readable path (e.g. ``$``, ``author.github``, + ``components[0].title``). + """ + out: list[str] = [] + for p in parts: + if isinstance(p, int): + out.append(f"[{p}]") + else: + if out: + out.append(".") + out.append(str(p)) + return "".join(out) or "$" + + +def _load_json(path: Path) -> Any: + """Load a JSON file from disk. + + Parameters + ---------- + path + Path to a JSON file. + + Returns + ------- + Any + Parsed JSON data. + """ + return load_json(path) + + +def _load_schema(path: Path) -> dict[str, Any]: + """Load and sanity-check a JSON Schema from disk. + + Parameters + ---------- + path + Path to a JSON Schema file. + + Returns + ------- + dict[str, Any] + Parsed schema object. + + Raises + ------ + TypeError + If the schema file does not contain a JSON object. + """ + obj = _load_json(path) + if not isinstance(obj, dict): + raise TypeError(f"Schema must be a JSON object: {path}") + return obj + + +def _missing_required_fields(err: Any) -> list[str] | None: + """Compute missing required field names for a jsonschema "required" error. + + jsonschema "required" errors can be noisy; this extracts the specific fields + missing at the failing location so output stays readable. + + Parameters + ---------- + err + A `jsonschema.ValidationError` instance (typed as `Any` to keep this + script dependency-light). + + Returns + ------- + list[str] | None + List of missing field names if applicable; otherwise ``None``. + """ + if err.validator != "required" or not isinstance(err.validator_value, list): + return None + if not isinstance(err.instance, dict): + return None + # validator_value is the list of required fields for the schema at this path. + required: list[str] = [str(x) for x in err.validator_value] + return [k for k in required if k not in err.instance] + + +def _validate_one(instance_path: Path, schema_path: Path) -> list[ValidationIssue]: + """Validate one JSON instance file against a JSON Schema. 
+ + Parameters + ---------- + instance_path + Path to the JSON file to validate. + schema_path + Path to the JSON Schema file to validate against. + + Returns + ------- + list[ValidationIssue] + A (de-duplicated) list of validation issues for this file. Empty means + the file is valid. + + Raises + ------ + RuntimeError + If the `jsonschema` dependency is not installed. + TypeError + If the schema file is not a JSON object. + json.JSONDecodeError + If either the schema or instance JSON cannot be parsed. + """ + try: + from jsonschema import Draft202012Validator # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Missing dependency `jsonschema`.\n\n" + "Install it with:\n" + " pip install jsonschema\n" + "or add it to `component-gallery/requirements.txt`." + ) from e + + schema = _load_schema(schema_path) + instance = _load_json(instance_path) + + validator = Draft202012Validator(schema) + issues: list[ValidationIssue] = [] + + for err in sorted(validator.iter_errors(instance), key=lambda x: list(x.path)): + # jsonschema gives a path deque; make it readable + json_path = _format_json_path(err.path) + message = err.message + + missing = _missing_required_fields(err) + if missing: + message = f"Missing required field(s): {', '.join(missing)}" + + issues.append( + ValidationIssue( + file=instance_path, + schema=schema_path, + message=message, + json_path=json_path, + ) + ) + + # De-dupe identical messages (common when multiple schemas report the same root-level issue) + deduped: list[ValidationIssue] = [] + seen: set[tuple[str, str]] = set() + for issue in issues: + key = (issue.json_path or "$", issue.message) + if key in seen: + continue + seen.add(key) + deduped.append(issue) + return deduped + + +def validate_components(repo_root: Path) -> list[ValidationIssue]: + """Validate all source component submissions under `components/`. + + Parameters + ---------- + repo_root + Path to the component-gallery repo root. + + Returns + ------- + list[ValidationIssue] + Validation issues across all `components/*.json` files. + """ + schema_path = repo_root / "schemas" / "component.schema.json" + components_dir = repo_root / "components" + + issues: list[ValidationIssue] = [] + for json_file in sorted(components_dir.glob("*.json")): + issues.extend(_validate_one(json_file, schema_path)) + return issues + + +def _is_https_url(url: str) -> bool: + parsed = urlparse(url) + return parsed.scheme == "https" and bool(parsed.netloc) + + +def _is_disallowed_url(url: str) -> bool: + """Reject obvious XSS / unsafe schemes even if schema is relaxed.""" + parsed = urlparse(url) + return parsed.scheme in {"javascript", "data", "file"} + + +# --- Image URL hardening ----------------------------------------------------- +# +# We want preview images to remain stable over time. In practice, the most common +# sources of broken images are: +# - Signed / expiring URLs (S3/GCS/CloudFront style query params) +# - Proxy URLs like `camo.githubusercontent.com` (can change/expire and is not the +# canonical image source) +# +# We enforce these constraints only for `media.image` (not general links). 
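+#
+# Illustrative examples (hosts/keys assumed to be covered by the shared policy in
+# `_utils/image_url_policy.py`):
+#   https://camo.githubusercontent.com/abc123                     -> rejected (proxy host)
+#   https://bucket.s3.amazonaws.com/shot.png?X-Amz-Expires=3600   -> rejected (signed/expiring)
+#   https://raw.githubusercontent.com/owner/repo/main/shot.png    -> allowed by these checks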
+ + +def _has_disallowed_image_query_params(url: str) -> bool: + parsed = urlparse(url) + for k, _ in parse_qsl(parsed.query, keep_blank_values=True): + if k.strip().lower() in DISALLOWED_IMAGE_QUERY_KEYS: + return True + return False + + +def _is_disallowed_image_host(url: str) -> bool: + parsed = urlparse(url) + host = (parsed.netloc or "").lower() + return host in DISALLOWED_IMAGE_HOSTS + + +def validate_policies( + repo_root: Path, *, max_component_bytes: int = 50_000 +) -> list[ValidationIssue]: + """Policy/lint checks beyond JSON Schema for `components/*.json`. + + This matches the tech spec's CI expectations: + - Unique component identity (unique GitHub owner/repo across submissions) + - HTTPS-only URLs + - Basic abuse guardrails (file size) + """ + schema_path = repo_root / "schemas" / "component.schema.json" + components_dir = repo_root / "components" + + issues: list[ValidationIssue] = [] + first_by_repo: dict[str, Path] = {} + + for json_file in sorted(components_dir.glob("*.json")): + # File size abuse guardrail + try: + size = json_file.stat().st_size + except OSError as e: # pragma: no cover + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=f"Could not stat file: {e}", + json_path=None, + ) + ) + continue + if size > max_component_bytes: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + f"File too large ({size} bytes). " + f"Max allowed is {max_component_bytes} bytes." + ), + json_path=None, + ) + ) + + # Best-effort JSON load for lint checks (schema validation handled separately) + try: + obj = _load_json(json_file) + except Exception: + continue + if not isinstance(obj, dict): + continue + + links = obj.get("links") + if not isinstance(links, dict): + continue + + gh = links.get("github") + if isinstance(gh, str) and gh: + # Extra HTTPS enforcement (schema already restricts, but keep as policy) + if _is_disallowed_url(gh) or not _is_https_url(gh): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path="links.github", + ) + ) + else: + try: + canonical = normalize_github_repo_url(gh) + key = urlparse(canonical).path.lower().strip("/") + if key in first_by_repo: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + f"Duplicate component identity: links.github repo `{key}` " + f"already submitted in `{first_by_repo[key].name}`." + ), + json_path="links.github", + ) + ) + else: + first_by_repo[key] = json_file + except Exception as e: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=str(e), + json_path="links.github", + ) + ) + + # Enforce HTTPS for other URL fields we accept + for path, val in ( + ("links.demo", links.get("demo")), + ("links.docs", links.get("docs")), + ): + if val is None: + continue + if isinstance(val, str): + if _is_disallowed_url(val) or not _is_https_url(val): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path=path, + ) + ) + + media = obj.get("media") + if isinstance(media, dict): + img = media.get("image") + if img is None: + # Image is optional; null is allowed. 
+ pass + elif isinstance(img, str): + if _is_disallowed_url(img) or not _is_https_url(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="URL must be https:// and must not use a disallowed scheme.", + json_path="media.image", + ) + ) + elif _is_disallowed_image_host(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + "Image host is not allowed for `media.image` " + "(brittle proxy). Use a stable upstream URL instead." + ), + json_path="media.image", + ) + ) + elif _has_disallowed_image_query_params(img): + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message=( + "Signed/expiring image URLs are not allowed for `media.image` " + "(disallowed query parameters detected)." + ), + json_path="media.image", + ) + ) + else: + issues.append( + ValidationIssue( + file=json_file, + schema=schema_path, + message="`media.image` must be a string URL or null.", + json_path="media.image", + ) + ) + + return issues + + +def validate_compiled(repo_root: Path) -> list[ValidationIssue]: + """Validate the compiled catalog artifact `compiled/components.json`. + + Parameters + ---------- + repo_root + Path to the component-gallery repo root. + + Returns + ------- + list[ValidationIssue] + Validation issues for the compiled artifact. If the artifact is missing, + returns a single issue indicating it was skipped. + """ + schema_path = repo_root / "schemas" / "compiled.schema.json" + compiled_path = repo_root / "compiled" / "components.json" + if not compiled_path.is_file(): + return [ + ValidationIssue( + file=compiled_path, + schema=schema_path, + message="Compiled artifact not found (skipping).", + json_path=None, + ) + ] + return _validate_one(compiled_path, schema_path) + + +def main(argv: list[str]) -> int: + """CLI entrypoint. + + Parameters + ---------- + argv + CLI arguments excluding the program name (i.e., ``sys.argv[1:]``). + + Returns + ------- + int + Process exit code: + + - 0: success + - 1: validation failed + - 2: configuration error (missing required files/dirs) + """ + parser = argparse.ArgumentParser( + description="Validate Component Gallery JSON files." + ) + parser.add_argument( + "--compiled", + action="store_true", + help="Also validate compiled/components.json against schemas/compiled.schema.json.", + ) + parser.add_argument( + "--no-policy", + action="store_true", + help="Disable policy/lint checks beyond schema validation.", + ) + parser.add_argument( + "--max-component-bytes", + type=int, + default=50_000, + help="Max allowed size for each components/*.json file (default: 50000).", + ) + args = parser.parse_args(argv) + + repo_root = Path(__file__).resolve().parents[1] + + all_issues: list[ValidationIssue] = [] + + # Guardrails for common mistakes + if not (repo_root / "schemas" / "component.schema.json").is_file(): + print("ERROR: Missing schema: schemas/component.schema.json", file=sys.stderr) + return 2 + if not (repo_root / "components").is_dir(): + print("ERROR: Missing directory: components/", file=sys.stderr) + return 2 + + all_issues.extend(validate_components(repo_root)) + if not args.no_policy: + all_issues.extend( + validate_policies(repo_root, max_component_bytes=args.max_component_bytes) + ) + if args.compiled: + all_issues.extend(validate_compiled(repo_root)) + + hard_errors = [i for i in all_issues if "skipping" not in i.message.lower()] + + if hard_errors: + # Group and compress output by file for readability. 
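+        # Illustrative output shape (hypothetical file name):
+        #   - components/example.json (1 error(s))
+        #     schema: schemas/component.schema.json
+        #     - links.github: URL must be https:// and must not use a disallowed scheme.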
+ by_file: dict[Path, list[ValidationIssue]] = defaultdict(list) + for issue in hard_errors: + by_file[issue.file].append(issue) + + total_files = len(by_file) + print( + f"Found {len(hard_errors)} validation error(s) across {total_files} file(s):", + file=sys.stderr, + ) + + for file_path in sorted(by_file.keys()): + issues = by_file[file_path] + # All issues for a given file share the same schema in our usage. + schema_path = issues[0].schema + rel = ( + file_path.relative_to(repo_root) + if file_path.is_absolute() + else file_path + ) + print(f"\n- {rel} ({len(issues)} error(s))", file=sys.stderr) + print(f" schema: {schema_path.relative_to(repo_root)}", file=sys.stderr) + for issue in sorted(issues, key=lambda i: (i.json_path or "$", i.message)): + jp = issue.json_path or "$" + print(f" - {jp}: {issue.message}", file=sys.stderr) + return 1 + + print("OK: all validated files passed.") + # If the only issues are "compiled missing (skipping)", be explicit. + skipped = [i for i in all_issues if "skipping" in i.message.lower()] + for s in skipped: + print(f"NOTE: {s.file} - {s.message}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..9d09f36a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +jsonschema>=4.25.1 +requests>=2.32.0