|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Validate docs/papers.yml. |
| 3 | +
|
| 4 | +Goal: a fast, deterministic PR check that catches the common formatting issues |
| 5 | +maintainers would otherwise check manually. |
| 6 | +
|
| 7 | +This is intentionally "simple yet robust": it validates structure and a few |
| 8 | +key invariants, without trying to enforce stylistic preferences. |
| 9 | +
|
| 10 | +Rules: |
| 11 | +- YAML must parse. |
| 12 | +- Top-level must be a mapping with key `papers` as a list. |
| 13 | +- Each paper must be a mapping with required keys: |
| 14 | + - title: non-empty string |
| 15 | + - authors: non-empty list of strings |
| 16 | + - link: non-empty string |
| 17 | + - date: string in YYYY-MM-DD format (basic ISO date) |
| 18 | + - image: string that is either: |
| 19 | + - absolute http(s) URL, OR |
| 20 | + - a basename filename that exists under docs/src/public/images/ |
| 21 | +
|
| 22 | +We also reject a common footgun: |
| 23 | +- image URLs pointing at the *temporary* bot branch `paper-images/pr-...`. |
| 24 | +""" |
| 25 | + |
| 26 | +from __future__ import annotations |
| 27 | + |
| 28 | +import os |
| 29 | +import re |
| 30 | +import sys |
| 31 | +import datetime as _dt |
| 32 | +from pathlib import Path |
| 33 | + |
| 34 | +import yaml |
| 35 | + |
# Relative paths: they resolve against the current working directory
# (presumably the repository root -- confirm against the CI job).
PAPERS_PATH = Path("docs/papers.yml")
IMAGES_DIR = Path("docs/src/public/images")

# Date shape only: four digits, two digits, two digits, dash-separated.
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# File extensions accepted for local images (matched case-insensitively).
IMAGE_EXT_RE = re.compile(r"\.(png|jpe?g|webp)$", re.IGNORECASE)
# Matches a `/paper-images/pr-<digits>` path segment, i.e. a URL that points
# at the temporary bot branch, which papers.yml must not reference.
TEMP_BOT_BRANCH_RE = re.compile(r"/paper-images/pr-\d+(/|$)")
| 42 | + |
| 43 | + |
def fail(msg: str) -> "NoReturn":
    """Report a fatal error and abort the script.

    Writes ``msg`` to standard output with an ``ERROR: `` prefix and then
    terminates by raising ``SystemExit`` with exit status 1. Never returns.
    """
    report = f"ERROR: {msg}"
    print(report)
    sys.exit(1)
| 47 | + |
| 48 | + |
def is_nonempty_str(x: object) -> bool:
    """Tell whether ``x`` is a string containing something besides whitespace.

    Returns ``False`` for any non-``str`` value, for the empty string, and
    for strings made up solely of whitespace; ``True`` otherwise.
    """
    if not isinstance(x, str):
        return False
    return bool(x.strip())
| 51 | + |
| 52 | + |
def _date_errors(date: object, prefix: str) -> list[str]:
    """Return the validation errors for one paper's ``date`` value."""
    # PyYAML parses unquoted ISO dates as datetime.date. A datetime.datetime
    # is also a date instance, but its isoformat() carries a time part, so
    # DATE_RE rejects it below.
    if isinstance(date, _dt.date):
        date_s = date.isoformat()
    elif isinstance(date, str):
        date_s = date.strip()
    else:
        date_s = ""

    if not date_s or not DATE_RE.match(date_s):
        return [f"{prefix}.date: required YYYY-MM-DD"]

    # DATE_RE only checks the shape; make sure the value is also a real
    # calendar date (rejects e.g. 2024-02-30 or 2024-13-01).
    try:
        _dt.date.fromisoformat(date_s)
    except ValueError:
        return [f"{prefix}.date: {date_s!r} is not a valid calendar date"]
    return []


def _image_errors(image: object, prefix: str) -> list[str]:
    """Return the validation errors for one paper's ``image`` value."""
    if not is_nonempty_str(image):
        return [f"{prefix}.image: required string"]

    image_s = image.strip()

    if image_s.startswith(("http://", "https://")):
        # Reject temporary bot-branch URLs (these are not stable and shouldn't
        # be used in papers.yml).
        if TEMP_BOT_BRANCH_RE.search(image_s):
            return [
                f"{prefix}.image: points to temporary bot branch URL; use a stable URL (e.g. PySR_Docs master)"
            ]
        return []

    # Must be a basename only (no paths)
    if os.path.basename(image_s) != image_s or "/" in image_s or "\\" in image_s:
        return [
            f"{prefix}.image: local image must be a basename (e.g. myfig.jpg), not a path"
        ]

    errors: list[str] = []
    if not IMAGE_EXT_RE.search(image_s):
        errors.append(
            f"{prefix}.image: local image must end in .png/.jpg/.jpeg/.webp"
        )
    img_path = IMAGES_DIR / image_s
    # is_file() rather than exists(): a directory with an image-like name
    # must not count as a present image.
    if not img_path.is_file():
        errors.append(
            f"{prefix}.image: local image {image_s!r} not found at {img_path}"
        )
    return errors


def _paper_errors(paper: object, prefix: str) -> list[str]:
    """Return every validation error for a single ``papers`` entry.

    Errors are produced in field order: title, authors, link, date, image.
    """
    if not isinstance(paper, dict):
        return [f"{prefix}: must be a mapping"]

    errors: list[str] = []

    if not is_nonempty_str(paper.get("title")):
        errors.append(f"{prefix}.title: required non-empty string")

    authors = paper.get("authors")
    if (
        not isinstance(authors, list)
        or not authors
        or not all(is_nonempty_str(a) for a in authors)
    ):
        errors.append(f"{prefix}.authors: required non-empty list of strings")

    if not is_nonempty_str(paper.get("link")):
        errors.append(f"{prefix}.link: required non-empty string")

    errors.extend(_date_errors(paper.get("date"), prefix))
    errors.extend(_image_errors(paper.get("image"), prefix))
    return errors


def main() -> None:
    """Validate ``PAPERS_PATH`` and exit non-zero on any problem.

    Structural problems (missing or unreadable file, YAML that does not
    parse, wrong top-level shape) abort immediately through ``fail``.
    Per-paper problems are collected across all entries and printed together
    before exiting.

    Raises:
        SystemExit: with status 1 when any validation error is found.
    """
    if not PAPERS_PATH.exists():
        fail(f"Missing {PAPERS_PATH}")

    try:
        raw = PAPERS_PATH.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Report unreadable or non-UTF-8 files as a clean error rather than
        # a traceback.
        fail(f"Cannot read {PAPERS_PATH}: {e}")

    try:
        obj = yaml.safe_load(raw)
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and any
        # loader failure should be reported as a parse error.
        fail(f"YAML parse error in {PAPERS_PATH}: {e}")

    if not isinstance(obj, dict):
        fail(f"{PAPERS_PATH} must be a mapping at top level")

    papers = obj.get("papers")
    if not isinstance(papers, list):
        fail(f"{PAPERS_PATH} must contain `papers:` as a list")

    errors: list[str] = []
    for i, paper in enumerate(papers):
        errors.extend(_paper_errors(paper, f"papers[{i}]"))

    if errors:
        print("Papers.yml validation failed:\n")
        for e in errors:
            print(f"- {e}")
        raise SystemExit(1)

    print("OK: docs/papers.yml looks valid")
| 144 | + |
| 145 | + |
# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments