diff --git a/.github/workflows/validate_pr.yml b/.github/workflows/validate_pr.yml index 36ea8561..d24bbf16 100644 --- a/.github/workflows/validate_pr.yml +++ b/.github/workflows/validate_pr.yml @@ -11,6 +11,7 @@ on: - 'schema/platforms.json' - 'tools/generate_platforms_matrix.py' - 'README.md' + - 'leaderboard/site/**' jobs: validate-runners: @@ -205,4 +206,23 @@ jobs: issue_number: context.issue.number, body, }); - } \ No newline at end of file + } + + frontend-tests: + name: Frontend unit tests (modal viz) + runs-on: ubuntu-latest + # Frontend tests don't need full git history; a shallow checkout + # plus Node 20 (which ships node:test) is enough. + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: '20' + + # No npm install — the suite intentionally has zero runtime deps + # (a hand-written DOM stub stands in for jsdom / happy-dom). Add + # extra files to leaderboard/site/test/ to widen coverage; the + # glob below picks them up automatically. + - name: Run leaderboard frontend tests + run: node --test leaderboard/site/test/*.test.mjs \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2ea1c8c9..07e5795e 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,9 @@ mini_result/ *_backup/ backup/ /tmp/ + +# ── Local-only handoff notes (across-session continuity) ──────────────────── +.handoff-*.md + +# ── Local-only design QA screenshots ──────────────────────────────────────── +.screenshots/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5dda79da..6ff8d5d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -346,6 +346,69 @@ verified result is itself reproducible by definition. --- +## How your result appears on the leaderboard + +The frontend treats every submission as data — there's nothing per-vendor or per-chip hand-coded on the UI side. A few conventions are worth knowing if you want your result to look its best: + +### Chip identity vs. 
chip count + +The chip-detail page (`#/chip/<slug>`) is keyed on the **chip model** alone, not on `chip_count`. That means a single chip page aggregates every fan-out you've ever submitted (×1, ×4, ×8, ×16) into one overview, with the runs table sorted by `chip_count` ascending and the per-suite KPI card flagging the deployment behind the best score (e.g. `×8` badge next to the metric). + +Implication: if your runner emits one `result.json` per chip-count, you don't need to invent fake chip names to keep them apart — submit them with the same `chip` field and they'll merge cleanly. Old `…-x<N>` URLs from before this change auto-redirect to the bare-model slug, so existing shared links keep working. + +### Vendor colours + +Vendor accents (chip dot, vendor pill, peer card border) are driven entirely by `assets/js/data.js`'s `VENDOR_COLORS` map. New vendors get a deterministic fallback colour from a 9-entry palette the first time they appear in the dataset — no frontend change required to ship the result. + +If you want a brand-accurate accent for a new vendor (e.g. your accelerator's official colour), open a one-line PR to `VENDOR_COLORS`: + +```js +export const VENDOR_COLORS = { + // … + Cerebras: "#ff6f3c", // ← add yours +}; +``` + +`VENDOR_ORDER` (used to lay out the rankings facet pill row) is derived from `Object.keys(VENDOR_COLORS)`, so the same edit also pins your vendor's position in the brand list. Vendors not in the map are appended alphabetically after the brand-named ones. + +### Optional: viz fields for richer modal charts + +The run-detail modal's **Visualize** tab is hidden when `result.viz` is absent. 
Populate it to surface scenario-specific charts: + +```jsonc +{ + "viz": { + "type": "decode", // bandwidth-bound suite (A/F/G default) + "offline": { + "labels": [1, 2, 4, 8, 16, 32], + "throughput": [120, 230, 410, 760, 1100, 1380] + }, + "interactive": { // optional, suite_D-style + "ttft_p50": 78, "ttft_p90": 110, "ttft_p99": 135, + "tpot_p50": 9, "tpot_p90": 11, "tpot_p99": 14 + } + } +} +``` + +Suite-specific shapes the frontend understands today: + +| `viz.type` | Suites | Required keys | +|---|---|---| +| `decode` | A · F · G | `offline.labels[]`, `offline.throughput[]` | +| `multichip` | B | `offline.labels[]`, `offline.throughput[]`, `offline.throughput_per_chip[]` | +| `quant` | C | `precisions[]`, `throughput[]`, optional `accuracy[]` | +| `longctx` | D | `offline.labels[]`, `offline.throughput[]`, `interactive.{ttft_p50,…,tpot_p99}` | +| `scaling` | E | `chip_counts[]`, `throughput[]`, `efficiency_pct[]` | + +`viz` is **fully optional** — runners that don't emit it still get a clean Details / Implementation modal. When present, the same fields drive the per-suite head-to-head charts on the Compare page (so two basket runs render directly comparable visualisations instead of falling back to a metric-table-only view). + +### Submitter handle + +The leaderboard surfaces the value of `meta.submitted_by` as `@<handle>` next to your result on every list (home recent, suite cards, chip-detail submissions table). Anything that looks like a GitHub login, an email, or a `Name <email>` form is reduced to the local-part — see `submitterHandle` in `assets/js/utils.js`. 
+ +--- + ## Using local or air-gapped models AccelMark separates the **model identifier** (used for leaderboard comparisons) diff --git a/leaderboard/generate.py b/leaderboard/generate.py index 65bfbdd0..4a9ac5ab 100644 --- a/leaderboard/generate.py +++ b/leaderboard/generate.py @@ -6,6 +6,7 @@ python leaderboard/generate.py """ +import hashlib import json import re import statistics @@ -48,6 +49,65 @@ def _get_suite_precision_required(suite_id: str) -> str: return "BF16" +def _collect_suite_specs() -> dict: + """Collect UI-relevant per-suite spec from suites/suite_*/suite.json. + + Baked into the generated leaderboard.js as ``window.SUITE_SPECS`` so + the static leaderboard UI auto-syncs whenever a maintainer edits a + suite contract — model id, dataset, prompt distribution, scenarios + default/extra split, online SLA, etc. Editorial UI content (titles, + taglines, descriptions) stays in assets/js/data.js since it isn't a + property of the suite contract. + + Returns a ``{ suite_id: spec }`` mapping with only the fields the UI + consumes. Missing fields are omitted (the JS-side merge keeps the + hardcoded fallback when a key is absent). + """ + out: dict = {} + suites_dir = Path("suites") + if not suites_dir.exists(): + return out + for sd in sorted(suites_dir.iterdir()): + if not sd.is_dir(): + continue + sf = sd / "suite.json" + if not sf.exists(): + continue + try: + with open(sf) as f: + data = json.load(f) + except Exception: + continue + sid = data.get("suite_id") or sd.name + rd = data.get("request_distribution") or {} + scn = data.get("scenarios") or {} + spec: dict = {} + # Fields the UI displays in suite cards / specs / compare headers. 
+ for k in ( + "model_id", + "model_revision", + "dataset", + "precision_required", + "allowed_precisions", + "max_model_len", + "concurrency_levels", + "online_qps_levels", + "online_sla_ttft_ms", + ): + if k in data and data[k] is not None: + spec[k] = data[k] + if rd.get("input_tokens_p50") is not None: + spec["input_tokens_p50"] = rd["input_tokens_p50"] + if rd.get("output_tokens_p50") is not None: + spec["output_tokens_p50"] = rd["output_tokens_p50"] + if scn.get("default"): + spec["scenarios_default"] = list(scn["default"]) + if scn.get("extra"): + spec["scenarios_extra"] = list(scn["extra"]) + out[sid] = spec + return out + + # ── Data loading ────────────────────────────────────────────────────────────── def load_results() -> list[dict]: @@ -1080,15 +1140,55 @@ def main(): rows = _deduped + suite_specs = _collect_suite_specs() + SITE_DIR.mkdir(parents=True, exist_ok=True) out_path = SITE_DIR / "leaderboard.js" with open(out_path, "w") as f: f.write("// Auto-generated by leaderboard/generate.py. Do not edit manually.\n") + # window.LEADERBOARD_DATA so ES modules (assets/js/data.js) can read it. + # Also exposed as bare LEADERBOARD_DATA for any legacy classic-script consumers. f.write(f"const LEADERBOARD_DATA = {json.dumps(rows, indent=2)};\n") + f.write("window.LEADERBOARD_DATA = LEADERBOARD_DATA;\n") + # window.SUITE_SPECS — canonical per-suite spec from suites/suite_*/suite.json. + # data.js merges these into SUITE_META at init() so UI facts auto-sync + # with a suite contract edit (no JS to keep in step manually). + f.write(f"const SUITE_SPECS = {json.dumps(suite_specs, indent=2)};\n") + f.write("window.SUITE_SPECS = SUITE_SPECS;\n") + + print(f"Leaderboard data written to {out_path} " + f"({len(rows)} rows, {len(suite_specs)} suite specs).") + + # Cache-bust leaderboard.js in index.html so a stale CDN / browser + # cached copy never out-survives the data refresh. 
GitHub Pages + # serves the static files with a 10-minute Cache-Control by default + # and *no* ETag-aware revalidation on cross-domain ` - - + + +
+
Loading leaderboard…
+
+ + + + + + + + + + - \ No newline at end of file + diff --git a/leaderboard/site/style.css b/leaderboard/site/style.css deleted file mode 100644 index bad295be..00000000 --- a/leaderboard/site/style.css +++ /dev/null @@ -1,92 +0,0 @@ -*, *::before, *::after { box-sizing: border-box; } - -body { - font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; - margin: 0; - background: #f8f9fa; - color: #212529; -} - -header { - background: #1a1a2e; - color: #fff; - padding: 2rem; - text-align: center; -} - -header h1 { margin: 0 0 0.5rem; font-size: 2rem; } -header p { margin: 0 0 1rem; opacity: 0.8; } - -.btn { - display: inline-block; - margin: 0 0.4rem; - padding: 0.5rem 1.2rem; - background: #e94560; - color: #fff; - border-radius: 4px; - text-decoration: none; - font-size: 0.9rem; -} -.btn:hover { background: #c73652; } - -main { max-width: 1400px; margin: 0 auto; padding: 1.5rem; } - -#filters { - display: flex; - gap: 1.5rem; - flex-wrap: wrap; - margin-bottom: 1rem; - background: #fff; - padding: 1rem 1.5rem; - border-radius: 6px; - box-shadow: 0 1px 3px rgba(0,0,0,.1); -} - -#filters label { font-size: 0.9rem; } -#filters select { - margin-left: 0.4rem; - padding: 0.3rem 0.5rem; - border: 1px solid #ced4da; - border-radius: 4px; -} - -.table-wrap { overflow-x: auto; } - -table { - width: 100%; - border-collapse: collapse; - background: #fff; - border-radius: 6px; - overflow: hidden; - box-shadow: 0 1px 3px rgba(0,0,0,.1); - font-size: 0.88rem; -} - -th { - background: #1a1a2e; - color: #fff; - padding: 0.7rem 0.8rem; - text-align: left; - white-space: nowrap; -} - -td { - padding: 0.6rem 0.8rem; - border-bottom: 1px solid #e9ecef; -} - -tr:last-child td { border-bottom: none; } -tr:hover td { background: #f1f3f5; } - -.tier-verified { color: #2f9e44; font-weight: 600; } -.tier-community { color: #1971c2; } - -.acc-valid { color: #2f9e44; } -.acc-invalid { color: #e03131; } - -footer { - text-align: center; - padding: 2rem; - color: 
#6c757d; - font-size: 0.85rem; -} diff --git a/leaderboard/site/test/chip_count_scaling.test.mjs b/leaderboard/site/test/chip_count_scaling.test.mjs new file mode 100644 index 00000000..24413ffa --- /dev/null +++ b/leaderboard/site/test/chip_count_scaling.test.mjs @@ -0,0 +1,98 @@ +// chip_count_scaling.test.mjs — guard the chip-detail scaling chart. +// +// chipCountScaling(slug) feeds the grouped bar chart on the chip-detail +// 03 · Scaling section. Two regressions to lock down: +// +// 1. Single-fan-out chips return an empty `suites` list — the +// section depends on `data.suites.length` to decide whether to +// render at all, and rendering an empty bar chart for, say, an +// Apple M2 (always ×1) would look like a layout bug. +// 2. Per-suite normalization is intra-cluster only. Each chip-count +// cell on a suite is normalised to that suite's max value across +// the chip's own variants — NOT to a global max. A regression +// that conflates them would make every bar read 100% on bandwidth +// suites and ~0% on multi-chip suites. + +import test from "node:test"; +import assert from "node:assert/strict"; + +import { installDom } from "./dom_stub.mjs"; + +installDom(); + +// Two synthetic chips: +// ChipScale: ×1, ×4, ×8 on suite_A; only ×1 on suite_B (so ×4/×8 +// cells should be zero-filled, not absent). +// ChipSolo: only ×1 on suite_A — drives the "empty suites list" +// branch via chipCounts < 2. 
+globalThis.window.LEADERBOARD_DATA = [ + // ChipScale on suite_A + { run_id: "scale-a-1", date: "2026-04-01", suite: "suite_A", chip: "ChipScale", chip_count: 1, vendor: "V", + offline_throughput: 100, primary_metric: 100 }, + { run_id: "scale-a-4", date: "2026-04-02", suite: "suite_A", chip: "ChipScale", chip_count: 4, vendor: "V", + offline_throughput: 380, primary_metric: 380 }, + { run_id: "scale-a-8", date: "2026-04-03", suite: "suite_A", chip: "ChipScale", chip_count: 8, vendor: "V", + offline_throughput: 700, primary_metric: 700 }, + // ChipScale on suite_B (only ×1 — should still render a row with ×4/×8 cells nullified) + { run_id: "scale-b-1", date: "2026-04-04", suite: "suite_B", chip: "ChipScale", chip_count: 1, vendor: "V", + offline_throughput: 50, primary_metric: 50 }, + // ChipSolo on suite_A — single fan-out → scaling section should suppress. + { run_id: "solo-a-1", date: "2026-04-05", suite: "suite_A", chip: "ChipSolo", chip_count: 1, vendor: "V", + offline_throughput: 200, primary_metric: 200 }, +]; + +const { chipCountScaling, init } = await import("../assets/js/data.js"); +init(); + +test("chipCountScaling: chip with a single fan-out returns empty suites + chipCounts of length 1", () => { + const out = chipCountScaling("chipsolo"); + assert.deepEqual(out.chipCounts, [1]); + assert.equal(out.suites.length, 0); +}); + +test("chipCountScaling: multi-fan-out chip surfaces every chip_count and normalises within suite", () => { + const out = chipCountScaling("chipscale"); + assert.deepEqual(out.chipCounts, [1, 4, 8]); + // suite_A should be present with a perCount Map covering 1/4/8. 
+ const suiteA = out.suites.find((s) => s.sid === "suite_A"); + assert.ok(suiteA, "suite_A entry should exist"); + const cellA1 = suiteA.perCount.get(1); + const cellA4 = suiteA.perCount.get(4); + const cellA8 = suiteA.perCount.get(8); + assert.equal(cellA1.value, 100); + assert.equal(cellA4.value, 380); + assert.equal(cellA8.value, 700); + // Normalised against the cluster max (700). + assert.equal(cellA1.normalized, 100 / 700); + assert.equal(cellA4.normalized, 380 / 700); + assert.equal(cellA8.normalized, 1.0); +}); + +test("chipCountScaling: cells without a submission for that chip-count come back as null + zero", () => { + const out = chipCountScaling("chipscale"); + const suiteB = out.suites.find((s) => s.sid === "suite_B"); + assert.ok(suiteB, "suite_B should still render so the chart shows the gap"); + // ×1 has data, ×4 and ×8 don't — must be null/zero, not missing keys. + assert.equal(suiteB.perCount.get(1).value, 50); + assert.equal(suiteB.perCount.get(1).normalized, 1.0); + assert.equal(suiteB.perCount.get(4).value, null); + assert.equal(suiteB.perCount.get(4).normalized, 0); + assert.equal(suiteB.perCount.get(8).value, null); + assert.equal(suiteB.perCount.get(8).normalized, 0); +}); + +test("chipCountScaling: suites the chip never submitted to (any count) are omitted entirely", () => { + const out = chipCountScaling("chipscale"); + // ChipScale never touched suites C/D/E/F/G — they should NOT appear, + // saving the chart from rendering empty clusters. 
+ for (const sid of ["suite_C", "suite_D", "suite_E", "suite_F", "suite_G"]) { + assert.equal(out.suites.find((s) => s.sid === sid), undefined, + `${sid} should be omitted when the chip has no data on it at any chip_count`); + } +}); + +test("chipCountScaling: unknown chip slug returns a stable shape (no throw, empty suites)", () => { + const out = chipCountScaling("does-not-exist"); + assert.deepEqual(out.chipCounts, []); + assert.deepEqual(out.suites, []); +}); diff --git a/leaderboard/site/test/chip_slug.test.mjs b/leaderboard/site/test/chip_slug.test.mjs new file mode 100644 index 00000000..d9a4c88e --- /dev/null +++ b/leaderboard/site/test/chip_slug.test.mjs @@ -0,0 +1,65 @@ +// chip_slug.test.mjs — guard the chip-detail URL contract. +// +// chipSlug deliberately does NOT encode chip_count: 4090D, 4090D ×4, +// 4090D ×8 are the same hardware and share a chip-detail page. This +// is a behaviour change from the original implementation, where the +// slug used to carry an "-x" suffix. Two regressions we'd want to +// catch fast: +// +// 1. someone re-introducing chip_count into chipSlug — would re-fork +// ×1 / ×4 / ×8 back into separate detail pages. +// 2. someone removing the legacy "-x" → bare-model normaliser — +// would 404 every shared link / bookmark from before the change. 
+ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { chipSlug, normalizeChipSlug, slugify } from "../assets/js/utils.js"; + +test("chipSlug: same chip at different chip_count produces the same slug", () => { + const x1 = { chip: "RTX 4090D", chip_count: 1 }; + const x4 = { chip: "RTX 4090D", chip_count: 4 }; + const x8 = { chip: "RTX 4090D", chip_count: 8 }; + assert.equal(chipSlug(x1), chipSlug(x4)); + assert.equal(chipSlug(x4), chipSlug(x8)); + assert.equal(chipSlug(x1), slugify("RTX 4090D")); +}); + +test("chipSlug: missing chip / row returns empty string", () => { + assert.equal(chipSlug(null), ""); + assert.equal(chipSlug(undefined), ""); + assert.equal(chipSlug({}), ""); +}); + +test("chipSlug: precomputed _chip_slug wins over recomputation", () => { + const row = { chip: "Will Be Ignored", _chip_slug: "precomputed-value" }; + assert.equal(chipSlug(row), "precomputed-value"); +}); + +test("normalizeChipSlug: legacy -x shape is mapped back to the bare model", () => { + assert.equal(normalizeChipSlug("nvidia-rtx-4090d-x4"), "nvidia-rtx-4090d"); + assert.equal(normalizeChipSlug("apple-m4-max-x1"), "apple-m4-max"); + assert.equal(normalizeChipSlug("h200-x16"), "h200"); +}); + +test("normalizeChipSlug: new bare-model slugs are reported unchanged (null)", () => { + // Returning null lets the router skip the rewrite + replaceState + // dance for slugs that are already in the new shape. + assert.equal(normalizeChipSlug("nvidia-rtx-4090d"), null); + assert.equal(normalizeChipSlug("apple-m4-max"), null); + assert.equal(normalizeChipSlug(""), null); + assert.equal(normalizeChipSlug(null), null); +}); + +test("normalizeChipSlug: known boundary — slug tokens that look like -x", () => { + // The regex matches any `-x$` tail, so a hypothetical chip + // ending in "x86" or "x64" would also be rewritten. Document this + // here so a future refactor doesn't widen the regex by accident. 
+ // No real chip in the dataset has this shape today; if one ever + // ships, switch the redirect to a known-suffix allowlist or check + // the rewrite target actually exists in `_byChip` before applying. + assert.equal(normalizeChipSlug("foo-x86"), "foo"); + // Suffix with non-digits stays put — no rewrite. + assert.equal(normalizeChipSlug("foo-xy"), null); + assert.equal(normalizeChipSlug("foo-x"), null); +}); diff --git a/leaderboard/site/test/dom_stub.mjs b/leaderboard/site/test/dom_stub.mjs new file mode 100644 index 00000000..50bb2eb5 --- /dev/null +++ b/leaderboard/site/test/dom_stub.mjs @@ -0,0 +1,139 @@ +// dom_stub.mjs — Tiny DOM + Chart.js stand-in for headless node tests. +// +// modal.js's Visualize tab renderer touches a small DOM surface +// (createElement / appendChild / classList / innerHTML / textContent / +// dataset / style + getComputedStyle on the documentElement) plus the +// global Chart constructor. Spinning up jsdom or happy-dom for that is +// overkill; this file ships the minimum implementation required to make +// `_test.renderViz(panel, row)` reach every per-suite branch without +// throwing. When the real DOM grows another usage in modal.js, add it +// here — the failing test will tell you exactly which method. + +class FakeEl { + constructor(tag) { + this.tagName = String(tag).toUpperCase(); + this.children = []; + this._attrs = {}; + this._cls = new Set(); + this._innerHTML = ""; + this._textContent = ""; + this.style = {}; + this.dataset = {}; + this.classList = { + add: (c) => this._cls.add(c), + remove: (c) => this._cls.delete(c), + toggle: (c, v) => (v ? 
this._cls.add(c) : this._cls.delete(c)), + contains: (c) => this._cls.has(c), + }; + } + set className(v) { + this._cls = new Set(String(v).split(/\s+/).filter(Boolean)); + } + get className() { return [...this._cls].join(" "); } + // Mirror DOM `el.id` ↔ `el.getAttribute("id")` so callers that use + // either form (data.js's injectVendorStyles uses the property form) + // both update the same backing slot the head's appendChild looks at. + set id(v) { this._attrs.id = String(v); } + get id() { return this._attrs.id || ""; } + set innerHTML(v) { + this._innerHTML = String(v == null ? "" : v); + if (this._innerHTML === "") this.children = []; + } + get innerHTML() { return this._innerHTML; } + set textContent(v) { this._textContent = String(v); } + get textContent() { return this._textContent; } + appendChild(c) { this.children.push(c); return c; } + removeChild(c) { + const i = this.children.indexOf(c); + if (i >= 0) this.children.splice(i, 1); + return c; + } + setAttribute(k, v) { this._attrs[k] = v; } + removeAttribute(k) { delete this._attrs[k]; } + getAttribute(k) { return this._attrs[k] ?? null; } + // Synthetic `.click()` fired by downloadCanvasAsPng to + // trigger a browser download. Tests only need it to not throw — + // the actual download has no real-DOM observable side effect. + click() {} + // Minimal querySelector stub: we only ever need to call this for + // helpers like flashButtonLabel that look for ".copy-btn-label" + // inside a button. Returning null lets the helper fall through to + // its "label is the button itself" branch, which the tests assert. + querySelector() { return null; } + querySelectorAll() { return []; } + + // Recursively flatten children — handy for assertions that care + // about how many cards / canvases the renderer produced regardless + // of nesting depth. 
+ descendants() { + const out = []; + const walk = (el) => { + for (const c of el.children) { + out.push(c); + if (c instanceof FakeEl) walk(c); + } + }; + walk(this); + return out; + } +} + +// installDom() wires fake document + window onto globalThis and returns +// helpers a test uses to inspect chart creation. Each call resets +// counters so test cases stay independent. +export function installDom() { + const created = []; + + function FakeChart(canvas, config) { + this.canvas = canvas; + this.config = config; + this.destroyed = false; + this.destroy = () => { this.destroyed = true; }; + created.push(this); + } + + // `_byId` lets injectVendorStyles in data.js mount its singleton + //