diff --git a/.github/workflows/generate_leaderboard.yml b/.github/workflows/generate_leaderboard.yml
index 550c13ba..31dec72d 100644
--- a/.github/workflows/generate_leaderboard.yml
+++ b/.github/workflows/generate_leaderboard.yml
@@ -1,15 +1,22 @@
 name: Generate Leaderboard
 
-# Triggers only when results/ changes land on main.
-# This covers both direct merges and squash merges.
+# Triggers when anything that can change the rendered leaderboard lands on
+# main: new community/verified results, generator code, the static site, or
+# the inputs to the README platforms matrix (so a runner/platform metadata
+# change also redeploys). This covers both direct merges and squash merges.
 on:
   push:
     branches:
       - main
     paths:
       - 'results/**'
+      - 'leaderboard/**'
+      - 'tools/generate_platforms_matrix.py'
+      - 'schema/platforms.json'
+      - 'runners/*/meta.json'
 
-  # Allow manual trigger from Actions tab (useful for first deploy)
+  # Allow manual trigger from Actions tab (useful for first deploy or to
+  # force a redeploy when nothing in the watched paths changed).
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/validate_pr.yml b/.github/workflows/validate_pr.yml
index 0ce5ada4..36ea8561 100644
--- a/.github/workflows/validate_pr.yml
+++ b/.github/workflows/validate_pr.yml
@@ -1,12 +1,16 @@
 name: Validate Submission
 
-# Triggers only when results/ directory is touched in a PR.
-# Schema changes, script changes, and doc changes do not trigger this.
+# Triggers when results/, anything under runners/, the platforms catalogue,
+# the README (which contains the auto-generated matrix), or the matrix
+# generator itself is touched in a PR.
 on:
   pull_request:
     paths:
       - 'results/**'
       - 'runners/**'
+      - 'schema/platforms.json'
+      - 'tools/generate_platforms_matrix.py'
+      - 'README.md'
 
 jobs:
   validate-runners:
@@ -66,6 +70,24 @@ jobs:
         if: steps.changed.outputs.folders == ''
         run: echo "No runner folders changed in this PR — skipping."
 
+      # Always validate every runner folder (not just the ones touched in
+      # this PR). This catches drift introduced by shared changes — e.g.
+      # a meta.schema.json edit that breaks an unrelated existing runner.
+      - name: Validate all runner folders (drift check)
+        run: |
+          echo "::group::Validating every runner folder in the repo"
+          python runners/validate_runners.py
+          echo "::endgroup::"
+
+      # README "Supported platforms" matrix is generated from each runner's
+      # meta.json. If a PR changes a runner's suite_support / hardware_label
+      # or adds a new runner without regenerating the table, fail.
+      - name: README platforms matrix is in sync
+        run: |
+          echo "::group::tools/generate_platforms_matrix.py --check"
+          python tools/generate_platforms_matrix.py --check
+          echo "::endgroup::"
+
   validate:
     name: Validate result submissions
     runs-on: ubuntu-latest
diff --git a/.gitignore b/.gitignore
index 503a227f..2ea1c8c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,27 +1,50 @@
+# ── Python ──────────────────────────────────────────────────────────────────
 __pycache__/
 *.py[cod]
+*.egg
 *.egg-info/
 dist/
 build/
 .venv/
 venv/
 env/
-*.egg
+
+# ── Editor / IDE ────────────────────────────────────────────────────────────
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+*.tmp
 .DS_Store
-*.log
-my_submission/
-mini_result/
-/tmp/
-leaderboard/site/leaderboard.js
-leaderboard/site/api/
+
+# ── Test / lint caches ──────────────────────────────────────────────────────
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+
+# ── Jupyter ─────────────────────────────────────────────────────────────────
+.ipynb_checkpoints/
+
+# ── AccelMark local-only files ──────────────────────────────────────────────
 configs/models_local.yaml
 configs/submitter.yaml
 configs/runner_configs/*.yaml
+leaderboard/site/leaderboard.js
+leaderboard/site/api/
 
-# Local-only benchmark artifacts (not needed for submission)
+# ── Benchmark artifacts (local-only — not part of submissions) ──────────────
+samples.jsonl
+samples.jsonl.ipynb_checkpoints/
 accuracy_outputs.jsonl
 run.log
-samples.jsonl.ipynb_checkpoints/
+*.log
+my_submission/
+mini_result/
 *_backup/
 backup/
-.ipynb_checkpoints/
+/tmp/
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..354654ba
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,144 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and maintainers pledge to make participation in
+AccelMark a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic
+status, nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances
+  of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Benchmark-specific expectations
+
+AccelMark is a results-driven leaderboard. The following are specifically
+unacceptable:
+
+* **Cherry-picked, doctored, or fabricated results.** Submitting a result
+  knowing it does not reflect the listed hardware / software is misconduct,
+  not a mistake. Mistakes are expected and welcome; fabrication is not.
+* **Misrepresentation of affiliation.** Vendor employees may submit results
+  for their own hardware (it is encouraged) — but the `[vendor]` tag in the
+  submitter name must be present, per `CONTRIBUTING.md`.
+* **Disparaging another vendor or contributor's hardware in PR/issue
+  comments.** Numbers speak; commentary should focus on methodology and
+  reproducibility, not on the entity behind a competing result.
+
+## Enforcement Responsibilities
+
+Project maintainers are responsible for clarifying and enforcing our
+standards of acceptable behavior and will take appropriate and fair
+corrective action in response to any behavior that they deem inappropriate,
+threatening, offensive, or harmful.
+
+Maintainers have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that
+are not aligned to this Code of Conduct, and will communicate reasons for
+moderation decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies
+when an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail
+address, posting via an official social media account, or acting as an
+appointed representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the project maintainers by opening a confidential security
+advisory at <https://github.com/JuhaoLiang1997/AccelMark/security/advisories/new>
+or, when GitHub access is not available, by emailing the maintainer listed
+in the repository profile. All complaints will be reviewed and investigated
+promptly and fairly.
+
+All project maintainers are obligated to respect the privacy and security of
+the reporter of any incident.
+
+## Enforcement Guidelines
+
+Project maintainers will follow these Community Impact Guidelines in
+determining the consequences for any action they deem in violation of this
+Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from a maintainer, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, for a specified period of time.
+Violating these terms may lead to a temporary or permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public
+or private interaction with the people involved, including unsolicited
+interaction with those enforcing the Code of Conduct, is allowed during
+this period. Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+For answers to common questions about this code of conduct, see the FAQ at
+<https://www.contributor-covenant.org/faq>. Translations are available at
+<https://www.contributor-covenant.org/translations>.
+
+[homepage]: https://www.contributor-covenant.org
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f4b2e798..5dda79da 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,9 +11,10 @@ in the leaderboard and submitting your results.
 **Got a GPU? Here's the shortest path to getting on the leaderboard:**
 
 ```bash
-# 1. Clone and install
-git clone https://github.com/JuhaoLiang1997/AccelMark.git
+# 1. Fork the repo on GitHub, then clone your fork
+git clone https://github.com/<you>/AccelMark.git
 cd AccelMark
+pip install -e .
 pip install -r runners/nvidia_vllm_47f5d58e/requirements.txt
 
 # 2. Set your name (one-time setup)
@@ -23,14 +24,19 @@ cp configs/submitter.yaml.example configs/submitter.yaml
 # 3. Run the benchmark (~11 min on A100 for default scenarios)
 #    Accuracy gate runs automatically before the benchmark starts.
 #    Output directory is auto-named using run_name, e.g.:
-    #    results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm_47f5d58e_ed4b0557
-    python run.py --runner nvidia_vllm_47f5d58e --suite suite_A
+#    results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm_47f5d58e_ed4b0557
+python run.py --runner nvidia_vllm_47f5d58e --suite suite_A
 
-# 4. Submit — open a GitHub Issue and paste your result.json
-# https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md
+# 4. Open a pull request with your result
+git checkout -b submit/<your-hardware>
+git add results/ && git commit -m "results: <hardware> on suite_A"
+git push origin submit/<your-hardware>
+gh pr create   # or open the PR via the GitHub web UI
 ```
 
-That's it. The CI bot handles the rest.
+That's it. CI validates the result automatically; merging the PR publishes it to the leaderboard.
+
+> _Prefer not to use git?_ Open a [Community Submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md), paste your `result.json`, and the CI bot will draft the PR on your behalf.
 
 ---
 
@@ -87,6 +93,22 @@ models:
 `configs/models_local.yaml` is gitignored. Once configured, you don't
 need `--model-path` on the command line.
 
+### Per-runner config overrides (optional)
+
+If you want to permanently change a runner's defaults (e.g. raise
+`max_num_seqs`, enable `enforce_eager`, set `tensor_parallel_size`) without
+adding flags to every invocation, drop a yaml at
+`configs/runner_configs/runner_<runner_id>.yaml`. The file is
+**gitignored** — only `*.yaml.example` companions are checked into the
+repo. That makes the override strictly local to your machine and keeps
+the canonical defaults intact for everyone else.
+
+```bash
+cp configs/runner_configs/runner_nvidia_vllm_47f5d58e.yaml.example \
+   configs/runner_configs/runner_nvidia_vllm_47f5d58e.yaml
+# edit freely — your benchmarks now pick up the overrides automatically
+```
+
 ---
 
 ## Running the benchmark
@@ -238,7 +260,7 @@ Add **speculative** (~3 min extra on Suite A, ~24 min extra on Suite D) or **bur
 
 ---
 
-## Submitting your results
+## Submitting a result
 
 ### Accuracy gate (automatic)
 
@@ -271,28 +293,42 @@ framework, same precision, same inference stack.
 **Resuming an interrupted run:** Re-running the same command resumes from
 where it stopped. Completed steps are skipped automatically.
 
-### Step 1: Open a GitHub Issue
+### Recommended: open a pull request
 
-Go to [Issues → New → Community Submission](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md).
+After a successful run, validate locally and open a PR:
 
-Paste the full contents of your `result.json` into the code block and submit.
+```bash
+# Validate the produced files against the schemas (the same check CI runs).
+python runners/validate_submission.py \
+    results/community/<run_name>/result.json
+
+# Stage just the new result and env file.
+git checkout -b submit/<your-hardware>
+git add results/community/<run_name>/
+git commit -m "results: <hardware> on suite_A"
+git push origin submit/<your-hardware>
+
+# Open the PR — either via the GitHub web UI or:
+gh pr create --fill
+```
 
-> **The CI bot validates your result automatically** — recommend to run
-> `validate_submission.py` locally first. If validation fails, the bot
-> comments on your issue explaining what to fix.
+What gets committed is *only* the new files under `results/community/<run_name>/`:
+your `result.json` and `env_info.json` (`samples.jsonl` is gitignored and stays
+local). Nothing else in the repo should change.
 
-> **Why paste instead of attach?** The CI bot reads `result.json` directly
-> from the issue body. File attachments are not accessible to GitHub Actions.
+CI then re-runs the schema validator and the runner-folder integrity check.
+When both pass and a contributor reviews the diff, the PR is merged and your
+result shows up on the leaderboard on the next site build.
 
-### Step 2: Done
+### Alternative: open a submission issue (no git required)
 
-The CI bot will:
-1. Validate your `result.json` against the schema
-2. Open a PR with your result files
-3. Comment on your issue with a link to the PR
+If you'd rather not use git, paste your `result.json` into a
+[Community Submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md).
+A bot will validate the JSON, draft a PR with the files in the right place,
+and link it back to your issue. You don't need to touch git or fork the repo.
 
-Your result appears on the **Community** tab after the maintainer reviews
-and merges the PR — usually within a day or two.
+> **Why paste instead of attach?** The bot reads `result.json` directly from
+> the issue body. File attachments are not accessible to GitHub Actions.
 
 ---
 
@@ -300,10 +336,13 @@ and merges the PR — usually within a day or two.
 
 | Tier | How to get it | Leaderboard placement |
 |------|--------------|----------------------|
-| **community** | Submit via GitHub Issue, passes CI validation | Community tab |
-| **verified** | Maintainer reproduced your result within 5% | Main leaderboard |
+| **community** | Submit a PR (or issue → bot-drafted PR) and pass CI validation | Community tab |
+| **verified** | Independently reproduced on the same hardware/runner within 5% | Main leaderboard |
 
-To request verification, comment on your submission issue.
+To promote a community result to **verified**, anyone with the same hardware
+and runner can run the same suite and open a follow-up PR that lands the
+reproduction in `results/verified/`. Maintainers do not gate this — every
+verified result is itself reproducible by definition.
 
 ---
 
@@ -329,11 +368,29 @@ If your local copy was downloaded at a different revision, add a note in
 
 ---
 
-## Adding support for a new platform
-
-Create a new runner folder under `runners/` by subclassing `BenchmarkRunner`.
-See [DEVELOPMENT.md](DEVELOPMENT.md) for the full implementation guide including
-how to compute your runner's hash ID.
+## Adding a new runner
+
+A "runner" here is a Python class that wraps an inference framework (vLLM,
+SGLang, mlx-lm, …) and exposes the AccelMark standard interface. Adding
+one for an **existing** platform (NVIDIA, AMD, Ascend, Apple, Google TPU,
+Moore Threads, …) does not require hand-editing any shared file. The full
+walk-through lives in [`runners/README.md`](runners/README.md); the short
+version is:
+
+1. Copy `runners/template/runner.py` into a temporary folder (e.g. `runners/tmp/`) and fill in
+   the three required methods (`load_model`, `inference_fn_offline`,
+   `release_resources`) plus `inference_fn_streaming` if your framework
+   has a streaming API.
+2. Compute the hash and rename the folder:
+   `python runners/hash_runner.py runners/tmp/`
+   produces e.g. `nvidia_myframework_3f8a2c1d`.
+3. Write `meta.json` next to it, including `suite_support` — that field
+   is **how the top-level `README.md` table picks up your runner**. You
+   never edit that table by hand; the generator script rewrites it.
+4. Add a `requirements.txt`.
+5. Validate: `python runners/validate_runners.py --dir runners/<your_folder>`.
+6. Regenerate the README matrix locally:
+   `python tools/generate_platforms_matrix.py`.
 
 ```python
 # runners/your_platform_{hash8}/runner.py
@@ -347,7 +404,7 @@ class MyFrameworkRunner(BenchmarkRunner):
     SUPPORTS_ONLINE     = True
     SUPPORTS_MULTI_CHIP = True    # set False if no tensor parallelism
 
-    def load_model(self, model_path: str, suite: dict, parallelism: dict) -> None:
+    def load_model(self, model_path: str, parallelism: dict) -> None:
         tp_size = parallelism["tensor_parallel_size"]
         self.model = MyFramework.load(model_path, tp=tp_size)
 
@@ -383,14 +440,37 @@ if __name__ == "__main__":
 All orchestration (result building, accuracy reuse, Suite E, etc.) is
 inherited from `BenchmarkRunner` automatically.
 
-**Checklist for a new platform PR:**
+**Checklist for a new-runner PR (existing platform):**
 - [ ] Runner folder named `{platform}_{name}_{hash8}` with correct hash
 - [ ] `runner.py` subclasses `BenchmarkRunner` and passes `runners/validate_runners.py`
-- [ ] `meta.json` present and valid (see `runners/meta.schema.json`)
+- [ ] `meta.json` present and valid (see `runners/meta.schema.json`), with
+      `suite_support` declared for every suite your runner can or cannot run
 - [ ] `requirements.txt` included
-- [ ] At least one reference result in `results/community/`
-- [ ] `runners/collect_env.py` updated to detect your hardware (see [DEVELOPMENT.md](DEVELOPMENT.md))
-- [ ] `README.md` supported platforms table updated
+- [ ] `tools/generate_platforms_matrix.py --check` passes locally (CI also
+      enforces this)
+- [ ] At least one reference result in `results/community/` (validated by CI)
+
+### Adding a new accelerator family
+
+If you are bringing up a **new platform** (e.g. a vendor not yet in
+`schema/platforms.json`), the only additional file you need to ship is
+
+```
+runners/platforms/<your_platform>.py
+```
+
+which exports module-level `collect()`, `detect_runtime_version()` and a
+few optional helpers. The collector at `runners/collect_env.py`
+auto-discovers it; no change to that file is required. See
+[`runners/README.md`](runners/README.md#adding-a-new-accelerator-family)
+for the full protocol and a worked example.
+
+Optional polish steps when the new platform stabilises:
+
+- Add an entry to `schema/platforms.json` so the README matrix renders a
+  pretty hardware label and stable sort order. Until then, the matrix
+  renders the bare identifier and `validate_runners.py` emits a
+  non-fatal warning prompting this follow-up.
 
 See [DEVELOPMENT.md](DEVELOPMENT.md) for the full implementation reference.
 
@@ -404,9 +484,10 @@ If a result looks wrong:
 2. Include: the submission name, what looks wrong, and ideally your own
    run on the same hardware as evidence
 
-Maintainers will investigate. If confirmed suspicious, the result's `meta.flagged`
-field will be set to a reason string and it will appear with a ⚠️ badge on the
-leaderboard.
+The community discusses the report in the issue. If the consensus is that
+the result is suspicious, a PR sets `meta.flagged` on that result to a
+reason string and the entry shows up with a ⚠️ badge on the leaderboard.
+Anyone can open that follow-up PR.
 
 ---
 
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 22d183c9..343a3132 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -714,7 +714,7 @@ Common patterns:
 # Resolve model path (checks models_local.yaml)
 path = br._resolve_model_path(model_id, args.model_path)
 
-# Parse scenarios config (handles legacy flat array and new dict format)
+# Parse scenarios config — expects {"default": [...], "extra": [...]}
 default, extra = br._parse_scenarios_config(suite)
 
 # Merge scenario results after running offline+online+interactive
@@ -897,7 +897,7 @@ Key constraints:
 | `meta.run_id` | string\|null | 8-char hex hash of hardware+software+suite+submitter. Deterministic — same config always produces same `run_id`. Used for duplicate detection. |
 | `meta.run_name` | string\|null | Full directory name: `{chip}x{count}_{suite}_{runner}_{run_id}`. Used as the output directory name. |
 | `meta.time` | string\|null | Benchmark start time HH:MM:SS. |
-| `meta.flagged` | string\|null | Null for normal results. Maintainer sets to a reason string if result is suspicious — triggers ⚠️ badge on leaderboard. |
+| `meta.flagged` | string\|null | Null for normal results. Set to a reason string if community review concludes the result is suspicious (via a follow-up PR) — triggers ⚠️ badge on leaderboard. |
 
 These fields are optional in the schema for backward compatibility with older results.
 New benchmark runs populate all four automatically.
diff --git a/README.md b/README.md
index 88fbf406..ea9e2b65 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,27 @@
-# ⚡ AccelMark
-
-**Open benchmark leaderboard for AI accelerators on LLM workloads.**
-
-[![Live Leaderboard](https://img.shields.io/badge/leaderboard-live-brightgreen)](https://juhaoliang1997.github.io/AccelMark)
-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
-[![Contributions welcome](https://img.shields.io/badge/contributions-welcome-orange.svg)](CONTRIBUTING.md)
-
-[**→ Live Leaderboard**](https://juhaoliang1997.github.io/AccelMark) · [Contributing](CONTRIBUTING.md) · [Suites](suites/README.md) · [Development](DEVELOPMENT.md)
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-wordmark-dark.svg">
+    <img src="docs/assets/logo-wordmark.svg" alt="AccelMark" width="360">
+  </picture>
+</p>
+
+<p align="center">
+  <strong>Open benchmark leaderboard for AI accelerators on LLM workloads.</strong>
+</p>
+
+<p align="center">
+  <a href="https://juhaoliang1997.github.io/AccelMark"><img src="https://img.shields.io/badge/leaderboard-live-brightgreen" alt="Live Leaderboard"></a>
+  <a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue.svg" alt="License: MIT"></a>
+  <a href="CONTRIBUTING.md"><img src="https://img.shields.io/badge/contributions-welcome-orange.svg" alt="Contributions welcome"></a>
+</p>
+
+<p align="center">
+  <a href="https://juhaoliang1997.github.io/AccelMark"><strong>→ Live Leaderboard</strong></a> ·
+  <a href="CONTRIBUTING.md">Contributing</a> ·
+  <a href="suites/README.md">Suites</a> ·
+  <a href="https://github.com/JuhaoLiang1997/AccelMark/discussions">Discussions</a> ·
+  <a href="DEVELOPMENT.md">Development</a>
+</p>
 
 ---
 
@@ -36,11 +51,14 @@ cp configs/submitter.yaml.example configs/submitter.yaml
 # 3. Run the benchmark (~11 min on A100)
 python run.py --runner nvidia_vllm_47f5d58e --suite suite_A
 
-# 4. Submit — open a GitHub Issue and paste your result.json
-# https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md
+# 4. Submit your result — open a pull request:
+#    git checkout -b submit/<your-hardware>
+#    git add results/community/<run_name>/ && git commit -m "results: <hardware>"
+#    git push origin submit/<your-hardware>
+#    gh pr create   # or open via the GitHub web UI
 ```
 
-See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide.
+See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide. If you'd rather skip the PR workflow, [open a submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md) instead and a bot will draft the PR for you.
 
 ---
 
@@ -64,19 +82,26 @@ See [suites/README.md](suites/README.md) for full specs, time budgets, SLA defin
 
 ## Supported platforms
 
-Reference runners live under `runners/` (see each folder’s `meta.json`). Checkmarks mark suites **implemented and runnable** with that runner in this repository.
+Reference runners live under `runners/` (see each folder's `meta.json`). The table below is **auto-generated** from each runner's `meta.json` — never hand-edited. Add a runner, declare its `suite_support` in `meta.json`, rerun the generator, and the matrix picks it up.
 
+<!-- platforms-matrix:start -->
 | Hardware | Runner folder | Framework | A | B | C | D | E | F | G |
-|----------|---------------|-----------|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
+|---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
+| NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
-| NVIDIA GPU | `nvidia_sglang_6da83845` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
-| AMD GPU | `amd_vllm_rocm_5355c2c6` | vLLM (ROCm) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
-| Huawei Ascend NPU | `ascend_vllm_ascend_605db33a` | vLLM-Ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
-| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu (JAX/XLA) | ✓ | — | — | ✓ | — | ✓ | — |
+| AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
+| Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
+| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
+
+_Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
+<!-- platforms-matrix:end -->
+
+> Regenerate locally with `python tools/generate_platforms_matrix.py`. CI runs `--check` and fails the PR if the README and runner metadata disagree.
 
 Other stacks (TensorRT-LLM, MindIE, mlx-lm, etc.) can be added as new runner folders; see the contributor guide.
 
-Adding a new platform? See [CONTRIBUTING.md#adding-support-for-a-new-platform](CONTRIBUTING.md#adding-support-for-a-new-platform).
+Adding a new runner? See [CONTRIBUTING.md#adding-a-new-runner](CONTRIBUTING.md#adding-a-new-runner). Adding a new accelerator family? See [`runners/README.md`](runners/README.md#adding-a-new-accelerator-family).
 
 ---
 
@@ -84,53 +109,10 @@ Adding a new platform? See [CONTRIBUTING.md#adding-support-for-a-new-platform](C
 
 | Tier | How | Where |
 |------|-----|-------|
-| **community** | Submit via GitHub Issue, passes CI validation | Community tab |
-| **verified** | Independently reproduced by maintainer within 5% | Main leaderboard |
-
-Community results are fully visible and comparable — they just haven't been independently reproduced yet.
+| **community** | Submitted by anyone via PR (or issue → bot-drafted PR) and passes CI validation | Community tab |
+| **verified** | Independently reproduced on the same hardware/runner and matches the original within 5% | Main leaderboard |
 
----
-
-## Repository structure
-
-```
-AccelMark/
-├── suites/              # Suite definitions — see suites/README.md
-├── runners/             # Platform benchmark runners
-│   ├── benchmark_runner.py   # Shared base class — all orchestration logic
-│   ├── collect_env.py        # Hardware/software detection → env_info.json
-│   ├── validate_submission.py
-│   ├── validate_runners.py
-│   ├── protocol.py           # RunnerProtocol interface (serve layer)
-│   ├── template/             # Annotated starter template for new runners
-│   └── nvidia_vllm_{hash8}/  # Example: NVIDIA vLLM runner
-│       ├── runner.py
-│       ├── meta.json
-│       └── requirements.txt
-├── loadgen/             # Shared request sending and timing logic
-│   ├── loadgen.py       # Core timing engine — do not modify per-platform
-│   └── types.py         # InferenceResult, SampleRecord
-├── serve/               # OpenAI-compatible inference server
-│   ├── server.py        # FastAPI app — wraps any runner as an HTTP API
-│   └── adapter.py       # OpenAI request/response models
-├── schema/              # JSON schemas, accuracy subset, cloud pricing
-│   ├── result.schema.json
-│   ├── accuracy_subset.jsonl    # immutable
-│   └── cloud_pricing.json
-├── results/             # Benchmark results
-│   ├── verified/        # Maintainer-reproduced results
-│   └── community/       # Community-submitted results
-├── leaderboard/         # Static leaderboard site (GitHub Pages)
-│   ├── generate.py      # Reads results/, writes leaderboard.js + api/
-│   └── site/
-│       ├── index.html
-│       └── leaderboard.js   # Auto-generated — do not edit manually
-├── run.py               # Unified entry point — benchmark and serve
-├── CONTRIBUTING.md
-├── DEVELOPMENT.md
-└── configs/             # Local config — gitignored
-    └── submitter.yaml.example
-```
+Community results are fully visible and comparable — they just haven't been independently reproduced yet. Anyone with the listed hardware can promote a community result to verified by submitting a reproduction PR.
 
 ---
 
@@ -138,11 +120,15 @@ AccelMark/
 
 The most valuable contribution is running the benchmark on hardware not yet in the leaderboard.
 
-- **Submit a result** → [Community Submission guide](CONTRIBUTING.md)
+- **Submit a result** → [Submitting a result](CONTRIBUTING.md#submitting-a-result)
+- **Add a new runner** → [Adding a new runner](CONTRIBUTING.md#adding-a-new-runner)
+- **Add a new accelerator family** → [Platform plug-in guide](runners/README.md#adding-a-new-accelerator-family)
 - **Report a bug** → [Open an issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=bug_report.md)
-- **Add platform support** → [Platform guide](CONTRIBUTING.md#adding-support-for-a-new-platform)
+- **Ask a question / share results** → [Discussions](https://github.com/JuhaoLiang1997/AccelMark/discussions)
 - **Extend the leaderboard** → [Development guide](DEVELOPMENT.md)
 
+> _Optional:_ AccelMark also ships a small voice-driven launcher for the [OpenClaw](https://clawhub.ai) ecosystem — see [`openclaw_skill/`](openclaw_skill/README.md). It's not required to run, contribute, or submit results.
+
 ---
 
 ## Citation
@@ -152,7 +138,7 @@ If you use AccelMark results in research, please cite:
 ```bibtex
 @misc{accelmark2026,
   title  = {AccelMark: Open Benchmark Leaderboard for AI Accelerators on LLM Workloads},
-  author = {Liang, Juhao},
+  author = {Liang, Juhao and {The AccelMark Contributors}},
   year   = {2026},
   url    = {https://github.com/JuhaoLiang1997/AccelMark}
 }
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 00000000..67be232b
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,70 @@
+# Security Policy
+
+## Scope
+
+AccelMark is a benchmarking framework. The "interesting" security surface
+is correspondingly small, but two areas matter:
+
+1. **Code that runs locally on contributor / maintainer machines.**
+   The repository ships Python that reads model files, parses third-party
+   tool output (`nvidia-smi`, `rocm-smi`, `npu-smi`, `mthreads-gmi`, etc.),
+   reads YAML configuration, and runs inference frameworks (vLLM, SGLang,
+   mlx-lm, …) under their own dependency stacks. A malicious config,
+   meta.json, or runner.py landing in `main` could compromise anyone who
+   pulls and runs the repo.
+
+2. **Submitted results.**
+   `results/community/**` is community-contributed JSON. A malicious
+   `result.json` cannot execute code on its own, but it can poison the
+   leaderboard if the validator can be bypassed. Bugs in
+   `runners/validate_submission.py` that allow obviously-fake results to
+   merge are treated as security issues.
+
+Issues outside those two surfaces (in particular: bugs that produce wrong
+benchmark *numbers* without a reproducibility problem) are normal bugs and
+should be reported via a regular GitHub issue.
+
+## Supported versions
+
+AccelMark is pre-1.0 and ships from `main`. The latest commit on `main` is
+the only "supported" version; fixes will be backported to release tags only
+once 1.0 has shipped.
+
+## Reporting a vulnerability
+
+**Please do not open a public GitHub issue for a security report.**
+
+Use GitHub's [private security advisory][advisory] form on this repository.
+A maintainer will respond within **7 days** acknowledging the report and
+providing an initial assessment. We aim to publish a fix and credit the
+reporter within **30 days** of acknowledgement; if a fix is going to take
+longer we will say so in the response.
+
+[advisory]: https://github.com/JuhaoLiang1997/AccelMark/security/advisories/new
+
+When reporting, please include:
+
+* The version (commit SHA on `main`, or release tag).
+* A minimal reproduction — config files, the exact command, and the
+  observed behaviour. For supply-chain reports, the offending dependency
+  and version.
+* Your assessment of the impact (e.g. "arbitrary file read at runner
+  startup", "validator accepts result with mismatched chip name", …).
+
+We do not currently run a paid bug bounty, but we are happy to credit
+reporters in the release notes for the fix.
+
+## What is *not* a vulnerability
+
+For clarity, the following are explicitly out of scope:
+
+* **Results you disagree with.** Use the *Challenge a Result* GitHub
+  issue template; this is a leaderboard-policy matter, not a security one.
+* **A runner that performs poorly on your hardware.** Open a regular issue
+  or PR.
+* **Resource exhaustion when running a benchmark you started yourself.**
+  Benchmarks intentionally saturate the device; OOM and similar are
+  expected operating conditions.
+* **Dependencies of a runner being slow / outdated.** The runner author
+  pins versions in `requirements.txt`; submit a PR for a new runner with
+  updated pins (immutability rule — see `runners/README.md`).
diff --git a/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example b/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example
deleted file mode 100644
index 352bf16a..00000000
--- a/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example
+++ /dev/null
@@ -1,31 +0,0 @@
-# AccelMark runner config — amd_vllm_rocm_d65c6686 (vLLM-ROCm on AMD)
-#
-# Copy this file to runner_amd_vllm_rocm_d65c6686.yaml (remove .example suffix)
-# and edit as needed for your hardware.
-#
-# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
-
-# ── Global defaults ────────────────────────────────────────────────────────────
-
-# Tensor parallel size — number of GPUs to use (default: 1)
-tensor_parallel_size: 1
-
-# Disable HIP graph compilation (ROCm equivalent of enforce_eager).
-enforce_eager: false
-
-# Maximum number of sequences in a batch (default: 512).
-max_num_seqs: 512
-
-# Fraction of GPU memory for the KV cache (default: 0.90).
-gpu_memory_utilization: 0.90
-
-# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
-# engine_kwargs:
-#   swap_space: 8
-
-# ── Suite-specific overrides ───────────────────────────────────────────────────
-
-suites:
-  suite_D:
-    max_num_seqs: 64
-    gpu_memory_utilization: 0.85
diff --git a/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example b/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example
deleted file mode 100644
index 6e1ac0ba..00000000
--- a/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example
+++ /dev/null
@@ -1,36 +0,0 @@
-# AccelMark runner config — ascend_vllm_ascend_605db33a (vllm-ascend on Ascend NPU)
-#
-# Copy this file to runner_ascend_vllm_ascend_605db33a.yaml (remove .example suffix)
-# and edit as needed for your hardware.
-#
-# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
-
-# ── Global defaults ────────────────────────────────────────────────────────────
-
-# Tensor parallel size — number of NPUs to use (default: 1)
-tensor_parallel_size: 1
-
-# Disable graph compilation (default: false).
-# Set to true if you encounter CANN graph compilation errors on your NPU model.
-enforce_eager: false
-
-# Maximum number of sequences in a batch (default: 512).
-# Reduce on lower-memory NPUs: 128 for 32 GB, 64 for 16 GB or less.
-max_num_seqs: 512
-
-# Fraction of NPU memory reserved for the KV cache (default: 0.90).
-# Reduce if you get NPU OOM errors: try 0.85 for tighter memory budgets.
-# vllm-ascend exposes this as gpu_memory_utilization (same parameter name as vLLM).
-gpu_memory_utilization: 0.90
-
-# Pass-through kwargs forwarded directly to vllm-ascend LLM() / AsyncEngineArgs().
-# engine_kwargs:
-#   block_size: 16
-
-# ── Suite-specific overrides ───────────────────────────────────────────────────
-
-suites:
-  suite_D:
-    # Long-context suite — reduce batch size and reserve more NPU memory.
-    max_num_seqs: 64
-    gpu_memory_utilization: 0.85
diff --git a/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example b/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example
deleted file mode 100644
index cd1fbcf9..00000000
--- a/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example
+++ /dev/null
@@ -1,29 +0,0 @@
-# AccelMark runner config — nvidia_sglang_df27d2dd (SGLang on NVIDIA)
-#
-# Copy this file to runner_nvidia_sglang_df27d2dd.yaml (remove .example suffix)
-# and edit as needed for your hardware.
-#
-# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
-
-# ── Global defaults ────────────────────────────────────────────────────────────
-
-# Tensor parallel size — number of GPUs to use (default: 1)
-tensor_parallel_size: 1
-
-# Disable CUDA graph. Equivalent to vLLM's enforce_eager.
-# Set to true if you encounter CUDA graph errors on older hardware.
-disable_cuda_graph: false
-
-# Fraction of GPU memory for the KV cache (SGLang: mem_fraction_static, default: 0.88)
-mem_fraction_static: 0.88
-
-# Pass-through kwargs forwarded directly to sglang.Engine() / sglang.AsyncEngine().
-# See SGLang docs for valid keys.
-# engine_kwargs:
-#   chunked_prefill_size: 512
-
-# ── Suite-specific overrides ───────────────────────────────────────────────────
-
-suites:
-  suite_D:
-    mem_fraction_static: 0.80
diff --git a/docs/assets/logo-wordmark-dark.svg b/docs/assets/logo-wordmark-dark.svg
new file mode 100644
index 00000000..ddc810da
--- /dev/null
+++ b/docs/assets/logo-wordmark-dark.svg
@@ -0,0 +1,23 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 280 72" role="img" aria-label="AccelMark">
+  <title>AccelMark</title>
+  <desc>AccelMark wordmark (dark theme): a lightning bolt over a speedometer arc, next to the project name.</desc>
+  <defs>
+    <linearGradient id="amBoltD" x1="0%" y1="0%" x2="0%" y2="100%">
+      <stop offset="0%" stop-color="#FCD34D"/>
+      <stop offset="100%" stop-color="#FBBF24"/>
+    </linearGradient>
+    <linearGradient id="amGaugeD" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" stop-color="#93C5FD"/>
+      <stop offset="100%" stop-color="#60A5FA"/>
+    </linearGradient>
+  </defs>
+
+  <g transform="translate(8, 0)">
+    <path d="M 10 50 A 26 26 0 1 1 62 50" fill="none" stroke="url(#amGaugeD)" stroke-width="5.5" stroke-linecap="round"/>
+    <circle cx="10" cy="50" r="3" fill="url(#amGaugeD)"/>
+    <circle cx="62" cy="50" r="3" fill="url(#amGaugeD)"/>
+    <path d="M 40 14 L 24 42 L 33 42 L 28 58 L 50 30 L 41 30 Z" fill="url(#amBoltD)" stroke="#78350F" stroke-width="0.7" stroke-linejoin="round"/>
+  </g>
+
+  <text x="84" y="50" font-family="-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif" font-size="40" font-weight="800" letter-spacing="-1.2" fill="#F9FAFB">AccelMark</text>
+</svg>
diff --git a/docs/assets/logo-wordmark.svg b/docs/assets/logo-wordmark.svg
new file mode 100644
index 00000000..985b1690
--- /dev/null
+++ b/docs/assets/logo-wordmark.svg
@@ -0,0 +1,23 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 280 72" role="img" aria-label="AccelMark">
+  <title>AccelMark</title>
+  <desc>AccelMark wordmark (light theme): a lightning bolt over a speedometer arc, next to the project name.</desc>
+  <defs>
+    <linearGradient id="amBoltW" x1="0%" y1="0%" x2="0%" y2="100%">
+      <stop offset="0%" stop-color="#FBBF24"/>
+      <stop offset="100%" stop-color="#F59E0B"/>
+    </linearGradient>
+    <linearGradient id="amGaugeW" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" stop-color="#60A5FA"/>
+      <stop offset="100%" stop-color="#2563EB"/>
+    </linearGradient>
+  </defs>
+
+  <g transform="translate(8, 0)">
+    <path d="M 10 50 A 26 26 0 1 1 62 50" fill="none" stroke="url(#amGaugeW)" stroke-width="5.5" stroke-linecap="round"/>
+    <circle cx="10" cy="50" r="3" fill="url(#amGaugeW)"/>
+    <circle cx="62" cy="50" r="3" fill="url(#amGaugeW)"/>
+    <path d="M 40 14 L 24 42 L 33 42 L 28 58 L 50 30 L 41 30 Z" fill="url(#amBoltW)" stroke="#92400E" stroke-width="0.7" stroke-linejoin="round"/>
+  </g>
+
+  <text x="84" y="50" font-family="-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif" font-size="40" font-weight="800" letter-spacing="-1.2" fill="#111827">AccelMark</text>
+</svg>
diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg
new file mode 100644
index 00000000..bb6d81eb
--- /dev/null
+++ b/docs/assets/logo.svg
@@ -0,0 +1,18 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 72 72" role="img" aria-label="AccelMark">
+  <title>AccelMark</title>
+  <desc>Lightning bolt over a speedometer arc - the AccelMark mark for accelerator benchmarking.</desc>
+  <defs>
+    <linearGradient id="amBolt" x1="0%" y1="0%" x2="0%" y2="100%">
+      <stop offset="0%" stop-color="#FBBF24"/>
+      <stop offset="100%" stop-color="#F59E0B"/>
+    </linearGradient>
+    <linearGradient id="amGauge" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" stop-color="#60A5FA"/>
+      <stop offset="100%" stop-color="#2563EB"/>
+    </linearGradient>
+  </defs>
+  <path d="M 10 50 A 26 26 0 1 1 62 50" fill="none" stroke="url(#amGauge)" stroke-width="5.5" stroke-linecap="round"/>
+  <circle cx="10" cy="50" r="3" fill="url(#amGauge)"/>
+  <circle cx="62" cy="50" r="3" fill="url(#amGauge)"/>
+  <path d="M 40 14 L 24 42 L 33 42 L 28 58 L 50 30 L 41 30 Z" fill="url(#amBolt)" stroke="#92400E" stroke-width="0.7" stroke-linejoin="round"/>
+</svg>
diff --git a/leaderboard/site/favicon.svg b/leaderboard/site/favicon.svg
new file mode 100644
index 00000000..bb6d81eb
--- /dev/null
+++ b/leaderboard/site/favicon.svg
@@ -0,0 +1,18 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 72 72" role="img" aria-label="AccelMark">
+  <title>AccelMark</title>
+  <desc>Lightning bolt over a speedometer arc - the AccelMark mark for accelerator benchmarking.</desc>
+  <defs>
+    <linearGradient id="amBolt" x1="0%" y1="0%" x2="0%" y2="100%">
+      <stop offset="0%" stop-color="#FBBF24"/>
+      <stop offset="100%" stop-color="#F59E0B"/>
+    </linearGradient>
+    <linearGradient id="amGauge" x1="0%" y1="0%" x2="100%" y2="0%">
+      <stop offset="0%" stop-color="#60A5FA"/>
+      <stop offset="100%" stop-color="#2563EB"/>
+    </linearGradient>
+  </defs>
+  <path d="M 10 50 A 26 26 0 1 1 62 50" fill="none" stroke="url(#amGauge)" stroke-width="5.5" stroke-linecap="round"/>
+  <circle cx="10" cy="50" r="3" fill="url(#amGauge)"/>
+  <circle cx="62" cy="50" r="3" fill="url(#amGauge)"/>
+  <path d="M 40 14 L 24 42 L 33 42 L 28 58 L 50 30 L 41 30 Z" fill="url(#amBolt)" stroke="#92400E" stroke-width="0.7" stroke-linejoin="round"/>
+</svg>
diff --git a/leaderboard/site/index.html b/leaderboard/site/index.html
index caea8541..a890e5e2 100644
--- a/leaderboard/site/index.html
+++ b/leaderboard/site/index.html
@@ -3,6 +3,7 @@
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <link rel="icon" type="image/svg+xml" href="favicon.svg">
   <title>AccelMark — AI Accelerator Leaderboard</title>
   <style>
     * { box-sizing: border-box; margin: 0; padding: 0; }
@@ -23,6 +24,13 @@
       font-size: 1.5rem;
       font-weight: 700;
       color: #f0f6fc;
+      display: flex;
+      align-items: center;
+      gap: 0.6rem;
+    }
+
+    header h1 .brand-mark {
+      flex: 0 0 auto;
     }
 
     header p {
@@ -489,7 +497,25 @@
 <body>
 
 <header>
-  <h1>⚡ AccelMark Leaderboard</h1>
+  <h1>
+    <svg class="brand-mark" viewBox="0 0 72 72" width="34" height="34" role="img" aria-hidden="true">
+      <defs>
+        <linearGradient id="lbBolt" x1="0%" y1="0%" x2="0%" y2="100%">
+          <stop offset="0%" stop-color="#FCD34D"/>
+          <stop offset="100%" stop-color="#FBBF24"/>
+        </linearGradient>
+        <linearGradient id="lbGauge" x1="0%" y1="0%" x2="100%" y2="0%">
+          <stop offset="0%" stop-color="#93C5FD"/>
+          <stop offset="100%" stop-color="#60A5FA"/>
+        </linearGradient>
+      </defs>
+      <path d="M 10 50 A 26 26 0 1 1 62 50" fill="none" stroke="url(#lbGauge)" stroke-width="5.5" stroke-linecap="round"/>
+      <circle cx="10" cy="50" r="3" fill="url(#lbGauge)"/>
+      <circle cx="62" cy="50" r="3" fill="url(#lbGauge)"/>
+      <path d="M 40 14 L 24 42 L 33 42 L 28 58 L 50 30 L 41 30 Z" fill="url(#lbBolt)" stroke="#78350F" stroke-width="0.7" stroke-linejoin="round"/>
+    </svg>
+    <span>AccelMark Leaderboard</span>
+  </h1>
   <p>Open benchmark for AI accelerators on LLM workloads · <a href="https://github.com/JuhaoLiang1997/AccelMark" style="color:#58a6ff">GitHub</a></p>
 </header>
 
@@ -829,7 +855,8 @@ <h1>⚡ AccelMark Leaderboard</h1>
           `}
         </div>
         <div class="card-model">
-          ${row.model || ''} · ${row.framework || ''} ·
+          ${row.model || ''} ·
+          <span title="${row.implementation_id ? 'runner: ' + row.implementation_id : ''}">${row.framework || ''}${row.framework_version ? ' ' + row.framework_version : ''}</span> ·
           ${row.precision_fallback
             ? `<span style="color:#d29922" title="BF16 not supported on this hardware — using FP16">${row.precision || ''} ⚠</span>`
             : (row.precision || '')}
@@ -876,6 +903,7 @@ <h1>⚡ AccelMark Leaderboard</h1>
     const fwVersion = row.framework_version || '';
     const script = row.reproduce_script || '';
     const notes = row.notes || '';
+    const runnerId = row.implementation_id || '';
 
     if (!fw || fw === 'unknown') {
       return `<span class="framework-unknown">—</span>`;
@@ -890,7 +918,10 @@ <h1>⚡ AccelMark Leaderboard</h1>
       ? `<span class="badge" style="background:#3d2b00;color:#d29922;margin-left:4px;font-size:0.7rem">custom</span>`
       : '';
 
+    const versionInline = fwVersion ? ` <span style="color:#8b949e;font-weight:400;font-size:0.85em">${fwVersion}</span>` : '';
+
     const tooltipLines = [];
+    if (runnerId)  tooltipLines.push(`Runner: ${runnerId}`);
     if (fwVersion) tooltipLines.push(`Version: ${fwVersion}`);
     if (script)    tooltipLines.push(`Script: ${script}`);
     if (notes)     tooltipLines.push(`Notes: ${notes}`);
@@ -900,7 +931,7 @@ <h1>⚡ AccelMark Leaderboard</h1>
 
     return `
       <div class="framework-wrapper">
-        <span class="framework-cell ${colorClass}">${fw}${badge}</span>
+        <span class="framework-cell ${colorClass}">${fw}${versionInline}${badge}</span>
         ${tooltip}
       </div>
     `;
diff --git a/pyproject.toml b/pyproject.toml
index 658b37e4..ea8d476d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,45 @@
 [project]
 name = "accelmark"
 version = "0.1.0"
-description = "LLM inference benchmarking framework for multiple hardware platforms"
+description = "Open benchmark leaderboard for AI accelerators on LLM workloads"
 readme = "README.md"
 license = "MIT"
+license-files = ["LICENSE"]
 requires-python = ">=3.10"
+authors = [
+    { name = "Juhao Liang", email = "juhaoliang1997@gmail.com" },
+]
+maintainers = [
+    { name = "Juhao Liang", email = "juhaoliang1997@gmail.com" },
+]
+keywords = [
+    "benchmark",
+    "llm",
+    "inference",
+    "accelerator",
+    "gpu",
+    "tpu",
+    "npu",
+    "vllm",
+    "sglang",
+    "mlx",
+    "leaderboard",
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: System :: Benchmark",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: MacOS :: MacOS X",
+    "Environment :: GPU",
+]
 
 dependencies = [
     "jsonschema>=4.0.0",
@@ -12,9 +47,17 @@ dependencies = [
     "pyyaml>=6.0",
 ]
 
+[project.urls]
+Homepage      = "https://github.com/JuhaoLiang1997/AccelMark"
+Leaderboard   = "https://juhaoliang1997.github.io/AccelMark"
+Documentation = "https://github.com/JuhaoLiang1997/AccelMark/blob/main/README.md"
+Repository    = "https://github.com/JuhaoLiang1997/AccelMark"
+Issues        = "https://github.com/JuhaoLiang1997/AccelMark/issues"
+Changelog     = "https://github.com/JuhaoLiang1997/AccelMark/releases"
+
 [build-system]
 requires = ["setuptools>=68"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-include = ["loadgen*"]
\ No newline at end of file
+include = ["loadgen*"]
diff --git a/results/README.md b/results/README.md
index 96fd9b08..9d88e1d6 100644
--- a/results/README.md
+++ b/results/README.md
@@ -6,10 +6,10 @@ Benchmark results organized by trust tier.
 
 | Tier | Description |
 |------|-------------|
-| `verified/` | Independently reproduced by a maintainer within 5%. Shown on the main leaderboard. |
+| `verified/` | Independently reproduced on the same hardware / runner and matches the original within 5%. Shown on the main leaderboard. |
 | `community/` | Passed schema validation, not yet independently reproduced. Shown on the community tab. |
 
-Submissions start in `community/` and may be promoted to `verified/` by a maintainer.
+Submissions start in `community/`. Anyone with the same hardware can promote a result to `verified/` by submitting a reproduction PR.
 
 ---
 
diff --git a/runners/README.md b/runners/README.md
index fd531eec..95290aae 100644
--- a/runners/README.md
+++ b/runners/README.md
@@ -24,12 +24,21 @@ You write ~50 lines. AccelMark handles the rest.
 runners/
 ├── benchmark_runner.py     ← Base class — inherit from this
 ├── protocol.py             ← RunnerProtocol — the serve layer interface
-├── collect_env.py          ← Hardware/software detection
+├── collect_env.py          ← Top-level environment collector (orchestrator only)
 ├── validate_submission.py  ← Result validator
 ├── validate_runners.py     ← Runner folder validator
 ├── hash_runner.py          ← Compute runner ID before submitting
 ├── meta.schema.json        ← JSON schema for meta.json
 │
+├── platforms/              ← One file per accelerator family (plug-ins)
+│   ├── __init__.py         ← Auto-discovery loader
+│   ├── nvidia.py
+│   ├── amd.py
+│   ├── ascend.py
+│   ├── apple.py
+│   ├── google.py
+│   └── moorethreads.py
+│
 ├── template/               ← starter template for new runners
 │   └── runner.py
 │
@@ -178,12 +187,37 @@ is **immutable** — updates require a new folder with a new hash.
   "submitted_by": "your_github_username",
   "description":  "One sentence describing what makes this runner distinct.",
   "notes":        null,
-  "created":      "2026-03-22"
+  "created":      "2026-03-22",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "validated",
+    "C": "pending",
+    "D": "validated",
+    "E": "pending",
+    "F": "validated",
+    "G": "unsupported"
+  }
 }
 ```
 
 `id` must exactly match the folder name.
 
+`suite_support` is **how your runner declares what works**. The top-level
+`README.md` platforms matrix is generated from this field — you never
+edit the README yourself. Allowed values:
+
+| Value | Meaning | Rendered |
+|---|---|---|
+| `"validated"` | Smoke-tested end-to-end and produces a `result.json` | ✓ |
+| `"pending"` | You believe it works but have not yet run it end-to-end | ⋯ |
+| `"unsupported"` | Hardware or framework cannot run this suite | — |
+
+`hardware_label` is optional. When `null`, the README falls back to the
+`display_name` from `schema/platforms.json` for your platform. Set it
+explicitly if your runner targets a narrower hardware scope than the
+platform (e.g. `"NVIDIA V100 (SM70)"` for a V100-only fork).
+
 ### Step 4 — Validate
 
 ```bash
@@ -207,7 +241,7 @@ validator automatically and post a comment on the PR with the result.
 
 | Segment | Examples | Rules |
 |---------|----------|-------|
-| `platform` | `nvidia`, `amd`, `ascend`, `apple`, `other` | Lowercase, one of the allowed values |
+| `platform` | `nvidia`, `amd`, `ascend`, `apple`, `google`, `moorethreads`, `other` | Lowercase alphanumeric, no underscores. Known identifiers live in `schema/platforms.json`; new ones validate fine — `validate_runners.py` just emits a warning suggesting you add them to the catalogue in a follow-up PR. |
 | `customname` | `vllm`, `trtllm_fp8`, `lmdeploy_tp4` | Lowercase alphanumeric and underscores, your choice |
 | `hash8` | `3f8a2c1d` | First 8 hex chars of SHA-256 of `runner.py` — computed automatically |
 
@@ -218,6 +252,7 @@ nvidia_trtllm_fp8_8d2f1a4b
 amd_vllm_rocm_7b2e1d8f
 ascend_mindie_9c4a3f11
 apple_mlx_b3e21f09
+moorethreads_vllm_musa_57ff5443
 ```
 
 ---
@@ -298,6 +333,53 @@ used. The leaderboard shows an amber ⚠ when BF16 was requested but FP16 was us
 
 ---
 
+## Adding a new accelerator family
+
+Adding a runner for an existing platform (NVIDIA, AMD, Ascend, Apple,
+Google TPU, Moore Threads) follows the 5 steps above — **no shared file
+needs to change**. The README matrix regenerates from `meta.json` and
+the schema accepts any well-formed identifier.
+
+Adding a runner for a *new* accelerator family is slightly more
+involved, but still self-contained. Drop a single file:
+
+```
+runners/platforms/<my_platform>.py
+```
+
+exporting any subset of the following module-level symbols:
+
+```python
+ID            = "my_platform"        # short identifier, lowercase
+DISPLAY_NAME  = "My Accelerator"
+VENDOR_LABEL  = "My Vendor"          # written to accelerator["vendor"]
+PRIORITY      = 50                   # lower runs first; default 50
+
+def collect() -> list[dict]: ...     # accelerator records; [] if absent
+
+# All functions below are optional
+def detect_runtime_version() -> str | None: ...
+def detect_pcie_gen() -> str | None: ...
+def detect_topology() -> str | None: ...
+def detect_intra_node_interconnect() -> str | None: ...
+def diagnostics(env, accelerators) -> list[str]: ...
+```
+
+`runners/collect_env.py` auto-discovers the file — no change required.
+
+Two optional polish steps when you're ready to merge:
+
+1. Add an entry to `schema/platforms.json` so the README matrix renders a
+   nice hardware label and stable sort order. Until then, the
+   matrix falls back to the capitalised identifier.
+2. Update `tools/generate_platforms_matrix.py`'s catalogue check if you
+   want the validator to recognise the new identifier as "official".
+
+`runners/meta.schema.json` already accepts any well-formed lowercase
+identifier as a `platform`, so you do **not** need to edit it.
+
+---
+
 ## Updating a runner
 
 Runner folders are **immutable once merged**. Any edit to `runner.py` produces
diff --git a/runners/amd_vllm_rocm_6c18cd8f/meta.json b/runners/amd_vllm_rocm_6c18cd8f/meta.json
index e712aa35..d0f8e05a 100644
--- a/runners/amd_vllm_rocm_6c18cd8f/meta.json
+++ b/runners/amd_vllm_rocm_6c18cd8f/meta.json
@@ -7,5 +7,15 @@
   "description": "AccelMark runner for AMD GPUs (MI250X, MI300X) using vLLM on the ROCm backend. Enables direct apples-to-apples comparison between AMD and NVIDIA on the same benchmark suites.",
   "supersedes_chain": [],
   "notes": "Decouple runners from suite and scenario knowledge — load_model() uses use_async from parallelism dict instead of checking scenario name.",
-  "created": "2026-04-03"
+  "created": "2026-04-03",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "validated",
+    "C": "validated",
+    "D": "validated",
+    "E": "validated",
+    "F": "validated",
+    "G": "validated"
+  }
 }
diff --git a/runners/apple_mlx_lm_9546b8b5/meta.json b/runners/apple_mlx_lm_9546b8b5/meta.json
index a0a02484..7adc15b2 100644
--- a/runners/apple_mlx_lm_9546b8b5/meta.json
+++ b/runners/apple_mlx_lm_9546b8b5/meta.json
@@ -7,5 +7,15 @@
   "description": "Apple Silicon inference via mlx_lm (Metal). Offline uses stream_generate; streaming scenarios use asyncio.to_thread. Online scenario disabled (single-device sync API).",
   "supersedes_chain": [],
   "notes": "Tensor parallelism not supported; chip_count=1. Match mlx_example.py load/generate patterns.",
-  "created": "2026-04-20"
+  "created": "2026-04-20",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "pending",
+    "B": "unsupported",
+    "C": "unsupported",
+    "D": "pending",
+    "E": "unsupported",
+    "F": "pending",
+    "G": "unsupported"
+  }
 }
diff --git a/runners/ascend_vllm_ascend_d4aa9fda/meta.json b/runners/ascend_vllm_ascend_d4aa9fda/meta.json
index 817251c4..5ebc107a 100644
--- a/runners/ascend_vllm_ascend_d4aa9fda/meta.json
+++ b/runners/ascend_vllm_ascend_d4aa9fda/meta.json
@@ -7,5 +7,15 @@
   "description": "AccelMark runner for Huawei Ascend 910B/910C NPUs using vllm-ascend (vLLM community fork for CANN). Supports Suites A–F. NPU memory queried via torch_npu. FP8 excluded: not supported on current Ascend 910B/910C hardware. W4A16 uses gptq backend (Marlin kernels not available on CANN).",
   "supersedes_chain": [],
   "notes": "Fixes vs initial version: _last_accuracy_outputs added to inference_fn_offline; gpu_memory_utilization read from runner config and passed to engine; engine_kwargs filtered against EngineArgs fields to prevent TypeError on startup.",
-  "created": "2026-04-03"
+  "created": "2026-04-03",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "validated",
+    "C": "validated",
+    "D": "validated",
+    "E": "validated",
+    "F": "unsupported",
+    "G": "unsupported"
+  }
 }
diff --git a/runners/benchmark_runner.py b/runners/benchmark_runner.py
index 1bece421..30afb70c 100644
--- a/runners/benchmark_runner.py
+++ b/runners/benchmark_runner.py
@@ -2269,16 +2269,9 @@ def _parse_scenarios_config(self, suite: dict) -> tuple[list[str], list[str]]:
         """
         Parse the suite's scenarios config into (default_scenarios, extra_scenarios).
 
-        Handles both legacy flat-array format and new dict format:
-          Legacy: "scenarios": ["accuracy", "offline", "online", "interactive"]
-          New:    "scenarios": {"default": [...], "extra": [...]}
-
-        Returns (default_scenarios, extra_scenarios).
+        Expects the dict form: "scenarios": {"default": [...], "extra": [...]}.
         """
-        config = suite.get("scenarios", {})
-        if isinstance(config, list):
-            # Legacy format — entire list is treated as default, no extras
-            return config, []
+        config  = suite.get("scenarios", {})
         default = config.get("default", [])
         extra   = config.get("extra", [])
         return default, extra
@@ -2618,43 +2611,37 @@ def _resolve_model_path(self, model_id: str, cli_override: Optional[str]) -> str
 
     def _resolve_requests_path(self, suite: dict) -> Path:
         """
-        Resolve the requests.jsonl path for a suite.
-
-        Resolution order:
-          1. suite["dataset"] key → datasets/{dataset}/requests.jsonl
-          2. Legacy: suites/{suite_id}/requests.jsonl (backward compatible)
+        Resolve the requests.jsonl path for a suite via its `dataset` key.
 
         Datasets are shared immutable collections in the datasets/ folder.
-        Suites reference them by name: "dataset": "sharegpt_standard_v1".
-        If not found at either location, raises FileNotFoundError with a
-        helpful message.
+        Suites reference them by name: "dataset": "sharegpt_standard_v1",
+        which resolves to datasets/sharegpt_standard_v1/requests.jsonl.
+
+        Raises FileNotFoundError with a helpful message if `dataset` is
+        absent or the referenced dataset folder does not exist.
         """
         suite_id = suite.get("suite_id", "")
         dataset  = suite.get("dataset")
 
-        if dataset:
-            dataset_path = _REPO_ROOT / "datasets" / dataset / "requests.jsonl"
-            if dataset_path.exists():
-                return dataset_path
+        if not dataset:
             raise FileNotFoundError(
-                f"Dataset '{dataset}' not found at {dataset_path}.\n"
-                f"Check 'dataset' field in suites/{suite_id}/suite.json.\n"
-                f"Available datasets: "
-                + ", ".join(
-                    p.name for p in (_REPO_ROOT / "datasets").iterdir()
-                    if p.is_dir() and (p / "requests.jsonl").exists()
-                )
+                f"Suite '{suite_id}' is missing the 'dataset' key in suite.json. "
+                f"Add e.g. \"dataset\": \"sharegpt_standard_v1\" so the runner "
+                f"can find datasets/<name>/requests.jsonl."
             )
 
-        # Legacy path — suite has its own requests.jsonl
-        legacy_path = _REPO_ROOT / "suites" / suite_id / "requests.jsonl"
-        if legacy_path.exists():
-            return legacy_path
+        dataset_path = _REPO_ROOT / "datasets" / dataset / "requests.jsonl"
+        if dataset_path.exists():
+            return dataset_path
 
         raise FileNotFoundError(
-            f"No requests.jsonl found for suite '{suite_id}'.\n"
-            f"Either add 'dataset' key to suite.json or create "
-            f"suites/{suite_id}/requests.jsonl."
+            f"Dataset '{dataset}' not found at {dataset_path}.\n"
+            f"Check 'dataset' field in suites/{suite_id}/suite.json.\n"
+            f"Available datasets: "
+            + ", ".join(
+                p.name for p in (_REPO_ROOT / "datasets").iterdir()
+                if p.is_dir() and (p / "requests.jsonl").exists()
+            )
         )
 
     def _generate_output_dir(self, args, env_info: dict) -> str:
diff --git a/runners/collect_env.py b/runners/collect_env.py
index 2da166c9..3b5f1491 100644
--- a/runners/collect_env.py
+++ b/runners/collect_env.py
@@ -1,7 +1,29 @@
 """
 AccelMark Environment Collector
-Automatically collects hardware and software environment information.
-Called automatically by the benchmark script — no need to run manually.
+===============================
+
+Detects the host's accelerators, CPU, memory, runtime stack, network
+fabric, and OS, and writes a normalised JSON file used to qualify and
+contextualise benchmark results.
+
+This script is intentionally *vendor-agnostic*: every accelerator family
+is implemented as an independent plug-in under ``runners/platforms/``.
+The top-level collector here only handles the parts that are common to
+all platforms (CPU model, system memory, OS, network interfaces) plus
+some lightweight orchestration:
+
+* discover all plug-ins
+* call ``collect()`` in priority order; the first plug-in that returns
+  accelerator records becomes the *active* platform
+* ask the active platform for its runtime version, PCIe generation,
+  topology and intra-node interconnect; when it declines to answer,
+  the first non-empty answer from the remaining plug-ins is used
+* aggregate per-plug-in diagnostics into warning messages
+
+To support a new accelerator family, drop a new file at
+``runners/platforms/<my_platform>.py`` exporting the optional functions
+documented in ``runners/platforms/__init__.py``. **No change to this
+file is required.**
 
 Usage:
     python runners/collect_env.py --output ./results/community/<dir>/env_info.json
@@ -17,6 +39,16 @@
 import sys
 from datetime import datetime, timezone
 from pathlib import Path
+from types import ModuleType
+from typing import Iterable
+
+# Make ``runners.platforms`` importable when this script is run directly.
+_RUNNERS_DIR = Path(__file__).resolve().parent
+_REPO_ROOT = _RUNNERS_DIR.parent
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from runners.platforms import discover_plugins  # noqa: E402
 
 
 def _print_warning(message: str) -> None:
@@ -31,683 +63,56 @@ def _have_psutil() -> bool:
     return True
 
 
-def _warn_for_incomplete_env_report(env: dict, accelerators: list[dict]) -> None:
-    """Emit warnings when the report used fallbacks or missing optional tools."""
-    if not _have_psutil():
-        _print_warning(
-            "Package 'psutil' is not installed — CPU physical/logical core counts may "
-            "default to 1; install with: pip install psutil"
-        )
-
-    mem = float(env.get("system_memory_gb") or 0)
-    if mem == 0.0:
-        _print_warning(
-            "system_memory_gb is 0 — RAM could not be determined. "
-            "Install psutil (pip install psutil) or ensure /proc/meminfo (Linux) / "
-            "sysctl hw.memsize (macOS) is available."
-        )
-
-    if not accelerators:
-        if os.environ.get("TPU_NAME") or os.environ.get("CLOUD_TPU_TASK"):
-            _print_warning(
-                "TPU-related environment variables are set but no TPU devices were detected — "
-                "install jax / tpu_inference when running on Cloud TPU."
-            )
-        return
-
-    vendor = (accelerators[0].get("vendor") or "").strip()
-    pytorch_v = env.get("pytorch_version") or ""
-    runtime = env.get("runtime_version") or ""
-    pcie = env.get("pcie_generation") or ""
-
-    if pytorch_v == "unknown" and vendor in ("NVIDIA", "AMD", "Huawei"):
-        _print_warning(
-            "PyTorch is not installed — pytorch_version is unknown. "
-            "For GPU stack metadata: pip install torch (match your CUDA/ROCm/CANN environment)."
-        )
-
-    if runtime == "unknown":
-        if vendor == "NVIDIA":
-            _print_warning(
-                "Could not detect CUDA/runtime (tried PyTorch CUDA, nvcc, CUDA_HOME, nvidia-smi paths). "
-                "runtime_version is unknown — install a CUDA toolkit or PyTorch with CUDA."
-            )
-        elif vendor == "AMD":
-            _print_warning(
-                "Could not detect ROCm runtime (rocm-smi / PyTorch ROCm). "
-                "runtime_version is unknown."
-            )
-        elif vendor == "Huawei":
-            _print_warning(
-                "Could not detect CANN/runtime from npu-smi / install paths. "
-                "runtime_version is unknown."
-            )
-        elif vendor == "Google":
-            _print_warning(
-                "Could not detect JAX/runtime for TPU. runtime_version is unknown — "
-                "install jax if you use Cloud TPU."
-            )
-
-    if pcie == "unknown" and vendor == "NVIDIA":
-        _print_warning(
-            "Could not read PCIe generation from nvidia-smi — pcie_generation is unknown."
-        )
-
-    if env.get("accelerator_topology") is None and vendor in ("NVIDIA", "AMD"):
-        _print_warning(
-            "accelerator_topology is null — nvidia-smi topo / rocm-smi --showtopo did not return data."
-        )
-
-    if vendor == "Apple":
-        if runtime.startswith("macOS ") and "MLX" not in runtime and "Metal MPS" not in runtime:
-            _print_warning(
-                "Neither MLX nor PyTorch with MPS is available — runtime_version only reflects "
-                "macOS build. For ML stack: pip install mlx  or  pip install torch (with MPS)."
-            )
-
-    if vendor == "Huawei":
-        for a in accelerators:
-            if a.get("memory_gb") is None:
-                try:
-                    import torch_npu  # noqa: F401
-                except ImportError:
-                    _print_warning(
-                        "Ascend HBM memory could not be parsed from npu-smi — optional "
-                        "pip install torch_npu may fill memory_gb via the runtime API."
-                    )
-                else:
-                    _print_warning(
-                        "Ascend HBM memory_gb is still unknown (torch_npu is importable) — "
-                        "check ASCEND_VISIBLE_DEVICES, driver, and npu-smi output."
-                    )
-                break
-
-
-def collect_nvidia() -> list[dict]:
-    try:
-        out = subprocess.check_output(
-            ["nvidia-smi", "--query-gpu=index,name,memory.total,driver_version,compute_cap",
-             "--format=csv,noheader,nounits"],
-            text=True
-        )
-        accelerators = []
-        for line in out.strip().splitlines():
-            idx, name, mem, driver, compute_cap = [x.strip() for x in line.split(",")]
-            try:
-                cc_float = float(compute_cap) if compute_cap else 0.0
-                supports_bf16 = cc_float >= 8.0
-            except (ValueError, TypeError):
-                supports_bf16 = True  # unknown — assume capable, runner will handle
-            accelerators.append({
-                "index": int(idx),
-                "name": name,
-                "vendor": "NVIDIA",
-                "memory_gb": round(float(mem) / 1024, 1),
-                "driver_version": driver,
-                "firmware_version": None,
-                "compute_capability": compute_cap,
-                "supports_bf16": supports_bf16,
-            })
-        return accelerators
-    except Exception:
-        return []
-
-
-# Known AMD architectures with BF16 support
-_AMD_BF16_SUPPORTED = {
-    "cdna2", "cdna3",                          # MI200, MI300 series
-    "rdna3", "rdna4",                          # RX 7000+ series
-    "gfx90a",                                  # MI250X arch code
-    "gfx940", "gfx941", "gfx942",             # MI300 arch codes
-    "gfx1100", "gfx1101", "gfx1102",          # RDNA3 arch codes
-}
-
-# Known AMD architectures WITHOUT BF16
-_AMD_NO_BF16 = {
-    "cdna1",                                   # MI100
-    "rdna", "rdna1", "rdna2",                 # RX 5000, RX 6000 series
-    "gfx908",                                  # MI100 arch code
-    "gfx1030", "gfx1031",                     # RDNA2 arch codes
-}
-
-
-def _amd_supports_bf16(arch_str: str) -> bool:
-    """Determine BF16 support from AMD architecture string."""
-    if not arch_str:
-        return True   # unknown — assume capable
-    arch_lower = arch_str.lower()
-    for known in _AMD_BF16_SUPPORTED:
-        if known in arch_lower:
-            return True
-    for known in _AMD_NO_BF16:
-        if known in arch_lower:
-            return False
-    return True   # unrecognized — assume capable
-
-
-def collect_amd() -> list[dict]:
-    try:
-        out = subprocess.check_output(
-            ["rocm-smi", "--showproductname", "--showmeminfo", "vram",
-             "--showdriverversion", "--json"],
-            text=True, stderr=subprocess.DEVNULL
-        )
-        data = json.loads(out)
-
-        # Try to get architecture string for BF16 detection
-        arch_str = ""
-        try:
-            arch_out = subprocess.check_output(
-                ["rocm-smi", "--showallinfo"],
-                text=True, stderr=subprocess.DEVNULL
-            )
-            import re as _re
-            gfx_matches = _re.findall(r'gfx\d+[a-z]?', arch_out.lower())
-            arch_str = gfx_matches[0] if gfx_matches else ""
-        except Exception:
-            pass
-
-        accelerators = []
-        for idx, (card_id, info) in enumerate(data.items()):
-            # Skip non-card keys (e.g. "system" metadata in some versions)
-            if not isinstance(info, dict):
-                continue
-            # Field names vary across rocm-smi versions — try all known variants
-            name = (
-                info.get("Card Series") or
-                info.get("Card series") or
-                info.get("Product Name") or
-                info.get("product_name") or
-                "AMD GPU"
-            )
-            mem_bytes = int(
-                info.get("VRAM Total Memory (B)") or
-                info.get("vram_total_memory_b") or
-                info.get("VRAM Total Memory") or
-                0
-            )
-            driver = (
-                info.get("Driver version") or
-                info.get("driver_version") or
-                info.get("Driver Version") or
-                "unknown"
-            )
-            accelerators.append({
-                "index": idx,
-                "name": name,
-                "vendor": "AMD",
-                "memory_gb": round(mem_bytes / (1024**3), 1),
-                "driver_version": driver,
-                "firmware_version": None,
-                "supports_bf16": _amd_supports_bf16(arch_str),
-            })
-        return accelerators
-    except Exception:
-        return []
-
-
-_ASCEND_BF16_SUPPORTED = {
-    "910b", "atlas 800t a2", "910b1", "910b2", "910b3", "910b4",
-}
-_ASCEND_NO_BF16 = {
-    "310", "310p", "atlas 300",
-}
-
-
-def _ascend_supports_bf16(chip_name: str) -> bool:
-    if not chip_name:
-        return True
-    name_lower = chip_name.lower()
-    for known in _ASCEND_BF16_SUPPORTED:
-        if known in name_lower:
-            return True
-    for known in _ASCEND_NO_BF16:
-        if known in name_lower:
-            return False
-    return True   # unknown Ascend chip — assume capable
-
-
-def _ascend_enrich_via_torch_npu(accelerators: list[dict]) -> None:
-    """Backfill memory_gb and name via torch_npu runtime API.
-
-    torch_npu.npu.get_device_properties(i) mirrors torch.cuda:
-      - .total_memory  — total HBM bytes
-      - .name          — chip name string (e.g. "910B2")
-
-    Logical indices 0..N-1 map positionally to npu-smi enumeration order
-    when all devices are visible (respects ASCEND_VISIBLE_DEVICES masking).
-    Only fills fields still None so parsed values are never overwritten.
-    """
-    try:
-        import torch_npu
-        logical_count = torch_npu.npu.device_count()
-    except Exception:
-        return
-    for logical_idx in range(min(logical_count, len(accelerators))):
-        rec = accelerators[logical_idx]
-        try:
-            props = torch_npu.npu.get_device_properties(logical_idx)
-            if rec.get("memory_gb") is None and props.total_memory:
-                rec["memory_gb"] = round(props.total_memory / (1024 ** 3), 1)
-            if rec.get("name") in (None, "Huawei Ascend NPU") and props.name:
-                rec["name"] = f"Huawei Ascend {props.name.strip()}"
-        except Exception:
-            continue
-
-
-def _parse_npu_smi_table(out: str, cann_version: str) -> list[dict]:
-    """Parse the tabular output of plain `npu-smi info`.
-
-    The table format has two data rows per device:
-      Row 1: | <NPU_ID>  <ChipName>  | <Health> | <Power> <Temp> <Hugepages> |
-      Row 2: | <ChipID>              | <Bus-Id> | <AICore> <Mem-Usage>  <HBM-Usage(used/total MB)> |
-
-    Example:
-      | 7     910B2               | OK            | 96.5        49                0    / 0             |
-      | 0                         | 0000:42:00.0  | 0           0    / 0          3389 / 65536         |
-    """
-    import re
-    accelerators = []
-    lines = out.splitlines()
-
-    i = 0
-    while i < len(lines):
-        line = lines[i]
-        # Row 1: starts with "| <int>  <ChipName>" — NPU ID and chip name
-        row1 = re.match(r'\|\s*(\d+)\s+(\S+)\s*\|', line)
-        if row1:
-            npu_id = int(row1.group(1))
-            chip_name = row1.group(2).strip()
-            # Row 2 is the very next table row — HBM total is second number in
-            # the last "used / total" pair on that line
-            hbm_total_mb = None
-            if i + 1 < len(lines):
-                row2 = lines[i + 1]
-                # Match "used / total" at end of line, e.g. "3389 / 65536"
-                hbm_match = re.search(r'(\d+)\s*/\s*(\d+)\s*\|?\s*$', row2)
-                if hbm_match:
-                    hbm_total_mb = int(hbm_match.group(2))
-                i += 1  # consume row 2
-
-            memory_gb = round(hbm_total_mb / 1024, 1) if hbm_total_mb else None
-            name = f"Huawei Ascend {chip_name}" if chip_name else "Huawei Ascend NPU"
-            accelerators.append({
-                "index": npu_id,
-                "name": name,
-                "vendor": "Huawei",
-                "memory_gb": memory_gb,
-                "driver_version": cann_version,
-                "firmware_version": None,
-                "supports_bf16": _ascend_supports_bf16(name),
-            })
-        i += 1
-
-    return accelerators
-
-
-def collect_ascend() -> list[dict]:
-    import re
-
-    try:
-        # Primary: plain `npu-smi info` — tabular format with chip name + HBM.
-        # `npu-smi info -l` only returns NPU ID and Chip Count on some firmware
-        # versions (e.g. 24.1.x on openEuler/aarch64) so it is not reliable.
-        out = subprocess.check_output(
-            ["npu-smi", "info"], text=True, stderr=subprocess.DEVNULL
-        )
-        # Parse with a placeholder driver/firmware — filled in per-device below
-        accelerators = _parse_npu_smi_table(out, "unknown")
-
-        if not accelerators:
-            # Secondary: try -l in case this firmware uses key-value format
-            out_l = subprocess.check_output(
-                ["npu-smi", "info", "-l"], text=True, stderr=subprocess.DEVNULL
-            )
-            current_npu: dict | None = None
-            for line in out_l.splitlines():
-                npu_match = re.search(r'NPU\s+ID\s*:\s*(\d+)', line, re.IGNORECASE)
-                if npu_match:
-                    if current_npu:
-                        current_npu["supports_bf16"] = _ascend_supports_bf16(current_npu.get("name", ""))
-                        accelerators.append(current_npu)
-                    current_npu = {
-                        "index": int(npu_match.group(1)),
-                        "name": "Huawei Ascend NPU",
-                        "vendor": "Huawei",
-                        "memory_gb": None,
-                        "driver_version": "unknown",
-                        "firmware_version": None,
-                    }
-                if current_npu is None:
-                    continue
-                chip_match = re.search(r'Chip\s+Name\s*:\s*(.+)', line, re.IGNORECASE)
-                if chip_match:
-                    current_npu["name"] = f"Huawei Ascend {chip_match.group(1).strip()}"
-                mem_match = re.search(r'HBM\s+Capacity.*?:\s*(\d+)', line, re.IGNORECASE)
-                if mem_match:
-                    current_npu["memory_gb"] = round(int(mem_match.group(1)) / 1024, 1)
-                if current_npu["memory_gb"] is None:
-                    mem_match2 = re.search(r'Memory\s+Capacity.*?:\s*(\d+)\s*MB', line, re.IGNORECASE)
-                    if mem_match2:
-                        current_npu["memory_gb"] = round(int(mem_match2.group(1)) / 1024, 1)
-                if current_npu["firmware_version"] is None:
-                    fw_match = re.search(r'Firmware\s+Version\s*:\s*(.+)', line, re.IGNORECASE)
-                    if fw_match:
-                        current_npu["firmware_version"] = fw_match.group(1).strip()
-            if current_npu:
-                current_npu["supports_bf16"] = _ascend_supports_bf16(current_npu.get("name", ""))
-                accelerators.append(current_npu)
-
-        if accelerators:
-            # Enrich driver_version and firmware_version per device via -t board
-            for rec in accelerators:
-                board_info = _get_npu_board_info(str(rec["index"]))
-                rec["driver_version"] = board_info["driver_version"]
-                rec["firmware_version"] = board_info["firmware_version"]
-            # Enrich any still-missing memory/name via torch_npu runtime API
-            _ascend_enrich_via_torch_npu(accelerators)
-            return accelerators
-
-    except Exception:
-        pass
+# ─── Plug-in invocation helpers ───────────────────────────────────────────────
 
-    return []
 
+def _call_optional(mod: ModuleType, name: str, *args, default=None):
+    """Call an optional plug-in function defensively.
 
-def _get_npu_board_info(npu_id: str) -> dict:
-    """Query driver version and firmware version for a single NPU via -t board.
-
-    `npu-smi info -t board -i <NPU_ID>` returns key-value fields including:
-      Software Version  : 24.1.0.3   (driver / npu-smi package version)
-      Firmware Version  : NA          (NA means not available on this board)
-
-    Returns dict with keys "driver_version" and "firmware_version".
-    Falls back to CANN install-path files for driver_version if the command
-    fails or produces no match.
+    Plug-ins are third-party-ish code paths: any exception from one
+    must not break the whole environment report. Missing attributes are
+    treated identically to functions that returned ``default``.
     """
-    import re
-
-    result = {"driver_version": "unknown", "firmware_version": None}
-
-    try:
-        out = subprocess.check_output(
-            ["npu-smi", "info", "-t", "board", "-i", npu_id],
-            text=True, stderr=subprocess.DEVNULL
-        )
-        for line in out.splitlines():
-            # Software Version is the driver/npu-smi package version
-            sw_match = re.search(r'Software\s+Version\s*:\s*(.+)', line, re.IGNORECASE)
-            if sw_match:
-                result["driver_version"] = sw_match.group(1).strip()
-            # Firmware Version — treat "NA" as not available
-            fw_match = re.search(r'Firmware\s+Version\s*:\s*(.+)', line, re.IGNORECASE)
-            if fw_match:
-                fw = fw_match.group(1).strip()
-                result["firmware_version"] = None if fw.upper() == "NA" else fw
-    except Exception:
-        pass
-
-    # Fallback for driver_version: CANN toolkit install path
-    if result["driver_version"] == "unknown":
-        for cann_path in ["/usr/local/Ascend/ascend-toolkit/latest", "/usr/local/Ascend/nnae/latest"]:
-            version_file = Path(cann_path) / "version.cfg"
-            if version_file.exists():
-                try:
-                    text = version_file.read_text()
-                    m = re.search(r'Version=(.+)', text)
-                    if m:
-                        result["driver_version"] = f"CANN {m.group(1).strip()}"
-                        break
-                except Exception:
-                    pass
-
-    return result
-
-def _apple_supports_bf16(chip_name: str) -> bool:
-    """M1 has limited/slow BF16. M2+ has full hardware BF16."""
-    if not chip_name:
-        return True
-    name_lower = chip_name.lower()
-    # M1 variants: "Apple M1", "Apple M1 Pro", "Apple M1 Max", "Apple M1 Ultra"
-    if "m1" in name_lower and "m10" not in name_lower:  # avoid matching "m10x"
-        return False
-    return True  # M2, M3, M4 and unknown — assume supported
-
-
-def _apple_silicon_brand() -> str | None:
-    """Return SoC marketing name (e.g. 'Apple M3 Pro') if this is Apple Silicon, else None."""
-    try:
-        chip = subprocess.check_output(
-            ["sysctl", "-n", "machdep.cpu.brand_string"],
-            text=True,
-            stderr=subprocess.DEVNULL,
-        ).strip()
-        return chip if "Apple" in chip else None
-    except Exception:
-        return None
-
-
-def _macos_build_string() -> str:
-    """Product + build for reproducibility on local Macs."""
+    fn = getattr(mod, name, None)
+    if fn is None:
+        return default
     try:
-        ver = subprocess.check_output(
-            ["sw_vers", "-productVersion"], text=True, stderr=subprocess.DEVNULL
-        ).strip()
-        build = subprocess.check_output(
-            ["sw_vers", "-buildVersion"], text=True, stderr=subprocess.DEVNULL
-        ).strip()
-        return f"macOS {ver} (build {build})"
+        return fn(*args)
     except Exception:
-        v = platform.mac_ver()[0]
-        return f"macOS {v}" if v else "macOS"
+        return default
 
 
-def _apple_metal_summary() -> str | None:
-    """Best-effort Metal support line from system_profiler (may take a few seconds)."""
-    try:
-        proc = subprocess.run(
-            ["system_profiler", "SPDisplaysDataType", "-json"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL,
-            text=True,
-            timeout=30,
-        )
-        if proc.returncode != 0 or not proc.stdout:
-            return None
-        data = json.loads(proc.stdout)
-        displays = data.get("SPDisplaysDataType") or []
-        for disp in displays:
-            if not isinstance(disp, dict):
-                continue
-            for key, val in disp.items():
-                kl = key.lower()
-                if ("metal" in kl or "mtl" in kl) and val:
-                    return f"GPU runtime ({key}): {val}"
-        return None
-    except Exception:
-        return None
-
+def _collect_accelerators(plugins: Iterable[ModuleType]) -> tuple[list[dict], ModuleType | None]:
+    """Return ``(records, plugin)`` for the first plug-in whose ``collect()`` is non-empty, else ``([], None)``."""
+    for mod in plugins:
+        result = _call_optional(mod, "collect", default=[]) or []  # ``or []`` normalises a falsy/None return
+        if result:
+            return list(result), mod  # list() hands back a fresh list alongside the winning plug-in
+    return [], None
 
-def collect_apple() -> list[dict]:
-    """Detect Apple Silicon chips (M1/M2/M3/M4 series) in the local environment."""
-    try:
-        chip = _apple_silicon_brand()
-        if not chip:
-            return []
-        mem_bytes = int(
-            subprocess.check_output(
-                ["sysctl", "-n", "hw.memsize"],
-                text=True,
-                stderr=subprocess.DEVNULL,
-            ).strip()
-        )
-        return [{
-            "index": 0,
-            "name": chip,
-            "vendor": "Apple",
-            "memory_gb": round(mem_bytes / (1024**3), 1),
-            "driver_version": _macos_build_string(),
-            "firmware_version": None,
-            "compute_capability": None,
-            "supports_bf16": _apple_supports_bf16(chip),
-        }]
-    except Exception:
-        return []
 
+def _detect_first(plugins: Iterable[ModuleType], fn_name: str, active: ModuleType | None) -> str | None:
+    """Return the first non-empty answer to ``fn_name()`` across plug-ins.
 
-def collect_tpu() -> list[dict]:
-    """Detect Google Cloud TPU chips via tpu_info and JAX."""
-    try:
-        from tpu_inference import tpu_info
-
-        num_chips = tpu_info.get_num_chips()
-        tpu_type  = tpu_info.get_tpu_type()   # e.g. "v5e-1", "v5litepod-4", "v6e-8"
-        node_name = tpu_info.get_node_name()  # GCE instance ID, or None on Colab
-
-        if not num_chips or num_chips == 0:
-            return []
-
-        # Resolve human-readable chip name and HBM per chip.
-        # tpu_type on Colab is "v5e-1" (not "v5litepod-1") — handled by the mapping.
-        chip_name, memory_gb = _tpu_chip_name_and_memory(tpu_type)
-
-        # get_num_cores_per_chip() in tpu_info checks for "v5litepod" or "v6e"
-        # but Colab returns "v5e-1", so it falls through to the default of 2,
-        # which is wrong. Correct it by checking the tpu_type string ourselves.
-        t = (tpu_type or "").lower()
-        if "v5litepod" in t or "v5e" in t or "v6e" in t or "trillium" in t:
-            num_cores_per_chip = 1
-        else:
-            num_cores_per_chip = tpu_info.get_num_cores_per_chip()
-
-        # Try to get more detail from JAX (version, memory stats)
-        jax_version = "unknown"
-        try:
-            import jax
-            jax_version = jax.__version__
-            jax_devices = jax.devices()
-            # jax.devices() on v5e-1:
-            # [TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0)]
-            if jax_devices and memory_gb is None:
-                mem = getattr(jax_devices[0], "memory_stats", None)
-                if mem and "bytes_limit" in mem:
-                    memory_gb = round(mem["bytes_limit"] / (1024 ** 3), 1)
-        except Exception:
-            pass
-
-        # Build one record per chip
-        accelerators = []
-        for i in range(num_chips):
-            accelerators.append({
-                "index":              i,
-                "name":               chip_name,
-                "vendor":             "Google",
-                "memory_gb":          memory_gb,
-                "driver_version":     f"JAX {jax_version}" if jax_version != "unknown" else "unknown",
-                "firmware_version":   None,
-                "tpu_type":           tpu_type,      # raw type string e.g. "v5e-1"
-                "tpu_node_name":      node_name,     # None on Colab (GCE metadata 404)
-                "num_cores_per_chip": num_cores_per_chip,
-                "supports_bf16":      True,          # all modern TPU generations
-            })
-        return accelerators
-
-    except Exception:
-        return []
-
-
-def _tpu_chip_name_and_memory(tpu_type: str | None) -> tuple[str, float | None]:
+    The active plug-in (the one whose ``collect()`` succeeded) is tried
+    first so vendor-specific information is preferred over generic
+    fallbacks.
     """
-    Map GCE accelerator-type string to a human-readable chip name and HBM size.
-
-    tpu_type examples and their meanings:
-      "v5litepod-1"  → v5e, 1 chip,  16 GiB HBM per chip
-      "v5litepod-4"  → v5e, 4 chips, 16 GiB HBM per chip
-      "v5litepod-8"  → v5e, 8 chips, 16 GiB HBM per chip
-      "v6e-1"        → v6e (Trillium), 1 chip, 32 GiB HBM per chip
-      "v6e-8"        → v6e, 8 chips, 32 GiB HBM per chip
-      "v7x-1"        → v7x (Ironwood), 1 chip, 192 GiB HBM per chip
-      "v4-8"         → v4, 8 chips, 32 GiB HBM per chip
-      "v5p-8"        → v5p, 8 chips, 95 GiB HBM per chip
-
-    Returns (chip_name, memory_gb_per_chip). memory_gb may be None for
-    unknown types.
-    """
-    if not tpu_type:
-        return "Google TPU", None
-
-    t = tpu_type.lower()
-
-    if "v5litepod" in t or "v5e" in t:
-        return "Google TPU v5e", 16.0
-    if "v6e" in t or "trillium" in t:
-        return "Google TPU v6e", 32.0
-    if "v7x" in t or "ironwood" in t:
-        return "Google TPU v7x", 192.0
-    if "v5p" in t:
-        return "Google TPU v5p", 95.0
-    if "v4" in t:
-        return "Google TPU v4", 32.0
-    if "v3" in t:
-        return "Google TPU v3", 16.0
-    if "v2" in t:
-        return "Google TPU v2", 8.0
-
-    # Unknown type — return the raw string as name, no memory info
-    return f"Google TPU ({tpu_type})", None
-
-
-def collect_topology() -> str | None:
-    brand = _apple_silicon_brand()
-    if brand:
-        lines = [
-            f"Apple Silicon — integrated GPU in {brand} (unified system memory).",
-            f"Machine: {platform.machine()}",
-        ]
-        metal = _apple_metal_summary()
-        if metal:
-            lines.append(metal)
-        return "\n".join(lines)
-
-    # Use --no-color flag for nvidia-smi to strip ANSI escape codes from output
-    for cmd in [["nvidia-smi", "topo", "-m", "--no-color"], ["rocm-smi", "--showtopo"]]:
-        try:
-            return subprocess.check_output(cmd, text=True)
-        except Exception:
-            continue
-    # Fallback: try without --no-color and strip ANSI manually
-    try:
-        import re
-        out = subprocess.check_output(["nvidia-smi", "topo", "-m"], text=True)
-        ansi_escape = re.compile(r'\x1b\[[0-9;]*m')
-        return ansi_escape.sub('', out)
-    except Exception:
-        return None
-
+    ordered: list[ModuleType] = []
+    if active is not None:
+        ordered.append(active)
+    ordered.extend(p for p in plugins if p is not active)
+    for mod in ordered:
+        answer = _call_optional(mod, fn_name)
+        if answer:
+            return answer
+    return None
 
-def collect_cpu() -> dict:
-    # Try to get real CPU model name from /proc/cpuinfo (Linux)
-    cpu_model = _get_cpu_model()
 
-    try:
-        import psutil
-        return {
-            "model": cpu_model,
-            "physical_cores": psutil.cpu_count(logical=False) or 1,
-            "logical_cores": psutil.cpu_count(logical=True) or 1,
-            "numa_nodes": _get_numa_nodes(),
-        }
-    except ImportError:
-        return {
-            "model": cpu_model,
-            "physical_cores": 1,
-            "logical_cores": 1,
-            "numa_nodes": _get_numa_nodes(),
-        }
+# ─── CPU / memory / OS — vendor-independent ─────────────────────────────────
 
 
 def _get_cpu_model() -> str:
-    # Linux: read from /proc/cpuinfo
     try:
         with open("/proc/cpuinfo") as f:
             for line in f:
@@ -720,7 +125,6 @@ def _get_cpu_model() -> str:
                     return line.split(":", 1)[1].strip()
     except Exception:
         pass
-    # Try lscpu for model name (works on aarch64)
     try:
         out = subprocess.check_output(["lscpu"], text=True, stderr=subprocess.DEVNULL)
         for line in out.splitlines():
@@ -728,11 +132,11 @@ def _get_cpu_model() -> str:
                 return line.split(":", 1)[1].strip()
     except Exception:
         pass
-    # macOS fallback
     try:
         out = subprocess.check_output(
             ["sysctl", "-n", "machdep.cpu.brand_string"],
-            text=True, stderr=subprocess.DEVNULL
+            text=True,
+            stderr=subprocess.DEVNULL,
         )
         return out.strip()
     except Exception:
@@ -741,7 +145,6 @@ def _get_cpu_model() -> str:
 
 
 def _get_numa_nodes() -> int:
-    # Try lscpu for NUMA node count
     try:
         out = subprocess.check_output(["lscpu"], text=True, stderr=subprocess.DEVNULL)
         for line in out.splitlines():
@@ -749,7 +152,6 @@ def _get_numa_nodes() -> int:
                 return int(line.split(":")[1].strip())
     except Exception:
         pass
-    # Fallback: count /sys/devices/system/node/node* directories
     try:
         nodes = list(Path("/sys/devices/system/node").glob("node[0-9]*"))
         if nodes:
@@ -759,13 +161,33 @@ def _get_numa_nodes() -> int:
     return 1
 
 
+def collect_cpu() -> dict:
+    cpu_model = _get_cpu_model()
+    try:
+        import psutil
+
+        return {
+            "model": cpu_model,
+            "physical_cores": psutil.cpu_count(logical=False) or 1,
+            "logical_cores": psutil.cpu_count(logical=True) or 1,
+            "numa_nodes": _get_numa_nodes(),
+        }
+    except ImportError:
+        return {
+            "model": cpu_model,
+            "physical_cores": 1,
+            "logical_cores": 1,
+            "numa_nodes": _get_numa_nodes(),
+        }
+
+
 def collect_memory_gb() -> float:
     try:
         import psutil
-        return round(psutil.virtual_memory().total / (1024**3), 1)
+
+        return round(psutil.virtual_memory().total / (1024 ** 3), 1)
     except ImportError:
         pass
-    # macOS / Apple Silicon: sysctl when psutil is not installed
     if platform.system() == "Darwin":
         try:
             mem = int(
@@ -775,177 +197,47 @@ def collect_memory_gb() -> float:
                     stderr=subprocess.DEVNULL,
                 ).strip()
             )
-            return round(mem / (1024**3), 1)
+            return round(mem / (1024 ** 3), 1)
         except Exception:
             pass
-    # Linux: MemTotal in kB when psutil is unavailable
     if platform.system() == "Linux":
         try:
             with open("/proc/meminfo") as f:
                 for line in f:
                     if line.startswith("MemTotal:"):
                         kb = int(line.split()[1])
-                        return round(kb / (1024**2), 1)
+                        return round(kb / (1024 ** 2), 1)
         except Exception:
             pass
     return 0.0
 
 
-def detect_pcie_gen() -> str:
-    if _apple_silicon_brand():
-        return "SoC integrated (no discrete PCIe GPU)"
-
-    # Try nvidia-smi for PCIe generation
-    try:
-        out = subprocess.check_output(
-            ["nvidia-smi", "--query-gpu=pcie.link.gen.current",
-             "--format=csv,noheader"],
-            text=True
-        )
-        gen = out.strip().splitlines()[0].strip()
-        if gen.isdigit():
-            return f"PCIe Gen {gen}"
-    except Exception:
-        pass
-    return "unknown"
-
-
-def detect_runtime_version() -> str:
-    # Try torch first — most reliable when vLLM is installed
-    try:
-        import torch
-        if torch.version.cuda:
-            return f"CUDA {torch.version.cuda}"
-    except ImportError:
-        pass
-
-    # Try nvcc
-    try:
-        out = subprocess.check_output(
-            ["nvcc", "--version"],
-            text=True, stderr=subprocess.STDOUT
-        )
-        for line in out.splitlines():
-            if "release" in line.lower():
-                # e.g. "Cuda compilation tools, release 12.2, V12.2.140"
-                parts = line.split("release")
-                if len(parts) > 1:
-                    version = parts[1].split(",")[0].strip()
-                    return f"CUDA {version}"
-    except Exception:
-        pass
-
-    # Try reading from CUDA_HOME
-    for env_var in ["CUDA_HOME", "CUDA_PATH"]:
-        cuda_home = os.environ.get(env_var)
-        if cuda_home:
-            version_file = Path(cuda_home) / "version.txt"
-            if version_file.exists():
-                return version_file.read_text().strip()
-            version_json = Path(cuda_home) / "version.json"
-            if version_json.exists():
-                try:
-                    data = json.loads(version_json.read_text())
-                    cuda = data.get("cuda", {}).get("version", "")
-                    if cuda:
-                        return f"CUDA {cuda}"
-                except Exception:
-                    pass
-
-    # Try ROCm
-    try:
-        out = subprocess.check_output(
-            ["rocm-smi", "--version"],
-            text=True, stderr=subprocess.STDOUT
-        )
-        return f"ROCm {out.strip().splitlines()[-1]}"
-    except Exception:
-        pass
-
-    # Try Ascend/CANN — reuse board info from npu-smi
-    try:
-        import re as _re
-        info_out = subprocess.check_output(
-            ["npu-smi", "info"], text=True, stderr=subprocess.DEVNULL
-        )
-        m = _re.search(r'\|\s*(\d+)\s+\S+\s*\|', info_out)
-        if m:
-            board = _get_npu_board_info(m.group(1))
-            if board["driver_version"] != "unknown":
-                return f"CANN {board['driver_version']}"
-    except Exception:
-        pass
-
-    # Try JAX/TPU
-    try:
-        import jax
-        return f"JAX {jax.__version__}"
-    except ImportError:
-        pass
-
-    # Apple Silicon — local MLX / Metal (PyTorch MPS) / bare macOS
-    if platform.system() == "Darwin" and _apple_silicon_brand():
-        try:
-            import mlx
-
-            ver = getattr(mlx, "__version__", None)
-            if ver is None:
-                try:
-                    from importlib.metadata import version as _pkg_version
-
-                    ver = _pkg_version("mlx")
-                except Exception:
-                    ver = None
-            return f"MLX {ver}" if ver else "MLX"
-        except ImportError:
-            pass
-        try:
-            import torch
-            if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
-                return f"Metal MPS (PyTorch {torch.__version__})"
-        except ImportError:
-            pass
-        return _macos_build_string()
-
-    return "unknown"
-
-
 def collect_network_interfaces() -> list[dict] | None:
     """Collect high-speed network interfaces (InfiniBand, RoCE)."""
-    interfaces = []
-
-    # Check for InfiniBand via ibstat
+    interfaces: list[dict] = []
     try:
         out = subprocess.check_output(
             ["ibstat"], text=True, stderr=subprocess.DEVNULL
         )
-        # Count CA (Channel Adapter) entries
         import re
+
         cas = re.findall(r"^CA '(.+)'", out, re.MULTILINE)
         for ca in cas:
-            interfaces.append({
-                "name": ca,
-                "type": "InfiniBand",
-                "bandwidth_gbps": None,  # would need ibstatus for detailed info
-            })
+            interfaces.append(
+                {"name": ca, "type": "InfiniBand", "bandwidth_gbps": None}
+            )
     except Exception:
         pass
 
-    # Check for mlx5 devices from topology output (already collected)
-    # These appear as NIC0: mlx5_0 etc. in nvidia-smi topo output
-    # We just note their presence here
     try:
         out = subprocess.check_output(
-            ["ls", "/sys/class/infiniband"],
-            text=True, stderr=subprocess.DEVNULL
+            ["ls", "/sys/class/infiniband"], text=True, stderr=subprocess.DEVNULL
         )
         for dev in out.strip().splitlines():
             if dev and not any(i["name"] == dev for i in interfaces):
-                interfaces.append({
-                    "name": dev,
-                    "type": "InfiniBand/RoCE",
-                    "bandwidth_gbps": None,
-                })
+                interfaces.append(
+                    {"name": dev, "type": "InfiniBand/RoCE", "bandwidth_gbps": None}
+                )
     except Exception:
         pass
 
@@ -953,10 +245,9 @@ def collect_network_interfaces() -> list[dict] | None:
 
 
 def detect_os_version() -> str:
-    # Try /etc/os-release first (Linux standard)
     try:
         with open("/etc/os-release") as f:
-            info = {}
+            info: dict[str, str] = {}
             for line in f:
                 if "=" in line:
                     k, v = line.strip().split("=", 1)
@@ -965,7 +256,6 @@ def detect_os_version() -> str:
                 return info["PRETTY_NAME"]
     except Exception:
         pass
-    # macOS fallback
     mac_ver = platform.mac_ver()[0]
     if mac_ver:
         return f"macOS {mac_ver}"
@@ -976,85 +266,86 @@ def detect_python_version() -> str:
     return platform.python_version()
 
 
-def detect_intra_node_interconnect() -> str | None:
-    """Detect intra-node GPU interconnect type from nvidia-smi topology output.
-
-    Returns e.g. 'NVLink' if NVLink connections exist between GPUs, None otherwise.
-    The topology is already collected by collect_topology(), but we re-query here
-    to keep this function self-contained and callable independently.
-    """
-    try:
-        import re
-        out = subprocess.check_output(
-            ["nvidia-smi", "topo", "-m", "--no-color"],
-            text=True, stderr=subprocess.DEVNULL,
-        )
-        # NV# entries (e.g. NV12, NV18) in the topology matrix indicate NVLink
-        if re.search(r'\bNV\d+\b', out):
-            return "NVLink"
-    except Exception:
-        pass
-    # Fallback: try without --no-color
-    try:
-        import re
-        out = subprocess.check_output(
-            ["nvidia-smi", "topo", "-m"],
-            text=True, stderr=subprocess.DEVNULL,
-        )
-        if re.search(r'\bNV\d+\b', out):
-            return "NVLink"
-    except Exception:
-        pass
-    if _apple_silicon_brand():
-        return "SoC unified memory (no GPU-GPU interconnect)"
-    return None
-
-
 def detect_pytorch_version() -> str:
     try:
         import torch
+
         return torch.__version__
     except ImportError:
         return "unknown"
 
 
-def main():
+# ─── Warning aggregation ────────────────────────────────────────────────────
+
+
+def _emit_global_warnings(env: dict, accelerators: list[dict]) -> None:
+    """Warnings independent of any specific accelerator family."""
+    if not _have_psutil():
+        _print_warning(
+            "Package 'psutil' is not installed — CPU physical/logical core counts may "
+            "default to 1; install with: pip install psutil"
+        )
+    if float(env.get("system_memory_gb") or 0) == 0.0:
+        _print_warning(
+            "system_memory_gb is 0 — RAM could not be determined. "
+            "Install psutil (pip install psutil) or ensure /proc/meminfo (Linux) / "
+            "sysctl hw.memsize (macOS) is available."
+        )
+    if not accelerators and (
+        os.environ.get("TPU_NAME") or os.environ.get("CLOUD_TPU_TASK")
+    ):
+        _print_warning(
+            "TPU-related environment variables are set but no TPU devices were detected — "
+            "install jax / tpu_inference when running on Cloud TPU."
+        )
+
+
+# ─── Main orchestration ─────────────────────────────────────────────────────
+
+
+def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("--output", required=True)
     args = parser.parse_args()
 
     print("Collecting environment info...")
 
-    # Try each vendor in order
-    accelerators = (
-        collect_nvidia() or
-        collect_amd() or
-        collect_ascend() or
-        collect_apple() or
-        collect_tpu() or
-        []
-    )
+    plugins = discover_plugins()
+    accelerators, active = _collect_accelerators(plugins)
     if not accelerators:
         _print_warning("No accelerators detected. Collecting CPU-only info.")
 
+    runtime_version = _detect_first(plugins, "detect_runtime_version", active) or "unknown"
+    pcie_generation = _detect_first(plugins, "detect_pcie_gen", active) or "unknown"
+    topology = _detect_first(plugins, "detect_topology", active)
+    intra_node = _detect_first(plugins, "detect_intra_node_interconnect", active)
+
     env = {
         "collected_at": datetime.now(timezone.utc).isoformat(),
         "accelerators": accelerators,
-        "accelerator_topology": collect_topology(),
-        "intra_node_interconnect": detect_intra_node_interconnect(),
+        "accelerator_platform": getattr(active, "ID", None) if active else None,
+        "accelerator_topology": topology,
+        "intra_node_interconnect": intra_node,
         "cpu": collect_cpu(),
         "system_memory_gb": collect_memory_gb(),
-        "pcie_generation": detect_pcie_gen(),
+        "pcie_generation": pcie_generation,
         "cpu_accelerator_bandwidth_gbs": None,
         "network_interfaces": collect_network_interfaces(),
         "os": detect_os_version(),
         "python_version": detect_python_version(),
         "kernel_version": platform.release(),
-        "runtime_version": detect_runtime_version(),
+        "runtime_version": runtime_version,
         "pytorch_version": detect_pytorch_version(),
     }
 
-    _warn_for_incomplete_env_report(env, accelerators)
+    _emit_global_warnings(env, accelerators)
+    # Plug-in-specific diagnostics — only the *active* plug-in (the one
+    # whose accelerators were collected) gets to emit hardware-specific
+    # advice, so we do not produce a wall of irrelevant warnings on
+    # hosts that happen to have e.g. nvidia-smi installed but no GPU.
+    if active is not None:
+        for note in _call_optional(active, "diagnostics", env, accelerators, default=[]) or []:
+            _print_warning(note)
 
     out_path = Path(args.output)
     out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -1062,7 +353,6 @@ def main():
         json.dump(env, f, indent=2)
     print(f"Environment info written to {out_path}")
 
-    # Print summary
     chip_names = [a["name"] for a in accelerators]
     print(f"  Accelerators: {chip_names}")
     print(f"  CPU: {env['cpu']['model']}")
@@ -1075,4 +365,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/runners/google_vllm_tpu_68cc9ffa/meta.json b/runners/google_vllm_tpu_68cc9ffa/meta.json
index 14fb3d89..8093ca20 100644
--- a/runners/google_vllm_tpu_68cc9ffa/meta.json
+++ b/runners/google_vllm_tpu_68cc9ffa/meta.json
@@ -7,5 +7,15 @@
   "description": "AccelMark runner for Google Cloud TPU using vllm-tpu (tpu-inference JAX/XLA backend). BF16 only, offline scenario. Supports JAX-native architectures: LlamaForCausalLM, Qwen3ForCausalLM. Compatible with v5e, v6e, v7x.",
   "supersedes_chain": [],
   "notes": "Qwen2ForCausalLM (Qwen2.5-0.5B-Instruct) is not supported by tpu-inference JAX registry — use models_local.yaml to substitute Qwen3-0.6B for Suite F. Two Colab/Jupyter patches applied at module level: VLLM_ENABLE_V1_MULTIPROCESSING=0 and suppress_stdout no-op. Validated on v5e-1 (Colab); should work on v6e and v7x without code changes.",
-  "created": "2026-04-18"
+  "created": "2026-04-18",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "unsupported",
+    "C": "unsupported",
+    "D": "validated",
+    "E": "unsupported",
+    "F": "validated",
+    "G": "unsupported"
+  }
 }
\ No newline at end of file
diff --git a/runners/meta.schema.json b/runners/meta.schema.json
index 2e3c1b7d..b50603bc 100644
--- a/runners/meta.schema.json
+++ b/runners/meta.schema.json
@@ -1,6 +1,7 @@
 {
   "$schema": "http://json-schema.org/draft-07/schema#",
   "title": "AccelMark runner meta.json",
+  "$comment": "Schema for runners/*/meta.json. Note that `platform` is validated by a regex pattern rather than an enum — adding a runner for a new accelerator family does NOT require modifying this schema. See schema/platforms.json for the curated presentation catalog (display name, sort order) that the README matrix generator consults.",
   "type": "object",
   "required": ["id", "platform", "name", "framework", "submitted_by", "description"],
   "additionalProperties": true,
@@ -12,8 +13,8 @@
     },
     "platform": {
       "type": "string",
-      "enum": ["nvidia", "amd", "ascend", "apple", "google", "other"],
-      "description": "Hardware platform this runner targets"
+      "pattern": "^[a-z][a-z0-9]*$",
+      "description": "Hardware platform identifier: lowercase ASCII letters and digits only, starting with a letter (no underscores, hyphens, or uppercase). Should match the prefix of the runner folder name. Known platforms (nvidia, amd, ascend, apple, google, moorethreads, other) are catalogued in schema/platforms.json — using a new identifier is allowed and only triggers a non-fatal warning from validate_runners.py prompting an optional follow-up PR."
     },
     "name": {
       "type": "string",
@@ -53,6 +54,32 @@
       "type": ["string", "null"],
       "description": "ID of the runner that supersedes this one. Set by maintainer after a newer runner is merged.",
       "pattern": "^[a-z][a-z0-9]*_[a-z][a-z0-9_]*_[0-9a-f]{8}$"
+    },
+    "hardware_label": {
+      "type": ["string", "null"],
+      "description": "Optional override for the 'Hardware' column in the README matrix. When null/missing, the generator falls back to the display_name of this platform from schema/platforms.json. Use this when a runner targets a more specific scope than the platform (e.g. 'NVIDIA V100 (SM70)' for a V100-only fork).",
+      "maxLength": 80
+    },
+    "suite_support": {
+      "type": ["object", "null"],
+      "description": "Self-declared support level for each AccelMark suite. Keys are suite letters (A through G); each value is one of the suiteStatus strings describing the support status. The README platforms matrix is generated from these fields, so runner authors do not have to manually edit README.md.",
+      "additionalProperties": false,
+      "properties": {
+        "A": { "$ref": "#/definitions/suiteStatus" },
+        "B": { "$ref": "#/definitions/suiteStatus" },
+        "C": { "$ref": "#/definitions/suiteStatus" },
+        "D": { "$ref": "#/definitions/suiteStatus" },
+        "E": { "$ref": "#/definitions/suiteStatus" },
+        "F": { "$ref": "#/definitions/suiteStatus" },
+        "G": { "$ref": "#/definitions/suiteStatus" }
+      }
+    }
+  },
+  "definitions": {
+    "suiteStatus": {
+      "type": "string",
+      "enum": ["validated", "pending", "unsupported"],
+      "description": "validated → suite has been smoke-tested on this runner and produces results (rendered as ✓). pending → runner author believes it should work but has not validated end-to-end yet (rendered as ⋯). unsupported → suite cannot run on this runner due to hardware/framework limitations (rendered as —)."
     }
   }
 }
diff --git a/runners/nvidia_sglang_6da83845/meta.json b/runners/nvidia_sglang_6da83845/meta.json
deleted file mode 100644
index 91bf373a..00000000
--- a/runners/nvidia_sglang_6da83845/meta.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "id": "nvidia_sglang_6da83845",
-  "platform": "nvidia",
-  "name": "SGLang on NVIDIA",
-  "framework": "SGLang",
-  "submitted_by": "JuhaoLiang1997",
-  "description": "AccelMark runner for NVIDIA GPUs using SGLang. Supports all suites (A–F). Enables direct throughput and latency comparison between SGLang and vLLM on the same hardware.",
-  "supersedes_chain": [],
-  "notes": "Decouple runners from suite and scenario knowledge — load_model() uses use_async from parallelism dict instead of checking scenario name.",
-  "created": "2026-04-03"
-}
diff --git a/runners/nvidia_sglang_6da83845/requirements.txt b/runners/nvidia_sglang_6da83845/requirements.txt
deleted file mode 100644
index ce614e36..00000000
--- a/runners/nvidia_sglang_6da83845/requirements.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-# AccelMark — NVIDIA SGLang runner dependencies
-# Tested combination: torch 2.5.1 + sglang 0.4.x + CUDA 12.1
-#
-# SGLang install — use the official release wheel:
-#   pip install sglang[all]
-# or for a pinned version:
-#   pip install "sglang[all]==0.4.0"
-#
-# See https://sgl-project.github.io/start/install.html for full options.
-
-# Core
-torch==2.5.1
-torchvision==0.20.1
-torchaudio==2.5.1
-
-# LLM inference
-sglang[all]>=0.4.0
-
-# Transformers (for tokenizer)
-transformers==4.46.3
-tokenizers>=0.20.0
-huggingface-hub>=0.25.0
-accelerate>=0.34.0
-
-# AccelMark dependencies
-numpy>=1.24.0
-jsonschema>=4.0.0
-psutil>=5.9.0
-tqdm>=4.66.0
-
-# NVIDIA monitoring
-nvidia-ml-py>=12.0.0
-
-# Async support
-aiohttp>=3.9.0
-
-# Config file parsing
-pyyaml>=6.0
diff --git a/runners/nvidia_sglang_6da83845/runner.py b/runners/nvidia_sglang_6da83845/runner.py
deleted file mode 100644
index aa49b270..00000000
--- a/runners/nvidia_sglang_6da83845/runner.py
+++ /dev/null
@@ -1,365 +0,0 @@
-"""
-AccelMark — NVIDIA SGLang benchmark runner.
-
-Implements BenchmarkRunner for SGLang on NVIDIA GPUs.
-All orchestration logic lives in runners/benchmark_runner.py.
-
-SGLang differences from vLLM:
-  - Offline: uses sglang.Engine (sync, batched)
-  - Online/interactive: uses sglang.AsyncEngine with async generator streaming
-  - Streaming output is cumulative (same as vLLM) — delta sliced by prev_length
-  - Quantization loaded via engine_kwargs["quantization"] or dtype="fp8"
-  - Memory query uses torch.cuda.max_memory_allocated (same as vLLM)
-"""
-
-import asyncio
-import sys
-import time
-from pathlib import Path
-from typing import Optional
-
-# Add repo root to path
-_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
-sys.path.insert(0, str(_REPO_ROOT))
-
-import torch
-from transformers import AutoTokenizer
-
-from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
-from loadgen.types import InferenceResult
-
-
-
-# Suppress per-request SGLang logs
-import logging
-logging.getLogger("sglang").setLevel(logging.WARNING)
-
-
-class SGLangRunner(BenchmarkRunner):
-    """AccelMark benchmark runner using SGLang on NVIDIA GPUs."""
-
-    SUPPORTS_STREAMING  = True
-    SUPPORTS_BATCHING   = True
-    SUPPORTS_ONLINE     = True
-    SUPPORTS_MULTI_CHIP = True
-
-    # SGLang on NVIDIA supports all standard precisions.
-    # Hardware detection in BenchmarkRunner will automatically restrict to
-    # FP16 on older chips (V100, T4) that don't support BF16.
-    SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"]
-
-    # SGLang supports the same quantization formats as vLLM via pre-quantized
-    # checkpoints. FP8 requires H100 or newer (native FP8 tensor cores).
-    # On A100, FP8 weights compute in BF16 — list it here; the leaderboard
-    # will show effective_dtype to clarify.
-    SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"]
-
-    def __init__(self):
-        self.engine        = None   # sglang.Engine (offline/accuracy)
-        self.async_engine  = None   # sglang.AsyncEngine (online/interactive)
-        self.tokenizer     = None
-        self._loop: asyncio.AbstractEventLoop = None
-        self._sampling_params: dict = {}
-
-    def _get_chip_count(self) -> int:
-        """Return the number of available CUDA GPUs."""
-        try:
-            import torch
-            n = torch.cuda.device_count()
-            return n if n > 0 else 1
-        except Exception:
-            return 1
-
-    def _get_framework_name(self) -> str:
-        return "SGLang"
-
-    def _get_framework_version(self) -> str:
-        try:
-            import sglang
-            return sglang.__version__
-        except Exception:
-            return "unknown"
-
-    def load_model(self, model_path: str, parallelism: dict) -> None:
-        """Load model — sync Engine for batch inference, AsyncEngine for streaming."""
-        tp_size       = parallelism["tensor_parallel_size"]
-        ep_size       = parallelism.get("expert_parallel_size", 1)
-        max_tokens    = parallelism["max_tokens"]
-        max_model_len = parallelism["max_model_len"]
-        use_async     = parallelism["use_async"]
-        enforce_eager = getattr(self, "_enforce_eager", False)
-
-        cfg                  = getattr(self, "_runner_config", {})
-        mem_fraction_static  = getattr(self, "_mem_fraction_static", 0.88)
-        extra_kwargs         = dict(cfg.get("engine_kwargs") or {})
-
-        effective_precision = getattr(self, "_effective_precision", "BF16").upper()
-        precision           = getattr(self, "_precision", None) or effective_precision
-
-        _dtype_override  = getattr(self, "_precision_dtype_override", None)
-        _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
-        quantization     = _prec_eng_kwargs.pop("quantization", None)
-
-        _NATIVE_DTYPE_MAP = {"BF16": "bfloat16", "FP16": "float16", "FP32": "float32"}
-        dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
-        self._quantization_method = quantization
-
-        if _dtype_override:
-            dtype = _dtype_override
-        if _prec_eng_kwargs:
-            _prec_eng_kwargs.update(extra_kwargs)
-            extra_kwargs = _prec_eng_kwargs
-
-        print(
-            f"Loading model: precision={precision}, dtype={dtype}"
-            + (f", quantization_method={self._quantization_method}"
-               if self._quantization_method else "")
-        )
-
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path, trust_remote_code=False
-        )
-
-        self._sampling_params = {
-            "max_new_tokens": max_tokens,
-            "temperature":    0.0,
-        }
-
-        engine_kwargs = dict(
-            model_path=model_path,
-            dtype=dtype,
-            tp_size=tp_size,
-            trust_remote_code=False,
-            disable_cuda_graph=enforce_eager,
-            mem_fraction_static=mem_fraction_static,
-            **extra_kwargs,
-        )
-        if ep_size > 1:
-            engine_kwargs["ep_size"] = ep_size
-        if quantization:
-            engine_kwargs["quantization"] = quantization
-        if max_model_len:
-            engine_kwargs["context_length"] = max_model_len
-
-        import sglang as sgl
-
-        if not use_async:
-            self.engine = sgl.Engine(**engine_kwargs)
-        else:
-            self._loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(self._loop)
-            self.async_engine = sgl.AsyncEngine(**engine_kwargs)
-
-    def get_effective_dtype(self) -> Optional[str]:
-        """
-        Report the actual compute dtype SGLang resolved after model loading.
-        SGLang exposes the resolved dtype via engine.server_args.dtype.
-        """
-        try:
-            eng = self.engine or self.async_engine
-            if eng is not None:
-                return str(eng.server_args.dtype).replace("torch.", "")
-        except Exception:
-            pass
-        return getattr(self, "_effective_dtype", None)
-
-    def inference_fn_offline(
-        self, requests: list[InferenceRequest]
-    ) -> list[InferenceResult]:
-        """
-        Synchronous batch inference via sglang.Engine.generate().
-
-        SGLang Engine accepts a list of prompt strings and sampling params,
-        returns results in the same order. total_time_ms is set to the
-        wall-clock elapsed time of the entire batch — the correct denominator
-        for throughput = total_tokens / elapsed.
-        """
-        formatted  = [self.format_prompt(r.prompt) for r in requests]
-        t_start    = time.perf_counter()
-        outputs    = self.engine.generate(
-            prompts=formatted,
-            sampling_params=self._sampling_params,
-        )
-        elapsed_ms = (time.perf_counter() - t_start) * 1000
-
-        results = []
-        for req, out in zip(requests, outputs):
-            # SGLang output format: {"text": str, "meta_info": {...}}
-            text         = out.get("text", "") if isinstance(out, dict) else str(out)
-            output_tokens = len(self.tokenizer.encode(text)) if text else 0
-            input_tokens  = req.input_tokens or 0
-            results.append(InferenceResult(
-                first_token_time_ms=None,
-                total_time_ms=elapsed_ms,
-                output_tokens=output_tokens,
-                input_tokens=input_tokens,
-                success=True,
-                output_text=text,
-            ))
-        return results
-
-    async def inference_fn_streaming(
-        self, request: InferenceRequest
-    ) -> InferenceResult:
-        """
-        Async streaming inference via sglang.AsyncEngine for TTFT measurement.
-
-        SGLang's async generator yields cumulative text (same as vLLM).
-        We track prev_length and slice off deltas to count output tokens.
-        first_token_time_ms is set on the first non-empty yield.
-        """
-        formatted           = self.format_prompt(request.prompt)
-        t_start             = time.perf_counter()
-        first_token_time_ms = None
-        output_text         = ""
-        prev_length         = 0
-
-        async for chunk in self.async_engine.async_generate(
-            prompt=formatted,
-            sampling_params=self._sampling_params,
-        ):
-            # chunk is {"text": cumulative_text, "meta_info": {...}, "finished": bool}
-            current_text = chunk.get("text", "") if isinstance(chunk, dict) else str(chunk)
-            delta        = current_text[prev_length:]
-
-            if delta and first_token_time_ms is None:
-                first_token_time_ms = (time.perf_counter() - t_start) * 1000
-
-            output_text  = current_text
-            prev_length  = len(current_text)
-
-        total_time_ms  = (time.perf_counter() - t_start) * 1000
-        output_tokens  = len(self.tokenizer.encode(output_text)) if output_text else 0
-
-        return InferenceResult(
-            first_token_time_ms=first_token_time_ms,
-            total_time_ms=total_time_ms,
-            output_tokens=output_tokens,
-            input_tokens=0,
-            success=True,
-            output_text=output_text,
-        )
-
-    async def inference_fn_token_stream(self, request: InferenceRequest):
-        """
-        Async generator yielding decoded text deltas for serve-layer SSE streaming.
-
-        SGLang yields cumulative output — we slice off only the new delta each
-        step so the serve layer receives incremental chunks, not repeated text.
-        """
-        formatted   = self.format_prompt(request.prompt)
-        prev_length = 0
-
-        async for chunk in self.async_engine.async_generate(
-            prompt=formatted,
-            sampling_params=self._sampling_params,
-        ):
-            current_text = chunk.get("text", "") if isinstance(chunk, dict) else str(chunk)
-            delta        = current_text[prev_length:]
-            if delta:
-                yield delta
-                prev_length = len(current_text)
-
-    def get_peak_memory_gb(self) -> Optional[float]:
-        try:
-            return torch.cuda.max_memory_allocated() / (1024 ** 3)
-        except Exception:
-            return None
-
-    def release_resources(self) -> None:
-        """Shut down SGLang engines and release GPU memory."""
-        if self.engine is not None:
-            try:
-                self.engine.shutdown()
-            except Exception:
-                pass
-            try:
-                del self.engine
-            except Exception:
-                pass
-            self.engine = None
-
-        if self.async_engine is not None:
-            try:
-                if self._loop and not self._loop.is_closed():
-                    self._loop.run_until_complete(self.async_engine.shutdown())
-            except Exception:
-                pass
-            try:
-                del self.async_engine
-            except Exception:
-                pass
-            self.async_engine = None
-
-        # SGLang uses torch.distributed internally for tensor parallelism.
-        # Destroy process group so the next engine init creates a fresh one.
-        try:
-            if torch.distributed.is_initialized():
-                torch.distributed.destroy_process_group()
-        except Exception:
-            pass
-
-        import gc
-        gc.collect()
-        try:
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
-        except Exception:
-            pass
-
-    def parse_args(self):
-        """Add SGLang/NVIDIA-specific CLI flags. Base class pre-loads runner config."""
-        args = super().parse_args()
-        cfg = self._runner_config
-
-        import argparse
-        parser = argparse.ArgumentParser(add_help=False)
-        parser.add_argument("--tensor-parallel-size", type=int, default=None,
-                            dest="tensor_parallel_size")
-        parser.add_argument("--expert-parallel-size", type=int, default=None,
-                            dest="expert_parallel_size")
-        parser.add_argument("--disable-cuda-graph", action="store_true", default=False,
-                            dest="disable_cuda_graph")
-        extra, _ = parser.parse_known_args()
-
-        # Priority: CLI flag > yaml config > required_chips > auto-detected > default 1
-        # Fully resolved by base class.
-        tp_size, _tp_source = self._resolve_tensor_parallel_size(
-            extra.tensor_parallel_size
-        )
-        ep_size = (extra.expert_parallel_size
-                   if extra.expert_parallel_size is not None
-                   else cfg.get("expert_parallel_size", 1))
-
-        self._enforce_eager       = extra.disable_cuda_graph or cfg.get("disable_cuda_graph", False)
-        self._mem_fraction_static = cfg.get("mem_fraction_static", 0.88)
-
-        print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
-        if ep_size > 1:
-            print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
-
-        self._parallelism = {
-            "tensor_parallel_size":   tp_size,
-            "pipeline_parallel_size": 1,
-            "expert_parallel_size":   ep_size,
-            "data_parallel_size":     1,
-        }
-        self._chip_count = tp_size
-        return args
-
-    def get_extra_subprocess_args(self, args) -> list[str]:
-        """Forward SGLang/NVIDIA-specific flags to subprocess invocations."""
-        extra = [
-            "--tensor-parallel-size",
-            str(self._parallelism.get("tensor_parallel_size", 1)),
-        ]
-        if self._parallelism.get("expert_parallel_size", 1) > 1:
-            extra += ["--expert-parallel-size",
-                      str(self._parallelism["expert_parallel_size"])]
-        if self._enforce_eager:
-            extra += ["--disable-cuda-graph"]
-        return extra
-
-
-if __name__ == "__main__":
-    SGLangRunner().main()
diff --git a/runners/nvidia_sglang_c43a8309/meta.json b/runners/nvidia_sglang_c43a8309/meta.json
index eb80df61..24e0b98d 100644
--- a/runners/nvidia_sglang_c43a8309/meta.json
+++ b/runners/nvidia_sglang_c43a8309/meta.json
@@ -7,5 +7,15 @@
   "description": "AccelMark runner for NVIDIA GPUs using SGLang. Supports all suites (A–G). Enables direct throughput and latency comparison between SGLang and vLLM on the same hardware.",
   "supersedes_chain": [],
   "notes": "Decouple runners from suite and scenario knowledge — load_model() uses use_async from parallelism dict instead of checking scenario name.",
-  "created": "2026-04-03"
+  "created": "2026-04-03",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "validated",
+    "C": "validated",
+    "D": "validated",
+    "E": "validated",
+    "F": "validated",
+    "G": "validated"
+  }
 }
diff --git a/runners/nvidia_vllm_47f5d58e/meta.json b/runners/nvidia_vllm_47f5d58e/meta.json
index ada75f63..bb94db9d 100644
--- a/runners/nvidia_vllm_47f5d58e/meta.json
+++ b/runners/nvidia_vllm_47f5d58e/meta.json
@@ -7,5 +7,15 @@
   "description": "Official AccelMark reference runner for NVIDIA GPUs using vLLM. Supports all suites (A–F). Standard vLLM configuration, no custom patches.",
   "supersedes_chain": [],
   "notes": "Decouple runners from suite and scenario knowledge — load_model() uses use_async from parallelism dict instead of checking scenario name.",
-  "created": "2026-04-03"
+  "created": "2026-04-03",
+  "hardware_label": null,
+  "suite_support": {
+    "A": "validated",
+    "B": "validated",
+    "C": "validated",
+    "D": "validated",
+    "E": "validated",
+    "F": "validated",
+    "G": "validated"
+  }
 }
diff --git a/runners/platforms/__init__.py b/runners/platforms/__init__.py
new file mode 100644
index 00000000..472cbba1
--- /dev/null
+++ b/runners/platforms/__init__.py
@@ -0,0 +1,67 @@
+"""
+Platform plug-ins for the AccelMark environment collector.
+
+Each file in this package (other than the ones starting with ``_``) is a
+self-contained detector for one accelerator family. ``collect_env.py``
+imports them all at runtime and asks them, in priority order, whether
+their accelerator is present on the host. The first plug-in that returns
+a non-empty ``collect()`` result owns the environment report.
+
+To add support for a new platform you only need to drop a single file
+``runners/platforms/<my_platform>.py`` exporting one or more of the
+following module-level attributes:
+
+    ID:               str    — short identifier, lowercase, matches the
+                              ``platform`` field used in meta.json
+    DISPLAY_NAME:     str    — human-readable label used in warnings
+    PRIORITY:         int    — lower numbers are tried first; default 50
+    VENDOR_LABEL:     str    — string written into accelerator["vendor"]
+                              by ``collect()`` (informational)
+
+    def collect() -> list[dict]: ...
+    def detect_runtime_version() -> str | None: ...
+    def detect_pcie_gen() -> str | None: ...
+    def detect_topology() -> str | None: ...
+    def detect_intra_node_interconnect() -> str | None: ...
+    def diagnostics(env, accelerators) -> list[str]: ...
+
+All functions are optional. The collector skips any that are missing
+and silently swallows any plug-in import errors so a broken third-party
+plug-in cannot block detection of the platforms that ship with the
+repository.
+"""
+from __future__ import annotations
+
+import importlib
+import pkgutil
+from types import ModuleType
+from typing import List
+
+
+def _plugin_sort_key(mod: ModuleType) -> tuple[int, str]:  # sort key: (PRIORITY, ID)
+    return (int(getattr(mod, "PRIORITY", 50)), str(getattr(mod, "ID", mod.__name__)))  # defaults: 50, module name
+
+
+def discover_plugins() -> List[ModuleType]:
+    """Import every ``runners.platforms.<name>`` module and return them
+    sorted by ``PRIORITY`` (lower first, ties broken by ``ID``). Modules
+    whose name begins with an underscore are internal helpers and skipped.
+
+    Plug-ins that fail to import, or whose ``PRIORITY`` cannot be coerced
+    to ``int``, are silently ignored — detection of other platforms must
+    not be blocked by one broken plug-in.
+    """
+    plugins: list[ModuleType] = []
+    for info in pkgutil.iter_modules(__path__):
+        if info.name.startswith("_"):
+            continue
+        try:
+            mod = importlib.import_module(f"{__name__}.{info.name}")
+            _plugin_sort_key(mod)  # a malformed PRIORITY must not break the sort below
+        except Exception:
+            continue
+        plugins.append(mod)
+    return sorted(plugins, key=_plugin_sort_key)
+
+
+__all__ = ["discover_plugins"]
diff --git a/runners/platforms/amd.py b/runners/platforms/amd.py
new file mode 100644
index 00000000..9e1ff731
--- /dev/null
+++ b/runners/platforms/amd.py
@@ -0,0 +1,143 @@
+"""AMD GPU (ROCm) platform plug-in."""
+from __future__ import annotations
+
+import json
+import re
+import subprocess
+
+ID = "amd"
+DISPLAY_NAME = "AMD"
+VENDOR_LABEL = "AMD"
+PRIORITY = 20
+
+# Architectures that natively support BF16.
+_BF16_SUPPORTED = {
+    "cdna2", "cdna3",                          # MI200, MI300 series
+    "rdna3", "rdna4",                          # RX 7000+ series
+    "gfx90a",                                  # MI250X arch code
+    "gfx940", "gfx941", "gfx942",             # MI300 arch codes
+    "gfx1100", "gfx1101", "gfx1102",          # RDNA3 arch codes
+}
+
+# Architectures known to lack hardware BF16.
+_NO_BF16 = {
+    "cdna1",                                   # MI100
+    "rdna", "rdna1", "rdna2",                 # RX 5000, RX 6000 series
+    "gfx908",                                  # MI100 arch code
+    "gfx1030", "gfx1031",                     # RDNA2 arch codes
+}
+
+
+def _supports_bf16(arch_str: str) -> bool:
+    """Classify BF16 support from an arch string; unknown/empty counts as True."""
+    arch = (arch_str or "").lower()
+    # The allow-list is consulted first, so "rdna3" wins over the "rdna" deny entry.
+    if not arch or any(tag in arch for tag in _BF16_SUPPORTED):
+        return True
+    # Deny only on a known-bad match; anything unrecognised is assumed capable.
+    denied = any(tag in arch for tag in _NO_BF16)
+    return not denied
+
+
+def collect() -> list[dict]:
+    """Enumerate AMD GPUs via ``rocm-smi --json``.
+
+    Returns ``[]`` when rocm-smi is missing, fails, or prints non-object JSON.
+    """
+    try:
+        out = subprocess.check_output(
+            [
+                "rocm-smi",
+                "--showproductname",
+                "--showmeminfo", "vram",
+                "--showdriverversion",
+                "--json",
+            ],
+            text=True,
+            stderr=subprocess.DEVNULL,
+        )
+        data = json.loads(out)
+    except Exception:
+        return []
+    if not isinstance(data, dict):
+        return []
+
+    arch_str = ""
+    try:
+        arch_out = subprocess.check_output(
+            ["rocm-smi", "--showallinfo"], text=True, stderr=subprocess.DEVNULL
+        )
+        gfx_matches = re.findall(r"gfx\d+[a-z]?", arch_out.lower())
+        arch_str = gfx_matches[0] if gfx_matches else ""
+    except Exception:
+        pass
+
+    # rocm-smi puts host-wide values (e.g. the driver version) under a top-level
+    # "system" key beside the per-card entries: a driver fallback, not a device.
+    system = data["system"] if isinstance(data.get("system"), dict) else {}
+    cards = [v for k, v in data.items() if k != "system" and isinstance(v, dict)]
+    driver_keys = ("Driver version", "driver_version", "Driver Version")
+
+    accelerators: list[dict] = []
+    for idx, info in enumerate(cards):
+        raw_mem = (
+            info.get("VRAM Total Memory (B)")
+            or info.get("vram_total_memory_b")
+            or info.get("VRAM Total Memory")
+        )
+        try:
+            mem_bytes = int(raw_mem or 0)
+        except (TypeError, ValueError):
+            mem_bytes = 0  # unparsable size: keep the card, report 0.0 GB
+        accelerators.append(
+            {
+                "index": idx,
+                "name": info.get("Card Series") or info.get("Card series")
+                or info.get("Product Name") or info.get("product_name") or "AMD GPU",
+                "vendor": VENDOR_LABEL,
+                "memory_gb": round(mem_bytes / (1024 ** 3), 1),
+                "driver_version": next(
+                    (s[k] for s in (info, system) for k in driver_keys if s.get(k)),
+                    "unknown",
+                ),
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(arch_str),
+            }
+        )
+    return accelerators
+
+
+def detect_runtime_version() -> str | None:
+    """Return ``"ROCm <last line of rocm-smi --version>"``, or None on any failure."""
+    cmd = ["rocm-smi", "--version"]
+    try:
+        text_out = subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)
+        return "ROCm " + text_out.strip().splitlines()[-1]
+    except Exception:  # tool missing, non-zero exit, or empty output (IndexError)
+        return None
+
+
+def detect_topology() -> str | None:  # raw `rocm-smi --showtopo` text, or None
+    try:
+        return subprocess.check_output(["rocm-smi", "--showtopo"], text=True)  # stderr is not redirected here
+    except Exception:  # rocm-smi could not be started or exited non-zero
+        return None
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Return human-readable hints for fields the collector left unresolved."""
+    pytorch_unknown = (env.get("pytorch_version") or "") == "unknown"
+    runtime_unknown = (env.get("runtime_version") or "") == "unknown"
+    topology_missing = env.get("accelerator_topology") is None and bool(accelerators)
+    candidates = [
+        (pytorch_unknown,
+         "PyTorch is not installed — pytorch_version is unknown. For GPU stack "
+         "metadata: pip install torch (match your ROCm environment)."),
+        (runtime_unknown,
+         "Could not detect ROCm runtime (rocm-smi / PyTorch ROCm). "
+         "runtime_version is unknown."),
+        (topology_missing,
+         "accelerator_topology is null — rocm-smi --showtopo did not return data."),
+    ]
+    # Output order is fixed: PyTorch, runtime, topology.
+    return [message for applies, message in candidates if applies]
diff --git a/runners/platforms/apple.py b/runners/platforms/apple.py
new file mode 100644
index 00000000..6f542d94
--- /dev/null
+++ b/runners/platforms/apple.py
@@ -0,0 +1,165 @@
+"""Apple Silicon (M-series SoC) platform plug-in."""
+from __future__ import annotations
+
+import json
+import platform
+import subprocess
+
+ID = "apple"
+DISPLAY_NAME = "Apple Silicon"
+VENDOR_LABEL = "Apple"
+PRIORITY = 40
+
+
+def _silicon_brand() -> str | None:
+    """Return the CPU brand string if it mentions 'Apple' (e.g. 'Apple M3 Pro'), else None."""
+    query = ["sysctl", "-n", "machdep.cpu.brand_string"]
+    try:
+        brand = subprocess.check_output(query, text=True, stderr=subprocess.DEVNULL)
+    except Exception:
+        # sysctl could not be run or the key could not be read.
+        return None
+    brand = brand.strip()
+    # Any brand string without "Apple" is treated as not Apple Silicon.
+    return brand if "Apple" in brand else None
+
+
+def _supports_bf16(chip_name: str) -> bool:
+    """M1 has limited/slow BF16. M2+ has full hardware BF16.
+
+    The generation is matched as a whole word, so "Apple M1 Pro" is M1 but
+    names such as "Apple M10" or "Apple M11" are not. Empty names → True.
+    """
+    tokens = (chip_name or "").lower().split()
+    return "m1" not in tokens
+
+
+def _macos_build_string() -> str:
+    """Describe the OS as 'macOS <version> (build <build>)' using ``sw_vers``.
+
+    If ``sw_vers`` cannot be run, falls back to ``platform.mac_ver()``.
+    """
+    def _sw_vers(flag: str) -> str:
+        raw = subprocess.check_output(["sw_vers", flag], text=True, stderr=subprocess.DEVNULL)
+        return raw.strip()
+    try:
+        return f"macOS {_sw_vers('-productVersion')} (build {_sw_vers('-buildVersion')})"
+    except Exception:
+        release = platform.mac_ver()[0]
+        return f"macOS {release}" if release else "macOS"
+
+
+def _metal_summary() -> str | None:
+    """Find the first Metal-related entry reported by ``system_profiler``.
+
+    Scans the ``SPDisplaysDataType`` JSON for a key containing "metal" or
+    "mtl" with a truthy value. Any failure (including the 30 s timeout)
+    yields None.
+    """
+    cmd = ["system_profiler", "SPDisplaysDataType", "-json"]
+    try:
+        done = subprocess.run(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, timeout=30
+        )
+        if done.returncode != 0 or not done.stdout:
+            return None
+        entries = json.loads(done.stdout).get("SPDisplaysDataType") or []
+        hits = (
+            f"GPU runtime ({label}): {value}"
+            for entry in entries if isinstance(entry, dict)
+            for label, value in entry.items()
+            if ("metal" in label.lower() or "mtl" in label.lower()) and value
+        )
+        return next(hits, None)
+    except Exception:
+        return None
+
+
+def collect() -> list[dict]:
+    """Report the SoC as a single accelerator; ``[]`` when not on Apple Silicon.
+
+    ``memory_gb`` is taken from ``hw.memsize``; an unreadable value also
+    yields ``[]``.
+    """
+    chip = _silicon_brand()
+    if not chip:
+        return []
+    try:
+        raw = subprocess.check_output(
+            ["sysctl", "-n", "hw.memsize"], text=True, stderr=subprocess.DEVNULL
+        )
+        mem_bytes = int(raw.strip())
+    except Exception:
+        return []
+    # Single-entry list: one record describes the whole SoC.
+    record = {"index": 0, "name": chip, "vendor": VENDOR_LABEL}
+    record["memory_gb"] = round(mem_bytes / (1024 ** 3), 1)
+    record["driver_version"] = _macos_build_string()
+    record["firmware_version"] = None
+    record["compute_capability"] = None
+    record["supports_bf16"] = _supports_bf16(chip)
+    return [record]
+
+
+def detect_runtime_version() -> str | None:
+    if not _silicon_brand():
+        return None  # not Apple Silicon: this plug-in reports no runtime
+    try:
+        import mlx
+        # First choice: the installed MLX package and its version string.
+        ver = getattr(mlx, "__version__", None)
+        if ver is None:
+            try:
+                from importlib.metadata import version as _pkg_version
+                # No __version__ attribute: ask the package metadata instead.
+                ver = _pkg_version("mlx")
+            except Exception:
+                ver = None
+        return f"MLX {ver}" if ver else "MLX"  # bare "MLX" when no version is found
+    except ImportError:
+        pass
+    try:
+        import torch
+        # Second choice: PyTorch, only if its MPS backend reports available.
+        if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+            return f"Metal MPS (PyTorch {torch.__version__})"
+    except ImportError:
+        pass
+    return _macos_build_string()  # last resort: the macOS version/build string
+
+
+def detect_pcie_gen() -> str | None:
+    """Fixed descriptive string on Apple Silicon; None on any other host."""
+    on_apple = bool(_silicon_brand())
+    return "SoC integrated (no discrete PCIe GPU)" if on_apple else None
+
+
+def detect_topology() -> str | None:
+    """Describe the SoC GPU as newline-joined text, or None off Apple Silicon.
+
+    A Metal line from ``_metal_summary()`` is appended when one is available.
+    """
+    brand = _silicon_brand()
+    if not brand:
+        return None
+    parts = [f"Apple Silicon — integrated GPU in {brand} (unified system memory)."]
+    parts.append(f"Machine: {platform.machine()}")
+    metal = _metal_summary()
+    return "\n".join(parts + [metal] if metal else parts)
+
+
+def detect_intra_node_interconnect() -> str | None:
+    """Fixed descriptive string on Apple Silicon; None on any other host."""
+    on_apple = bool(_silicon_brand())
+    return "SoC unified memory (no GPU-GPU interconnect)" if on_apple else None
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Hint at installing MLX or PyTorch when only the macOS build was recorded."""
+    runtime = env.get("runtime_version") or ""
+    if not runtime.startswith("macOS ") or "MLX" in runtime or "Metal MPS" in runtime:
+        return []
+    return [
+        "Neither MLX nor PyTorch with MPS is available — runtime_version only reflects "
+        "macOS build. For ML stack: pip install mlx  or  pip install torch (with MPS)."
+    ]
diff --git a/runners/platforms/ascend.py b/runners/platforms/ascend.py
new file mode 100644
index 00000000..430d4939
--- /dev/null
+++ b/runners/platforms/ascend.py
@@ -0,0 +1,256 @@
+"""Huawei Ascend NPU platform plug-in."""
+from __future__ import annotations
+
+import re
+import subprocess
+from pathlib import Path
+
+ID = "ascend"
+DISPLAY_NAME = "Huawei Ascend"
+VENDOR_LABEL = "Huawei"
+PRIORITY = 30
+
+_BF16_SUPPORTED = {"910b", "atlas 800t a2", "910b1", "910b2", "910b3", "910b4"}
+_NO_BF16 = {"310", "310p", "atlas 300"}
+
+
+def _supports_bf16(chip_name: str) -> bool:
+    """BF16 verdict from the chip name: allow-list, then deny-list, else True."""
+    if not chip_name:
+        return True
+    lowered = chip_name.lower()
+    for verdict, needles in ((True, _BF16_SUPPORTED), (False, _NO_BF16)):
+        if any(needle in lowered for needle in needles):
+            return verdict
+    return True
+
+
+def _enrich_via_torch_npu(accelerators: list[dict]) -> None:
+    """Backfill memory_gb and name via torch_npu runtime API.
+
+    torch_npu.npu.get_device_properties(i) mirrors torch.cuda:
+        .total_memory  — total HBM bytes
+        .name          — chip name string (e.g. "910B2")
+    Logical indices 0..N-1 map positionally to npu-smi enumeration order
+    when all devices are visible. Only fills fields still unset so parsed
+    values are never overwritten; a backfilled name refreshes supports_bf16.
+    """
+    try:
+        import torch_npu
+        logical_count = torch_npu.npu.device_count()
+    except Exception:
+        return
+
+    for logical_idx in range(min(logical_count, len(accelerators))):
+        rec = accelerators[logical_idx]
+        try:
+            props = torch_npu.npu.get_device_properties(logical_idx)
+            if rec.get("memory_gb") is None and props.total_memory:
+                rec["memory_gb"] = round(props.total_memory / (1024 ** 3), 1)
+            if rec.get("name") in (None, "Huawei Ascend NPU") and props.name:
+                rec["name"] = f"Huawei Ascend {props.name.strip()}"
+                rec["supports_bf16"] = _supports_bf16(rec["name"])
+        except Exception:
+            continue
+
+
+def _parse_npu_smi_table(out: str, cann_version: str) -> list[dict]:
+    """Parse the tabular output of plain ``npu-smi info``.
+
+    The table format has two data rows per device:
+        Row 1: | <NPU_ID>  <ChipName>  | <Health> | <Power> <Temp> <Hugepages> |
+        Row 2: | <ChipID>              | <Bus-Id> | <AICore> <Mem-Usage>  <HBM-Usage(used/total MB)> |
+
+    Example:
+        | 7     910B2               | OK            | 96.5        49                0    / 0             |
+        | 0                         | 0000:42:00.0  | 0           0    / 0          3389 / 65536         |
+
+    Parsing stops at the per-process table header ("Process id"): its rows
+    ("| <NPU> <Chip> | <pid> | ...") have the same leading shape as Row 1 and
+    would otherwise be reported as extra devices.
+
+    ``cann_version`` is stored verbatim as each record's ``driver_version``.
+    """
+    accelerators: list[dict] = []
+    rows = iter(out.splitlines())
+    for line in rows:
+        if "process id" in line.lower():
+            # Everything below is the per-process table, not devices.
+            break
+        row1 = re.match(r"\|\s*(\d+)\s+(\S+)\s*\|", line)
+        if not row1:
+            continue
+        chip_name = row1.group(2).strip()
+        # Row 2 is consumed even when it carries no HBM figure.
+        row2 = next(rows, "")
+        hbm_match = re.search(r"(\d+)\s*/\s*(\d+)\s*\|?\s*$", row2)
+        hbm_total_mb = int(hbm_match.group(2)) if hbm_match else None
+        name = f"Huawei Ascend {chip_name}" if chip_name else "Huawei Ascend NPU"
+        accelerators.append(
+            {
+                "index": int(row1.group(1)),
+                "name": name,
+                "vendor": VENDOR_LABEL,
+                "memory_gb": round(hbm_total_mb / 1024, 1) if hbm_total_mb else None,
+                "driver_version": cann_version,
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(name),
+            }
+        )
+
+    return accelerators
+
+
+def _get_board_info(npu_id: str) -> dict:
+    """Query driver and firmware version for a single NPU via ``-t board``.
+
+    Returns dict with keys ``driver_version`` and ``firmware_version``.
+    If no "Software Version" line is found, driver_version falls back to
+    "CANN <Version>" read from a version.cfg under the CANN install paths.
+    """
+    result = {"driver_version": "unknown", "firmware_version": None}
+    # Step 1: scrape the per-board report; the last matching line wins.
+    try:
+        out = subprocess.check_output(
+            ["npu-smi", "info", "-t", "board", "-i", npu_id],
+            text=True,
+            stderr=subprocess.DEVNULL,
+        )
+        for line in out.splitlines():
+            sw_match = re.search(r"Software\s+Version\s*:\s*(.+)", line, re.IGNORECASE)
+            if sw_match:
+                result["driver_version"] = sw_match.group(1).strip()
+            fw_match = re.search(r"Firmware\s+Version\s*:\s*(.+)", line, re.IGNORECASE)
+            if fw_match:
+                fw = fw_match.group(1).strip()
+                result["firmware_version"] = None if fw.upper() == "NA" else fw  # "NA" is stored as None
+    except Exception:
+        pass  # best-effort: fall through to the install-path lookup
+    # Step 2: only when step 1 found no version, read the first usable version.cfg.
+    if result["driver_version"] == "unknown":
+        for cann_path in (
+            "/usr/local/Ascend/ascend-toolkit/latest",
+            "/usr/local/Ascend/nnae/latest",
+        ):
+            version_file = Path(cann_path) / "version.cfg"
+            if version_file.exists():
+                try:
+                    text = version_file.read_text()
+                    m = re.search(r"Version=(.+)", text)
+                    if m:
+                        result["driver_version"] = f"CANN {m.group(1).strip()}"  # note the "CANN " prefix
+                        break
+                except Exception:
+                    pass  # unreadable file: try the next candidate path
+
+    return result
+
+
+def collect() -> list[dict]:
+    try:
+        out = subprocess.check_output(
+            ["npu-smi", "info"], text=True, stderr=subprocess.DEVNULL
+        )
+    except Exception:
+        return []  # npu-smi missing or failed: report no Ascend devices
+    # Primary path: parse the device table; versions are filled in further down.
+    accelerators = _parse_npu_smi_table(out, "unknown")
+    # Fallback path: the table yielded nothing, so parse the `npu-smi info -l` listing.
+    if not accelerators:
+        try:
+            out_l = subprocess.check_output(
+                ["npu-smi", "info", "-l"], text=True, stderr=subprocess.DEVNULL
+            )
+        except Exception:
+            return []
+        current_npu: dict | None = None  # record being filled for the NPU block in progress
+        for line in out_l.splitlines():
+            npu_match = re.search(r"NPU\s+ID\s*:\s*(\d+)", line, re.IGNORECASE)
+            if npu_match:
+                if current_npu:
+                    # A new "NPU ID" line closes the previous record.
+                    current_npu["supports_bf16"] = _supports_bf16(current_npu.get("name", ""))
+                    accelerators.append(current_npu)
+                current_npu = {
+                    "index": int(npu_match.group(1)),
+                    "name": "Huawei Ascend NPU",
+                    "vendor": VENDOR_LABEL,
+                    "memory_gb": None,
+                    "driver_version": "unknown",
+                    "firmware_version": None,
+                }
+            if current_npu is None:
+                continue  # ignore lines that precede the first "NPU ID"
+            chip_match = re.search(r"Chip\s+Name\s*:\s*(.+)", line, re.IGNORECASE)
+            if chip_match:
+                current_npu["name"] = f"Huawei Ascend {chip_match.group(1).strip()}"
+            mem_match = re.search(r"HBM\s+Capacity.*?:\s*(\d+)", line, re.IGNORECASE)
+            if mem_match:
+                current_npu["memory_gb"] = round(int(mem_match.group(1)) / 1024, 1)  # value is divided by 1024 (treated as MB)
+            if current_npu.get("memory_gb") is None:
+                mem_match2 = re.search(
+                    r"Memory\s+Capacity.*?:\s*(\d+)\s*MB", line, re.IGNORECASE
+                )
+                if mem_match2:
+                    current_npu["memory_gb"] = round(int(mem_match2.group(1)) / 1024, 1)
+            if current_npu.get("firmware_version") is None:
+                fw_match = re.search(r"Firmware\s+Version\s*:\s*(.+)", line, re.IGNORECASE)
+                if fw_match:
+                    current_npu["firmware_version"] = fw_match.group(1).strip()
+        if current_npu:
+            # Flush the final record once the listing ends.
+            current_npu["supports_bf16"] = _supports_bf16(current_npu.get("name", ""))
+            accelerators.append(current_npu)
+    # The per-device board query overwrites driver/firmware from either path above.
+    if accelerators:
+        for rec in accelerators:
+            board = _get_board_info(str(rec["index"]))
+            rec["driver_version"] = board["driver_version"]
+            rec["firmware_version"] = board["firmware_version"]
+        _enrich_via_torch_npu(accelerators)  # backfills fields npu-smi left unset
+
+    return accelerators
+
+
+def detect_runtime_version() -> str | None:
+    """Return ``"CANN <version>"`` for the first NPU listed by ``npu-smi info``."""
+    try:
+        info_out = subprocess.check_output(
+            ["npu-smi", "info"], text=True, stderr=subprocess.DEVNULL
+        )
+    except Exception:
+        return None
+    m = re.search(r"\|\s*(\d+)\s+\S+\s*\|", info_out)
+    version = _get_board_info(m.group(1))["driver_version"] if m else "unknown"
+    if version == "unknown":
+        return None
+    # _get_board_info's version.cfg fallback is already "CANN "-prefixed.
+    return version if version.startswith("CANN ") else f"CANN {version}"
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Hints for unknown versions plus at most one note about a missing HBM size."""
+    notes: list[str] = []
+    if (env.get("pytorch_version") or "") == "unknown":
+        notes.append(
+            "PyTorch is not installed — pytorch_version is unknown. For GPU stack "
+            "metadata: pip install torch (with torch_npu)."
+        )
+    if (env.get("runtime_version") or "") == "unknown":
+        notes.append(
+            "Could not detect CANN/runtime from npu-smi / install paths. "
+            "runtime_version is unknown."
+        )
+    if not any(dev.get("memory_gb") is None for dev in accelerators):
+        return notes
+    try:
+        import torch_npu  # noqa: F401
+    except ImportError:
+        notes.append(
+            "Ascend HBM memory could not be parsed from npu-smi — optional "
+            "pip install torch_npu may fill memory_gb via the runtime API."
+        )
+        return notes
+    notes.append(
+        "Ascend HBM memory_gb is still unknown (torch_npu is importable) — "
+        "check ASCEND_VISIBLE_DEVICES, driver, and npu-smi output."
+    )
+    return notes
diff --git a/runners/platforms/google.py b/runners/platforms/google.py
new file mode 100644
index 00000000..afc288fa
--- /dev/null
+++ b/runners/platforms/google.py
@@ -0,0 +1,126 @@
+"""Google Cloud TPU platform plug-in."""
+from __future__ import annotations
+
+import os
+
+ID = "google"
+DISPLAY_NAME = "Google TPU"
+VENDOR_LABEL = "Google"
+PRIORITY = 50
+
+
+def _chip_name_and_memory(tpu_type: str | None) -> tuple[str, float | None]:
+    """Translate a TPU accelerator-type string into (chip name, HBM GiB per chip).
+
+    Rules are tried top to bottom and the first substring hit wins:
+        v5litepod / v5e  → v5e,           16 GiB
+        v6e / trillium   → v6e Trillium,  32 GiB
+        v7x / ironwood   → v7x Ironwood, 192 GiB
+        v5p              → v5p,           95 GiB
+        v4               → v4,            32 GiB
+        v3               → v3,            16 GiB
+        v2               → v2,             8 GiB
+    A missing type gives ("Google TPU", None); an unrecognised one gives
+    ("Google TPU (<type>)", None).
+    """
+    if not tpu_type:
+        return "Google TPU", None
+
+    # Each rule: (substrings to look for, chip name, HBM GiB per chip).
+    rules = (
+        (("v5litepod", "v5e"), "Google TPU v5e", 16.0),
+        (("v6e", "trillium"), "Google TPU v6e", 32.0),
+        (("v7x", "ironwood"), "Google TPU v7x", 192.0),
+        (("v5p",), "Google TPU v5p", 95.0),
+        (("v4",), "Google TPU v4", 32.0),
+        (("v3",), "Google TPU v3", 16.0),
+        (("v2",), "Google TPU v2", 8.0),
+    )
+    lowered = tpu_type.lower()
+    for needles, label, hbm_gib in rules:
+        if any(needle in lowered for needle in needles):
+            return label, hbm_gib
+    return f"Google TPU ({tpu_type})", None
+
+
+def collect() -> list[dict]:
+    """Enumerate TPU chips via ``tpu_inference.tpu_info``; ``[]`` on any failure."""
+    try:
+        from tpu_inference import tpu_info
+
+        num_chips = tpu_info.get_num_chips()
+        tpu_type = tpu_info.get_tpu_type()
+        node_name = tpu_info.get_node_name()
+        if not num_chips:
+            return []
+
+        chip_name, memory_gb = _chip_name_and_memory(tpu_type)
+
+        # tpu_info.get_num_cores_per_chip() misclassifies the GCE alias
+        # form "v5e-1" (vs the canonical "v5litepod-1"), so disambiguate
+        # via the raw tpu_type string ourselves.
+        t = (tpu_type or "").lower()
+        if "v5litepod" in t or "v5e" in t or "v6e" in t or "trillium" in t:
+            num_cores_per_chip = 1
+        else:
+            num_cores_per_chip = tpu_info.get_num_cores_per_chip()
+
+        jax_version = "unknown"
+        try:
+            import jax
+
+            jax_version = jax.__version__
+            jax_devices = jax.devices()
+            if jax_devices and memory_gb is None:
+                # memory_stats is a method on jax.Device (it may return None): call it first.
+                stats = getattr(jax_devices[0], "memory_stats", None)
+                mem = stats() if callable(stats) else stats
+                if mem and "bytes_limit" in mem:
+                    memory_gb = round(mem["bytes_limit"] / (1024 ** 3), 1)
+        except Exception:
+            pass
+
+        driver = f"JAX {jax_version}" if jax_version != "unknown" else "unknown"
+        return [
+            {
+                "index": i,
+                "name": chip_name,
+                "vendor": VENDOR_LABEL,
+                "memory_gb": memory_gb,
+                "driver_version": driver,
+                "firmware_version": None,
+                "tpu_type": tpu_type,
+                "tpu_node_name": node_name,
+                "num_cores_per_chip": num_cores_per_chip,
+                "supports_bf16": True,
+            }
+            for i in range(num_chips)
+        ]
+    except Exception:
+        return []
+
+
+def detect_runtime_version() -> str | None:
+    """Return ``"JAX <version>"`` when jax is importable, otherwise None."""
+    try:
+        import jax
+    except ImportError:
+        return None
+    return f"JAX {jax.__version__}"
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Explain a missing TPU (despite TPU env vars) or an unknown runtime."""
+    if not accelerators:
+        if not (os.environ.get("TPU_NAME") or os.environ.get("CLOUD_TPU_TASK")):
+            return []
+        return [
+            "TPU-related environment variables are set but no TPU devices were detected — "
+            "install jax / tpu_inference when running on Cloud TPU."
+        ]
+    if (env.get("runtime_version") or "") != "unknown":
+        return []
+    return [
+        "Could not detect JAX/runtime for TPU. runtime_version is unknown — "
+        "install jax if you use Cloud TPU."
+    ]
diff --git a/runners/platforms/moorethreads.py b/runners/platforms/moorethreads.py
new file mode 100644
index 00000000..708db1b5
--- /dev/null
+++ b/runners/platforms/moorethreads.py
@@ -0,0 +1,190 @@
+"""Moore Threads MUSA GPU platform plug-in.
+
+Moore Threads ships its own driver and management tooling:
+
+* ``mthreads-gmi`` — the moral equivalent of ``nvidia-smi`` / ``rocm-smi``.
+* ``pymtml`` — Python bindings analogous to NVML / pynvml.
+* ``torchada`` — a CUDA→MUSA compatibility shim that exposes the standard
+  ``torch.cuda`` API, with the real backend version available via
+  ``torch.version.musa``.
+
+This plug-in first tries the Python bindings (best machine-readable
+output) and falls back to scraping ``mthreads-gmi`` text output. Both
+paths are best-effort: when none of the tools are installed the plug-in
+silently reports zero accelerators and the collector moves on.
+"""
+from __future__ import annotations
+
+import re
+import subprocess
+
+ID = "moorethreads"
+DISPLAY_NAME = "Moore Threads"
+VENDOR_LABEL = "Moore Threads"
+PRIORITY = 60
+
+# S5000 / S4000 datacenter SKUs ship with native BF16 support (S3000 is treated the same);
+# the older consumer-class MTT S80/S70 cards (likewise S60/S50) are FP16-only.
+_BF16_SUPPORTED_HINTS = ("s5000", "s4000", "s3000")
+_NO_BF16_HINTS = ("s80", "s70", "s60", "s50")
+
+
+def _supports_bf16(chip_name: str) -> bool:
+    """Guess BF16 capability from the product name (unknown names → True)."""
+    if not chip_name:
+        return True
+    lowered = chip_name.lower()
+    known_good = any(hint in lowered for hint in _BF16_SUPPORTED_HINTS)
+    known_bad = any(hint in lowered for hint in _NO_BF16_HINTS)
+    # "s5000" also contains the deny hint "s50", so the allow-list wins.
+    return known_good or not known_bad
+
+
+def _collect_via_pymtml() -> list[dict]:
+    try:
+        import pymtml as mtml  # type: ignore[import-not-found]
+    except ImportError:
+        return []  # bindings not installed: the caller falls back to mthreads-gmi
+    # The library has to be initialised before any device query.
+    try:
+        mtml.mtmlInit()
+    except Exception:
+        return []
+
+    accelerators: list[dict] = []
+    try:
+        count = mtml.mtmlDeviceGetCount()
+    except Exception:
+        try:
+            mtml.mtmlShutdown()  # init succeeded, so shut down before bailing out
+        except Exception:
+            pass
+        return []
+    # A device whose queries fail is skipped; the remaining devices are still reported.
+    for idx in range(int(count)):
+        try:
+            handle = mtml.mtmlDeviceGetHandleByIndex(idx)
+            name = mtml.mtmlDeviceGetName(handle)
+            mem = mtml.mtmlDeviceGetMemoryInfo(handle)
+            total_mb = getattr(mem, "total", None) or mem.get("total", 0)  # NOTE(review): treated as MiB below — confirm the unit pymtml returns
+            driver = mtml.mtmlSystemGetDriverVersion()
+        except Exception:
+            continue
+        accelerators.append(
+            {
+                "index": idx,
+                "name": name if isinstance(name, str) else name.decode("utf-8", "ignore"),  # accepts str or bytes
+                "vendor": VENDOR_LABEL,
+                "memory_gb": round(int(total_mb) / 1024, 1) if total_mb else None,
+                "driver_version": driver if isinstance(driver, str) else driver.decode("utf-8", "ignore"),
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(str(name)),
+            }
+        )
+    # Always release the library, ignoring shutdown errors.
+    try:
+        mtml.mtmlShutdown()
+    except Exception:
+        pass
+
+    return accelerators
+
+
+def _collect_via_smi() -> list[dict]:
+    """Fallback parser for ``mthreads-gmi`` text output.
+
+    The output format mirrors nvidia-smi: a header with the driver / MUSA
+    versions followed by per-device blocks listing the product name and
+    memory usage. We only need the device name and total memory.
+    """
+    try:
+        out = subprocess.check_output(
+            ["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL
+        )
+    except Exception:
+        return []
+
+    driver = "unknown"
+    m = re.search(r"Driver\s+Version\s*:\s*(\S+)", out, re.IGNORECASE)
+    if m:
+        driver = m.group(1)
+
+    accelerators: list[dict] = []
+    # Per-device rows look like:
+    #   |   0  MTT S4000                  ...     | 0000:65:00.0  Off |   ... |
+    # followed by:
+    #   |   0%   45C    P0    ... /   ... |    234MiB / 49152MiB |    ... |
+    # Name tokens never include "|", and an optional second token must
+    # follow a single space, so the column padding and the cell border
+    # stay out of the captured name.
+    for match in re.finditer(
+        r"\|\s*(\d+)\s+(MTT\s+[^\s|]+(?: [^\s|]+)?)", out
+    ):
+        idx = int(match.group(1))
+        name = " ".join(match.group(2).split())
+        # Search downstream of this match for the memory line
+        mem_match = re.search(r"(\d+)MiB\s*/\s*(\d+)MiB", out[match.end():])
+        memory_gb = round(int(mem_match.group(2)) / 1024, 1) if mem_match else None
+        accelerators.append(
+            {
+                "index": idx,
+                "name": name,
+                "vendor": VENDOR_LABEL,
+                "memory_gb": memory_gb,
+                "driver_version": driver,
+                "firmware_version": None,
+                "supports_bf16": _supports_bf16(name),
+            }
+        )
+    return accelerators
+
+
+def collect() -> list[dict]:
+    """Prefer the pymtml bindings; fall back to scraping ``mthreads-gmi``."""
+    via_bindings = _collect_via_pymtml()
+    # An empty list means the bindings were unusable or listed no device.
+    return via_bindings if via_bindings else _collect_via_smi()
+
+
+def detect_runtime_version() -> str | None:
+    """Report the MUSA runtime version, or None if it cannot be determined.
+
+    Tries ``torch.version.musa`` first, then the ``mthreads-gmi`` header.
+    """
+    try:
+        import torch
+        musa = getattr(torch.version, "musa", None)
+        if musa:
+            return f"MUSA {musa}"
+    except ImportError:
+        pass
+
+    patterns = (
+        (r"MUSA\s+Version\s*:\s*(\S+)", "MUSA"),
+        (r"Driver\s+Version\s*:\s*(\S+)", "Moore Threads Driver"),
+    )
+    try:
+        text_out = subprocess.check_output(["mthreads-gmi"], text=True, stderr=subprocess.DEVNULL)
+        for pattern, prefix in patterns:
+            hit = re.search(pattern, text_out, re.IGNORECASE)
+            if hit:
+                return f"{prefix} {hit.group(1)}"
+    except Exception:
+        pass
+    return None
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Hints about unknown PyTorch / runtime versions.
+
+    Always empty when no accelerator was detected."""
+    if not accelerators:
+        return []
+    texts = {
+        "pytorch_version": "PyTorch (with the torchada MUSA shim) is not installed — "
+        "pytorch_version is unknown.",
+        "runtime_version": "Could not detect MUSA runtime (tried torch.version.musa and "
+        "mthreads-gmi). runtime_version is unknown — install torchada "
+        "or the Moore Threads MUSA toolkit.",
+    }
+    return [msg for key, msg in texts.items() if (env.get(key) or "") == "unknown"]
diff --git a/runners/platforms/nvidia.py b/runners/platforms/nvidia.py
new file mode 100644
index 00000000..f7fcca2a
--- /dev/null
+++ b/runners/platforms/nvidia.py
@@ -0,0 +1,163 @@
+"""NVIDIA GPU platform plug-in."""
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+from pathlib import Path
+
+ID = "nvidia"  # same id as the "nvidia" entry in schema/platforms.json
+DISPLAY_NAME = "NVIDIA"  # NOTE(review): presumably a human-readable plug-in name — not read anywhere in this file
+VENDOR_LABEL = "NVIDIA"  # stored as "vendor" on every record built by collect()
+PRIORITY = 10  # NOTE(review): presumably orders plug-in probing — confirm against the plug-in loader
+
+
+def collect() -> list[dict]:
+    """Return one record per GPU reported by ``nvidia-smi`` (``[]`` if the query fails).
+
+    Rows that cannot be parsed are skipped rather than aborting the whole collection."""
+    query = "--query-gpu=index,name,memory.total,driver_version,compute_cap"
+    try:
+        out = subprocess.check_output(["nvidia-smi", query, "--format=csv,noheader,nounits"], text=True)
+    except Exception:
+        return []
+
+    accelerators: list[dict] = []
+    for line in out.strip().splitlines():
+        try:
+            # Peel the index off the left and three fields off the right so a comma inside the name is kept.
+            idx, rest = line.split(",", 1)
+            name, mem, driver, compute_cap = (x.strip() for x in rest.rsplit(",", 3))
+            index = int(idx)
+        except ValueError:
+            continue
+        try:
+            memory_gb = round(float(mem) / 1024, 1)
+        except ValueError:  # e.g. "[N/A]"; NOTE(review): confirm consumers accept null here
+            memory_gb = None
+        # bf16 is assumed from compute capability >= 8.0; an unparseable value keeps the default True.
+        try:
+            supports_bf16 = (float(compute_cap) if compute_cap else 0.0) >= 8.0
+        except ValueError:
+            supports_bf16 = True
+        accelerators.append({
+            "index": index, "name": name, "vendor": VENDOR_LABEL,
+            "memory_gb": memory_gb, "driver_version": driver, "firmware_version": None,
+            "compute_capability": compute_cap, "supports_bf16": supports_bf16,
+        })
+    return accelerators
+
+
+def detect_runtime_version() -> str | None:
+    """Return a CUDA runtime label, or ``None`` when every probe comes up empty.
+
+    Probes, in order: ``torch.version.cuda``; the ``release`` line of ``nvcc --version``;
+    ``version.txt`` (returned verbatim) / ``version.json`` under ``CUDA_HOME`` or ``CUDA_PATH``.
+    Probes are best-effort: a failing probe is skipped instead of raising."""
+    try:
+        import torch
+        if torch.version.cuda:
+            return f"CUDA {torch.version.cuda}"
+    except Exception:  # a broken torch install can raise more than ImportError
+        pass
+
+    try:
+        out = subprocess.check_output(["nvcc", "--version"], text=True, stderr=subprocess.STDOUT)
+        for line in out.splitlines():
+            if "release" in line.lower():
+                parts = line.split("release")
+                if len(parts) > 1:
+                    return f"CUDA {parts[1].split(',')[0].strip()}"
+    except Exception:
+        pass
+
+    for env_var in ("CUDA_HOME", "CUDA_PATH"):
+        cuda_home = os.environ.get(env_var)
+        if not cuda_home:
+            continue
+        try:  # an unreadable or malformed version file must not abort detection
+            version_file = Path(cuda_home) / "version.txt"
+            if version_file.exists():
+                return version_file.read_text().strip()
+            version_json = Path(cuda_home) / "version.json"
+            if version_json.exists():
+                cuda = json.loads(version_json.read_text()).get("cuda", {}).get("version", "")
+                if cuda:
+                    return f"CUDA {cuda}"
+        except Exception:
+            pass
+
+    return None
+
+
+def detect_pcie_gen() -> str | None:
+    """Return ``"PCIe Gen <n>"`` from the first row ``nvidia-smi`` prints, else ``None``.
+
+    ``None`` covers every failure: the command erroring, empty output, or a
+    non-numeric value in that first row."""
+    cmd = ["nvidia-smi", "--query-gpu=pcie.link.gen.current", "--format=csv,noheader"]
+    try:
+        rows = subprocess.check_output(cmd, text=True).strip().splitlines()
+        first = rows[0].strip()
+    except Exception:
+        return None
+
+    # Only a purely numeric generation is reported.
+    if first.isdigit():
+        return f"PCIe Gen {first}"
+    return None
+
+
+def detect_topology() -> str | None:
+    """Return ``nvidia-smi topo -m`` output, or ``None`` if both attempts fail.
+
+    The ``--no-color`` attempt is returned as-is; the plain retry has ANSI colour codes removed."""
+    base = ["nvidia-smi", "topo", "-m"]
+    for extra, strip_ansi in ((["--no-color"], False), ([], True)):
+        try:
+            text = subprocess.check_output(base + extra, text=True)
+        except Exception:
+            continue
+        return re.sub(r"\x1b\[[0-9;]*m", "", text) if strip_ansi else text
+    return None
+
+
+def detect_intra_node_interconnect() -> str | None:
+    """Return 'NVLink' when the ANSI-stripped ``nvidia-smi topo -m`` output has an NV# link, else None."""
+    for cmd in (["nvidia-smi", "topo", "-m", "--no-color"], ["nvidia-smi", "topo", "-m"]):
+        try:
+            out = subprocess.check_output(cmd, text=True, stderr=subprocess.DEVNULL)
+        except Exception:
+            continue
+        if re.search(r"\bNV\d+\b", re.sub(r"\x1b\[[0-9;]*m", "", out)):  # colour codes would defeat \b
+            return "NVLink"
+    return None
+
+
+def diagnostics(env: dict, accelerators: list[dict]) -> list[str]:
+    """Return human-readable hints for env fields that could not be detected.
+
+    A field is flagged when its value is the string "unknown"; one more note is
+    added when ``accelerator_topology`` is ``None`` and accelerators are present."""
+    unknown_hints = {
+        "pytorch_version": (
+            "PyTorch is not installed — pytorch_version is unknown. For GPU stack "
+            "metadata: pip install torch (match your CUDA environment)."
+        ),
+        "runtime_version": (
+            "Could not detect CUDA/runtime (tried PyTorch CUDA, nvcc, CUDA_HOME, "
+            "nvidia-smi paths). runtime_version is unknown — install a CUDA toolkit "
+            "or PyTorch with CUDA."
+        ),
+        "pcie_generation": (
+            "Could not read PCIe generation from nvidia-smi — pcie_generation is unknown."
+        ),
+    }
+    notes = [hint for key, hint in unknown_hints.items() if (env.get(key) or "") == "unknown"]
+    # Unlike the checks above, the topology note is only emitted when accelerators is non-empty.
+    if env.get("accelerator_topology") is None and accelerators:
+        notes.append(
+            "accelerator_topology is null — nvidia-smi topo did not return data."
+        )
+    return notes
diff --git a/runners/validate_runners.py b/runners/validate_runners.py
index 66a02d19..14db0a9a 100644
--- a/runners/validate_runners.py
+++ b/runners/validate_runners.py
@@ -41,6 +41,7 @@
 
 RUNNERS_DIR = Path(__file__).parent
 SCHEMA_PATH = RUNNERS_DIR / "meta.schema.json"
+PLATFORMS_CATALOG_PATH = RUNNERS_DIR.parent / "schema" / "platforms.json"
 
 # Files that live flat in runners/ — not runner folders
 BASE_FILES = {
@@ -49,9 +50,11 @@
     "validate_submission.py",
     "validate_runners.py",
     "hash_runner.py",
+    "gen_pr_summary.py",
     "meta.schema.json",
     "protocol.py",
     "template",
+    "platforms",
     "__pycache__",
     "__init__.py",
 }
@@ -60,6 +63,18 @@
 if HAS_JSONSCHEMA and SCHEMA_PATH.exists():
     schema = json.loads(SCHEMA_PATH.read_text())
 
+# Ids of the platforms listed in schema/platforms.json. The set stays empty when
+# the catalogue file is absent or cannot be parsed.
+known_platforms: set[str] = set()
+if PLATFORMS_CATALOG_PATH.exists():
+    try:
+        _catalog = json.loads(PLATFORMS_CATALOG_PATH.read_text())
+        known_platforms = set(
+            p["id"] for p in (_catalog.get("platforms") or []) if isinstance(p, dict) and p.get("id")
+        )
+    except Exception:
+        known_platforms = set()
+
 
 def compute_hash(runner_py: Path) -> str:
     return hashlib.sha256(runner_py.read_bytes()).hexdigest()[:8]
@@ -328,6 +343,16 @@ def do_rename(folder: Path, correct_hash: str, dry_run: bool) -> Path:
             if not meta.get(field):
                 err(f"meta.json missing required field: {field}")
 
+    # ── Platform catalogue check (warning only) ──────────────────────────────
+    platform_id = meta.get("platform") or ""
+    if known_platforms and platform_id and platform_id not in known_platforms:
+        warn(
+            f"Platform '{platform_id}' is not catalogued in schema/platforms.json. "
+            f"The runner still validates (the schema accepts any lowercase identifier), "
+            f"but please consider adding an entry so the README matrix can render a "
+            f"human-readable label and stable sort order for this platform."
+        )
+
     # ── meta.id must match folder name ────────────────────────────────────────
     if meta.get("id") != folder.name:
         err(
diff --git a/schema/env.schema.json b/schema/env.schema.json
index a5d25e9a..60fc5e8d 100644
--- a/schema/env.schema.json
+++ b/schema/env.schema.json
@@ -31,6 +31,10 @@
         }
       }
     },
+    "accelerator_platform": {
+      "type": ["string","null"],
+      "description": "Identifier of the platform plug-in that produced the accelerators array (e.g. 'nvidia', 'amd', 'ascend', 'apple', 'google', 'moorethreads'). Null when no accelerator was detected."
+    },
     "accelerator_topology": { "type": ["string","null"] },
     "cpu": {
       "type": "object",
diff --git a/schema/platforms.json b/schema/platforms.json
new file mode 100644
index 00000000..8ebcc45e
--- /dev/null
+++ b/schema/platforms.json
@@ -0,0 +1,89 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$comment": "Catalog of known accelerator platforms used by AccelMark. The `platform` field in runners/meta.json is validated by a regex pattern (^[a-z][a-z0-9]*$), NOT by this list — so contributors can ship a runner for a new platform without ever editing this file. The catalog here is used purely for presentation: display names and sort order in the README/leaderboard matrix. tools/generate_platforms_matrix.py warns (does not error) when it encounters a platform id that is not listed here, prompting an optional follow-up PR to add it.",
+  "type": "object",
+  "required": ["platforms"],
+  "properties": {
+    "platforms": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["id", "display_name", "sort_order"],
+        "properties": {
+          "id": {
+            "type": "string",
+            "pattern": "^[a-z][a-z0-9]*$",
+            "description": "Platform identifier; matches the prefix of a runner folder name (e.g. nvidia in nvidia_vllm_47f5d58e)."
+          },
+          "display_name": {
+            "type": "string",
+            "description": "Human-readable hardware label used in the README matrix when a runner does not override it via meta.hardware_label."
+          },
+          "vendor": {
+            "type": ["string", "null"],
+            "description": "Vendor / manufacturer name. Informational only."
+          },
+          "accelerator_kind": {
+            "type": ["string", "null"],
+            "description": "Free-form category (GPU, NPU, TPU, SoC, ASIC, etc.). Informational only."
+          },
+          "sort_order": {
+            "type": "integer",
+            "description": "Sort key for the README matrix; lower values render first."
+          }
+        }
+      }
+    }
+  },
+  "platforms": [
+    {
+      "id": "nvidia",
+      "display_name": "NVIDIA GPU",
+      "vendor": "NVIDIA",
+      "accelerator_kind": "GPU",
+      "sort_order": 10
+    },
+    {
+      "id": "amd",
+      "display_name": "AMD GPU",
+      "vendor": "AMD",
+      "accelerator_kind": "GPU",
+      "sort_order": 20
+    },
+    {
+      "id": "ascend",
+      "display_name": "Huawei Ascend NPU",
+      "vendor": "Huawei",
+      "accelerator_kind": "NPU",
+      "sort_order": 30
+    },
+    {
+      "id": "apple",
+      "display_name": "Apple Silicon",
+      "vendor": "Apple",
+      "accelerator_kind": "SoC",
+      "sort_order": 40
+    },
+    {
+      "id": "google",
+      "display_name": "Google TPU",
+      "vendor": "Google",
+      "accelerator_kind": "TPU",
+      "sort_order": 50
+    },
+    {
+      "id": "moorethreads",
+      "display_name": "Moore Threads GPU",
+      "vendor": "Moore Threads",
+      "accelerator_kind": "GPU",
+      "sort_order": 60
+    },
+    {
+      "id": "other",
+      "display_name": "Other accelerator",
+      "vendor": null,
+      "accelerator_kind": null,
+      "sort_order": 999
+    }
+  ]
+}
diff --git a/suites/README.md b/suites/README.md
index 103e2ad1..e3cdbfb6 100644
--- a/suites/README.md
+++ b/suites/README.md
@@ -714,7 +714,7 @@ submissions.
 
 1. Open a GitHub Issue using the "Request new suite" template
 2. Specify: model, chip count, scenarios, and rationale
-3. Maintainers review and add to the roadmap
+3. Discuss the proposal in the issue thread — interested contributors weigh in
 4. Create `suites/suite_X/suite.json` referencing a shared dataset
    (or add a new dataset to `datasets/`)
 5. If custom orchestration is needed, add `suites/suite_X/suite.py`
diff --git a/suites/suite_C/suite.py b/suites/suite_C/suite.py
index b08c3cde..d3602262 100644
--- a/suites/suite_C/suite.py
+++ b/suites/suite_C/suite.py
@@ -80,9 +80,8 @@ def _run_suite_c(br, args, suite: dict, env_info: dict) -> None:
     hw_precisions     = br._detect_supported_precisions(env_info)
     baseline_precision = "BF16" if "BF16" in hw_precisions else "FP16"
 
-    # runner_backends   = [b.lower() for b in br.SUPPORTED_QUANTIZATION_BACKENDS]
     precisions_to_run = []
-    skipped           = []   # runner doesn't declare support for this backend
+    skipped           = []   # other full-precision baseline that isn't the hw default
     for p in all_precisions:
         if p == baseline_precision:
             precisions_to_run.append(p)
@@ -90,22 +89,16 @@ def _run_suite_c(br, args, suite: dict, env_info: dict) -> None:
             # The other full-precision baseline — skip silently, not the hw baseline.
             skipped.append(p)
         else:
-            # # Quantized format — only gate on whether the runner declares support
-            # # for this backend. Hardware compatibility (e.g. FP8 on V100) is left
-            # # to the inference engine: if the hardware can't run it, the subprocess
-            # # fails with the engine's own error, which is recorded in the summary.
-            # fmt_entry = precision_model_map.get(p, {})
-            # backend   = (fmt_entry.get("engine_kwargs") or {}).get("quantization", "")
-            # if not backend or backend.lower() not in runner_backends:
-            #     skipped.append(p)
-            # else:
+            # Quantized format. Hardware compatibility (e.g. FP8 on V100) is left
+            # to the inference engine: if the hardware can't run it, the subprocess
+            # fails with the engine's own error, which is recorded in the summary.
             precisions_to_run.append(p)
 
     print(f"\n{'='*60}")
     print(f"  Suite C — Quantization Efficiency Benchmark")
     print(f"  Formats to run : {precisions_to_run}")
     if skipped:
-        print(f"  Skipped        : {skipped} (backend not in SUPPORTED_QUANTIZATION_BACKENDS)")
+        print(f"  Skipped        : {skipped} (other full-precision baseline)")
     print(f"  Base output    : {base_dir}")
     print(f"{'='*60}\n")
 
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/generate_platforms_matrix.py b/tools/generate_platforms_matrix.py
new file mode 100644
index 00000000..5e24b126
--- /dev/null
+++ b/tools/generate_platforms_matrix.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+Regenerate the "Supported platforms" matrix in the top-level README.md
+from runner metadata.
+
+Each runner's ``meta.json`` declares which AccelMark suites it supports
+via ``suite_support``; the table here is a simple projection of that
+data and never needs to be hand-edited. ``schema/platforms.json``
+provides the human-readable hardware label and the row ordering.
+
+Modes:
+
+    # rewrite README.md in place
+    python tools/generate_platforms_matrix.py
+
+    # fail (exit 1) if README.md is out of sync with the runner metadata —
+    # intended for CI so a PR that adds a runner without updating the
+    # README is rejected automatically
+    python tools/generate_platforms_matrix.py --check
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Iterable
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent  # this file lives in tools/, one level below the root
+_RUNNERS_DIR = _REPO_ROOT / "runners"
+_PLATFORMS_CATALOG = _REPO_ROOT / "schema" / "platforms.json"
+_README = _REPO_ROOT / "README.md"
+
+START_MARKER = "<!-- platforms-matrix:start -->"  # the generated table is spliced in after this marker...
+END_MARKER = "<!-- platforms-matrix:end -->"  # ...and ends right before this one
+
+# Names that live flat in runners/ — not runner folders; _iter_runner_metas() skips them.
+_BASE_FILES = {
+    "benchmark_runner.py",
+    "collect_env.py",
+    "validate_submission.py",
+    "validate_runners.py",
+    "hash_runner.py",
+    "gen_pr_summary.py",
+    "meta.schema.json",
+    "protocol.py",
+    "template",
+    "platforms",
+    "__pycache__",
+    "__init__.py",
+    "README.md",
+}
+
+SUITE_KEYS = ["A", "B", "C", "D", "E", "F", "G"]  # one matrix column per suite, in this order
+
+STATUS_GLYPH = {  # suite_support status -> table cell; any other value renders as "?"
+    "validated": "✓",
+    "pending": "⋯",  # described in the legend as "author-declared"
+    "unsupported": "—",
+}
+
+
+def _load_platforms_catalog() -> dict[str, dict]:
+    """Return ``{platform id: catalogue entry}`` read from ``schema/platforms.json``.
+
+    Best-effort: a missing, unparseable or unexpectedly shaped file yields ``{}``;
+    entries that are not objects or lack a truthy ``id`` are ignored."""
+    if not _PLATFORMS_CATALOG.exists():
+        return {}
+    try:
+        data = json.loads(_PLATFORMS_CATALOG.read_text(encoding="utf-8"))
+    except Exception:
+        return {}
+    entries = data.get("platforms") if isinstance(data, dict) else None
+    if not isinstance(entries, list):  # e.g. the JSON root is a list, or "platforms" is absent
+        return {}
+    return {entry["id"]: entry for entry in entries if isinstance(entry, dict) and entry.get("id")}
+
+
+def _iter_runner_metas() -> Iterable[dict]:
+    """Yield the parsed ``meta.json`` of each runner folder, in sorted folder order.
+
+    Skipped: non-directories, names in ``_BASE_FILES``, dot-folders, folders whose
+    ``meta.json`` is missing, unparseable or not a JSON object, and runners that
+    set ``deprecated_by``."""
+    for folder in sorted(_RUNNERS_DIR.iterdir()):
+        if not folder.is_dir() or folder.name in _BASE_FILES or folder.name.startswith("."):
+            continue
+        try:
+            meta = json.loads((folder / "meta.json").read_text(encoding="utf-8"))
+        except Exception:  # also covers a folder without meta.json
+            continue
+        # Ignore non-object JSON, and runners that have been superseded — they
+        # should not clutter the matrix once a successor is merged.
+        if not isinstance(meta, dict) or meta.get("deprecated_by"):
+            continue
+        yield meta
+
+
+def _runner_row(meta: dict, catalog: dict[str, dict]) -> tuple[tuple, list[str]]:
+    """Return ``(sort_key, row_cells)`` for one runner's ``meta.json``.
+
+    Cells are: hardware label, back-ticked runner id, framework, then one status
+    glyph per entry of ``SUITE_KEYS``. Rows sort by catalogue ``sort_order``
+    (999 when absent or not an integer), then platform id, label and runner id.
+    """
+    platform_id = meta.get("platform") or "other"
+    platform_entry = catalog.get(platform_id) or {}
+    hardware_label = str(
+        meta.get("hardware_label") or platform_entry.get("display_name") or platform_id.capitalize()
+    )
+    runner_id = str(meta.get("id") or "?")
+    framework = str(meta.get("framework") or "?")
+
+    suite_support = meta.get("suite_support") or {}
+    suite_cells = [STATUS_GLYPH.get(suite_support.get(s), "?") for s in SUITE_KEYS]
+
+    try:
+        sort_order = int(platform_entry.get("sort_order", 999))
+    except (TypeError, ValueError):
+        sort_order = 999
+    sort_key = (sort_order, platform_id, hardware_label, runner_id)
+    # A literal "|" would end a Markdown table cell early, so escape it in every cell.
+    cells = [hardware_label, f"`{runner_id}`", framework, *suite_cells]
+    return sort_key, [cell.replace("|", "\\|") for cell in cells]
+
+
+def _build_table() -> str:
+    """Render the platforms matrix as a Markdown table followed by its legend.
+
+    Layout: header, alignment row, one row per runner (ordered by the sort key
+    from ``_runner_row``), a blank line, then the legend; the text ends with a
+    single newline. As a side effect a WARNING naming every platform id that is
+    not a catalogue key is printed to stderr ("?" stands for a missing id).
+    """
+    catalog = _load_platforms_catalog()
+    metas = list(_iter_runner_metas())
+
+    # Platform ids without a catalogue entry. Their rows still render through the
+    # fallbacks in _runner_row; they are only reported further down.
+    uncatalogued = {
+        m.get("platform") or "?"
+        for m in metas
+        if (m.get("platform") or "") not in catalog
+    }
+    keyed_rows = sorted(
+        (_runner_row(m, catalog) for m in metas), key=lambda kr: kr[0]
+    )
+
+    # Three fixed columns, then one centre-aligned column per suite.
+    suite_header = "".join(f"| {s} " for s in SUITE_KEYS)
+    header = "| Hardware | Runner folder | Framework " + suite_header + "|"
+    sep = "|---|---|---" + "|:-:" * len(SUITE_KEYS) + "|"
+    body_lines = ["| " + " | ".join(cells) + " |" for _key, cells in keyed_rows]
+    legend = (
+        "_Legend: ✓ validated · ⋯ author-declared "
+        "(not smoke-tested in this repo yet) · — unsupported._"
+    )
+
+    if uncatalogued:
+        unknown = ", ".join(sorted(uncatalogued))
+        print(
+            f"WARNING: encountered platform id(s) not catalogued in "
+            f"schema/platforms.json: {unknown}. The matrix still renders "
+            f"using fallbacks, but please consider opening a small PR "
+            f"adding them to the catalog.",
+            file=sys.stderr,
+        )
+
+    # rstrip() followed by "\n" leaves exactly one trailing newline.
+    return "\n".join([header, sep, *body_lines, "", legend]).rstrip() + "\n"
+
+
+def _splice_into_readme(table: str) -> str:
+    """Return README.md's text with the span between the two markers replaced by ``table``.
+
+    Raises ``SystemExit`` if a marker is missing or the end marker does not follow the start marker."""
+    src = _README.read_text(encoding="utf-8")
+    pre, found_start, rest = src.partition(START_MARKER)
+    _mid, found_end, post = rest.partition(END_MARKER)
+    if not (found_start and found_end):
+        raise SystemExit(
+            f"README.md is missing the platforms-matrix markers. "
+            f"Expected '{START_MARKER}' followed by '{END_MARKER}', each on its own line."
+        )
+    return f"{pre}{START_MARKER}\n{table}{END_MARKER}{post}"
+
+
+def main() -> int:
+    """CLI entry point: return 0 if README.md is (now) current, 1 if ``--check`` finds it stale."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Exit non-zero if README.md is out of sync with runner metadata "
+        "(does not write).",
+    )
+    args = parser.parse_args()
+
+    new_readme = _splice_into_readme(_build_table())
+    if new_readme == _README.read_text(encoding="utf-8"):  # UTF-8: the table has non-ASCII glyphs
+        print("README.md platforms matrix is up to date.")
+        return 0
+
+    if args.check:
+        print(
+            "ERROR: README.md platforms matrix is out of sync with "
+            "runners/*/meta.json. Run:\n"
+            "    python tools/generate_platforms_matrix.py\n"
+            "and commit the result.",
+            file=sys.stderr,
+        )
+        return 1
+
+    _README.write_text(new_readme, encoding="utf-8")
+    print("README.md platforms matrix regenerated.")
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/utils/run_all_4gpu.sh b/utils/run_all_4gpu.sh
deleted file mode 100644
index d79cb682..00000000
--- a/utils/run_all_4gpu.sh
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/bin/bash
-# AccelMark full benchmark run — 4 GPU
-#
-# Stage 2 uses a global GPU pool scheduler rather than per-suite GPU blocks.
-#
-# Stage 2 GPU assignment (A100-80GB timing estimates):
-#   GPU 0 (~68m): D/online(38m) → C/online(20m) → F/interactive(6m) → A/offline(3m) → F/offline(1m)
-#   GPU 1 (~73m): A/interactive(35m) → C/offline(20m) → D/offline(18m)
-#   GPU 2 (~71m): D/sustained(31m) → C/sustained(25m) → F/sustained(15m)
-#   GPU 3 (~68m): A/sustained(30m) → D/interactive(27m) → A/online(8m) → F/online(3m)
-#
-#   Makespan: ~73 min  (vs ~114 min with per-suite blocking)
-#   GPU utilization: ~96%
-#
-# Suite C note: each scenario runs all 5 precision formats internally.
-#   No separate accuracy gate or merge_suite needed for Suite C.
-#
-# All scenarios run to completion even if some fail (no set -e).
-
-RUNNER_ID='nvidia_vllm_47f5d58e'
-
-log() { echo "[$(date '+%H:%M:%S')] $*"; }
-
-run_scenario() {
-    local suite=$1 scenario=$2 gpus=$3
-    log "START  $suite/$scenario (GPU $gpus)"
-    CUDA_VISIBLE_DEVICES=$gpus \
-        python run.py \
-            --runner "$RUNNER_ID" \
-            --suite "$suite" \
-            --tier verified \
-            --scenario "$scenario" \
-        && log "OK     $suite/$scenario" \
-        || log "FAILED $suite/$scenario (exit $?)"
-}
-
-merge_suite() {
-    local suite=$1 gpus=$2
-    log "MERGE  $suite"
-    CUDA_VISIBLE_DEVICES=$gpus \
-        python run.py \
-            --runner "$RUNNER_ID" \
-            --suite "$suite" \
-            --tier verified \
-            --scenario all \
-            --skip-accuracy-gate \
-        && log "OK     $suite merge" \
-        || log "FAILED $suite merge (exit $?)"
-}
-
-# ── Stage 1: Suite B ──────────────────────────────────────────────────────────
-log "===== Stage 1: Suite B (GPUs 0-3, ~76 min) ====="
-
-run_scenario suite_B accuracy    0,1,2,3
-run_scenario suite_B offline     0,1,2,3   # ~6m
-run_scenario suite_B online      0,1,2,3   # ~14m
-run_scenario suite_B interactive 0,1,2,3   # ~26m
-run_scenario suite_B sustained   0,1,2,3   # ~31m
-merge_suite  suite_B             0,1,2,3
-
-log "Stage 1 complete."
-
-# ── Stage 2a: Accuracy gates ──────────────────────────────────────────────────
-log "===== Stage 2a: Accuracy gates (A/D/F in parallel) ====="
-
-run_scenario suite_A accuracy 0 &
-run_scenario suite_D accuracy 1 &
-run_scenario suite_F accuracy 2 &
-wait
-
-# ── Stage 2b: Global scheduler — all 4 GPUs, ~73 min ─────────────────────────
-log "===== Stage 2b: Benchmark scenarios — global schedule (GPUs 0-3, ~73 min) ====="
-
-(                                               # GPU 0: ~68m
-    run_scenario suite_D online       0   # ~38m
-    run_scenario suite_C online       0   # ~20m
-    run_scenario suite_F interactive  0   # ~6m
-    run_scenario suite_A offline      0   # ~3m
-    run_scenario suite_F offline      0   # ~1m
-) &
-
-(                                               # GPU 1: ~73m
-    run_scenario suite_A interactive  1   # ~35m
-    run_scenario suite_C offline      1   # ~20m
-    run_scenario suite_D offline      1   # ~18m
-) &
-
-(                                               # GPU 2: ~71m
-    run_scenario suite_D sustained    2   # ~31m
-    run_scenario suite_C sustained    2   # ~25m
-    run_scenario suite_F sustained    2   # ~15m
-) &
-
-(                                               # GPU 3: ~68m
-    run_scenario suite_A sustained    3   # ~30m
-    run_scenario suite_D interactive  3   # ~27m
-    run_scenario suite_A online       3   # ~8m
-    run_scenario suite_F online       3   # ~3m
-) &
-
-wait
-
-# Final clean merge (no-op on scenarios, just rebuilds result.json).
-# Suite C is exempt — it has its own merge logic.
-log "===== Stage 2c: Final merge ====="
-merge_suite suite_A 0 &
-merge_suite suite_D 1 &
-merge_suite suite_F 2 &
-wait
-
-log "Stage 2 complete."
-
-# ── Stage 3: Suite E ──────────────────────────────────────────────────────────
-log "===== Stage 3: Suite E (GPUs 0-3, chip-count sweep 1x/2x/4x) ====="
-
-run_scenario suite_E accuracy 0,1,2,3
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
-    python run.py \
-        --runner "$RUNNER_ID" \
-        --suite suite_E \
-        --tier verified \
-        --scenario offline \
-        --max-chips 4 \
-    && log "OK     suite_E/offline (all chip counts)" \
-    || log "FAILED suite_E/offline (exit $?)"
-
-merge_suite suite_E 0,1,2,3
-
-log "Stage 3 complete."
-log "===== All Done ====="
diff --git a/utils/run_all_8gpu.sh b/utils/run_all_8gpu.sh
deleted file mode 100644
index 9cedf7ad..00000000
--- a/utils/run_all_8gpu.sh
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/bash
-# AccelMark full benchmark run — 8 GPU
-#
-# Stage 2 uses a global GPU pool scheduler rather than per-suite GPU blocks.
-# Each GPU is assigned a sequence of scenarios (possibly from different suites)
-# so no GPU sits idle while another suite's long task is running.
-#
-# Scheduling is longest-first greedy across all 15 Stage 2 benchmark scenarios.
-# Each GPU runs its assigned scenarios sequentially; all 8 GPUs run in parallel.
-#
-# Stage 2 GPU assignment (A100-80GB timing estimates):
-#   GPU 0 (~38m): D/online(38m)
-#   GPU 1 (~35m): A/interactive(35m)
-#   GPU 2 (~33m): D/sustained(31m) → F/online(3m)
-#   GPU 3 (~34m): A/sustained(30m) → A/offline(3m) → F/offline(1m)
-#   GPU 4 (~33m): D/interactive(27m) → F/interactive(6m)
-#   GPU 5 (~33m): C/sustained(25m) → A/online(8m)
-#   GPU 6 (~38m): C/offline(20m) → D/offline(18m)
-#   GPU 7 (~35m): C/online(20m) → F/sustained(15m)
-#
-#   Makespan: ~38.5 min  (vs ~68 min with per-suite blocking)
-#   GPU utilization: ~91%
-#
-# Suite C note: each scenario (offline/online/sustained) runs all 5 precision
-#   formats internally (BF16/FP8/W8A8/W8A16/W4A16) — no separate accuracy gate.
-#   Suite C has its own merge logic; merge_suite is not called for it.
-#
-# Merge note: parallel scenarios within the same suite race on writing the
-#   suite-level result.json. merge_suite runs --scenario all after Stage 2
-#   completes — all subdirs already have result.json so the framework skips
-#   execution and does one clean uncontested merge.
-#
-# All scenarios run to completion even if some fail (no set -e).
-
-RUNNER_ID='nvidia_vllm_47f5d58e'
-
-log() { echo "[$(date '+%H:%M:%S')] $*"; }
-
-run_scenario() {
-    local suite=$1 scenario=$2 gpus=$3
-    log "START  $suite/$scenario (GPU $gpus)"
-    CUDA_VISIBLE_DEVICES=$gpus \
-        python run.py \
-            --runner "$RUNNER_ID" \
-            --suite "$suite" \
-            --tier verified \
-            --scenario "$scenario" \
-        && log "OK     $suite/$scenario" \
-        || log "FAILED $suite/$scenario (exit $?)"
-}
-
-merge_suite() {
-    local suite=$1 gpus=$2
-    log "MERGE  $suite"
-    CUDA_VISIBLE_DEVICES=$gpus \
-        python run.py \
-            --runner "$RUNNER_ID" \
-            --suite "$suite" \
-            --tier verified \
-            --scenario all \
-            --skip-accuracy-gate \
-        && log "OK     $suite merge" \
-        || log "FAILED $suite merge (exit $?)"
-}
-
-# ── Stage 1: Suite B ──────────────────────────────────────────────────────────
-# 70B model needs all 8 GPUs per scenario — strictly sequential.
-log "===== Stage 1: Suite B (GPUs 0-7, ~76 min) ====="
-
-run_scenario suite_B accuracy    0,1,2,3,4,5,6,7
-run_scenario suite_B offline     0,1,2,3,4,5,6,7   # ~6m
-run_scenario suite_B online      0,1,2,3,4,5,6,7   # ~14m
-run_scenario suite_B interactive 0,1,2,3,4,5,6,7   # ~26m
-run_scenario suite_B sustained   0,1,2,3,4,5,6,7   # ~31m
-merge_suite  suite_B             0,1,2,3,4,5,6,7
-
-log "Stage 1 complete."
-
-# ── Stage 2a: Accuracy gates ──────────────────────────────────────────────────
-# A/D/F accuracy in parallel. Suite C skips — accuracy runs per-precision
-# inside its scenario subprocesses.
-log "===== Stage 2a: Accuracy gates (A/D/F in parallel) ====="
-
-run_scenario suite_A accuracy 0 &
-run_scenario suite_D accuracy 1 &
-run_scenario suite_F accuracy 2 &
-wait
-
-# ── Stage 2b: Global scheduler — all 8 GPUs, ~38.5 min ───────────────────────
-# Each GPU runs its sequence of scenarios independently.
-# Different suites can share a GPU — they just run sequentially on that GPU.
-log "===== Stage 2b: Benchmark scenarios — global schedule (GPUs 0-7, ~38.5 min) ====="
-
-( run_scenario suite_D online       0 ) &   # GPU 0: ~38m
-
-( run_scenario suite_A interactive  1 ) &   # GPU 1: ~35m
-
-(                                           # GPU 2: ~33m
-    run_scenario suite_D sustained  2
-    run_scenario suite_F online     2
-) &
-
-(                                           # GPU 3: ~34m
-    run_scenario suite_A sustained  3
-    run_scenario suite_A offline    3
-    run_scenario suite_F offline    3
-) &
-
-(                                           # GPU 4: ~33m
-    run_scenario suite_D interactive 4
-    run_scenario suite_F interactive 4
-) &
-
-(                                           # GPU 5: ~33m
-    run_scenario suite_C sustained  5
-    run_scenario suite_A online     5
-) &
-
-(                                           # GPU 6: ~38m
-    run_scenario suite_C offline    6
-    run_scenario suite_D offline    6
-) &
-
-(                                           # GPU 7: ~35m
-    run_scenario suite_C online     7
-    run_scenario suite_F sustained  7
-) &
-
-wait
-
-# Final clean merge for each suite (no-op on scenarios, just rebuilds result.json).
-# Suite C is exempt — it has its own merge logic.
-log "===== Stage 2c: Final merge ====="
-merge_suite suite_A 0 &
-merge_suite suite_D 1 &
-merge_suite suite_F 2 &
-wait
-
-log "Stage 2 complete."
-
-# ── Stage 3: Suite E ──────────────────────────────────────────────────────────
-log "===== Stage 3: Suite E (GPUs 0-7, chip-count sweep 1x/2x/4x/8x) ====="
-
-run_scenario suite_E accuracy 0,1,2,3,4,5,6,7
-
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-    python run.py \
-        --runner "$RUNNER_ID" \
-        --suite suite_E \
-        --tier verified \
-        --scenario offline \
-        --max-chips 8 \
-    && log "OK     suite_E/offline (all chip counts)" \
-    || log "FAILED suite_E/offline (exit $?)"
-
-merge_suite suite_E 0,1,2,3,4,5,6,7
-
-log "Stage 3 complete."
-log "===== All Done ====="