diff --git a/.github/workflows/generate_leaderboard.yml b/.github/workflows/generate_leaderboard.yml index 550c13ba..31dec72d 100644 --- a/.github/workflows/generate_leaderboard.yml +++ b/.github/workflows/generate_leaderboard.yml @@ -1,15 +1,22 @@ name: Generate Leaderboard -# Triggers only when results/ changes land on main. -# This covers both direct merges and squash merges. +# Triggers when anything that can change the rendered leaderboard lands on +# main: new community/verified results, generator code, the static site, or +# the auto-generated README platforms matrix (so a runner/platform metadata +# change also redeploys). This covers both direct merges and squash merges. on: push: branches: - main paths: - 'results/**' + - 'leaderboard/**' + - 'tools/generate_platforms_matrix.py' + - 'schema/platforms.json' + - 'runners/*/meta.json' - # Allow manual trigger from Actions tab (useful for first deploy) + # Allow manual trigger from Actions tab (useful for first deploy or to + # force a redeploy when nothing in the watched paths changed). workflow_dispatch: jobs: diff --git a/.github/workflows/validate_pr.yml b/.github/workflows/validate_pr.yml index 0ce5ada4..36ea8561 100644 --- a/.github/workflows/validate_pr.yml +++ b/.github/workflows/validate_pr.yml @@ -1,12 +1,16 @@ name: Validate Submission -# Triggers only when results/ directory is touched in a PR. -# Schema changes, script changes, and doc changes do not trigger this. +# Triggers when results/, runner metadata, the platforms catalogue, the +# README (which contains the auto-generated matrix), or the matrix +# generator itself are touched in a PR. on: pull_request: paths: - 'results/**' - 'runners/**' + - 'schema/platforms.json' + - 'tools/generate_platforms_matrix.py' + - 'README.md' jobs: validate-runners: @@ -66,6 +70,24 @@ jobs: if: steps.changed.outputs.folders == '' run: echo "No runner folders changed in this PR — skipping." 
+ # Always validate every runner folder (not just the ones touched in + # this PR). This catches drift introduced by shared changes — e.g. + # a meta.schema.json edit that breaks an unrelated existing runner. + - name: Validate all runner folders (drift check) + run: | + echo "::group::Validating every runner folder in the repo" + python runners/validate_runners.py + echo "::endgroup::" + + # README "Supported platforms" matrix is generated from each runner's + # meta.json. If a PR changes a runner's suite_support / hardware_label + # or adds a new runner without regenerating the table, fail. + - name: README platforms matrix is in sync + run: | + echo "::group::tools/generate_platforms_matrix.py --check" + python tools/generate_platforms_matrix.py --check + echo "::endgroup::" + validate: name: Validate result submissions runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 503a227f..2ea1c8c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,27 +1,50 @@ +# ── Python ────────────────────────────────────────────────────────────────── __pycache__/ *.py[cod] +*.egg *.egg-info/ dist/ build/ .venv/ venv/ env/ -*.egg + +# ── Editor / IDE ──────────────────────────────────────────────────────────── +.idea/ +.vscode/ +*.swp +*.swo +*~ +*.tmp .DS_Store -*.log -my_submission/ -mini_result/ -/tmp/ -leaderboard/site/leaderboard.js -leaderboard/site/api/ + +# ── Test / lint caches ────────────────────────────────────────────────────── +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ +.tox/ + +# ── Jupyter ───────────────────────────────────────────────────────────────── +.ipynb_checkpoints/ + +# ── AccelMark local-only files ────────────────────────────────────────────── configs/models_local.yaml configs/submitter.yaml configs/runner_configs/*.yaml +leaderboard/site/leaderboard.js +leaderboard/site/api/ -# Local-only benchmark artifacts (not needed for submission) +# ── Benchmark artifacts (local-only — not part of submissions) 
────────────── +samples.jsonl +samples.jsonl.ipynb_checkpoints/ accuracy_outputs.jsonl run.log -samples.jsonl.ipynb_checkpoints/ +*.log +my_submission/ +mini_result/ *_backup/ backup/ -.ipynb_checkpoints/ +/tmp/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..354654ba --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,144 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and maintainers pledge to make participation in +AccelMark a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic +status, nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances + of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Benchmark-specific expectations + +AccelMark is a 
results-driven leaderboard. The following are specifically +not acceptable: + +* **Cherry-picked, doctored, or fabricated results.** Submitting a result + knowing it does not reflect the listed hardware / software is misconduct, + not a mistake. Mistakes are expected and welcome; fabrication is not. +* **Misrepresentation of affiliation.** Vendor employees may submit results + for their own hardware (it is encouraged) — but the `[vendor]` tag in the + submitter name must be present, per `CONTRIBUTING.md`. +* **Disparaging another vendor or contributor's hardware in PR/issue + comments.** Numbers speak; commentary should focus on methodology and + reproducibility, not on the entity behind a competing result. + +## Enforcement Responsibilities + +Project maintainers are responsible for clarifying and enforcing our +standards of acceptable behavior and will take appropriate and fair +corrective action in response to any behavior that they deem inappropriate, +threatening, offensive, or harmful. + +Maintainers have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that +are not aligned to this Code of Conduct, and will communicate reasons for +moderation decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies +when an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail +address, posting via an official social media account, or acting as an +appointed representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the project maintainers by opening a confidential security +advisory at +https://github.com/JuhaoLiang1997/AccelMark/security/advisories/new or, when GitHub access is not available, by emailing the maintainer listed +in the repository profile. All complaints will be reviewed and investigated +promptly and fairly. 
+ +All community leaders are obligated to respect the privacy and security of +the reporter of any incident. + +## Enforcement Guidelines + +Project maintainers will follow these Community Impact Guidelines in +determining the consequences for any action they deem in violation of this +Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from a maintainer, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, for a specified period of time. +Violating these terms may lead to a temporary or permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public +or private interaction with the people involved, including unsolicited +interaction with those enforcing the Code of Conduct, is allowed during +this period. Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +https://www.contributor-covenant.org/version/2/1/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. + +[homepage]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f4b2e798..5dda79da 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,9 +11,10 @@ in the leaderboard and submitting your results. **Got a GPU? Here's the shortest path to getting on the leaderboard:** ```bash -# 1. Clone and install -git clone https://github.com/JuhaoLiang1997/AccelMark.git +# 1. Fork the repo on GitHub, then clone your fork +git clone https://github.com/{your-username}/AccelMark.git cd AccelMark +pip install -e . pip install -r runners/nvidia_vllm_47f5d58e/requirements.txt # 2. Set your name (one-time setup) @@ -23,14 +24,19 @@ cp configs/submitter.yaml.example configs/submitter.yaml # 3. Run the benchmark (~11 min on A100 for default scenarios) # Accuracy gate runs automatically before the benchmark starts. # Output directory is auto-named using run_name, e.g.: - # results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm_47f5d58e_ed4b0557 - python run.py --runner nvidia_vllm_47f5d58e --suite suite_A +# results/community/nvidia_a100_sxm4_80gbx1_suite_A_nvidia_vllm_47f5d58e_ed4b0557 +python run.py --runner nvidia_vllm_47f5d58e --suite suite_A -# 4. Submit — open a GitHub Issue and paste your result.json -# https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md +# 4. Open a pull request with your result +git checkout -b submit/{run_name} +git add results/ && git commit -m "results: {chip} on suite_A" +git push origin submit/{run_name} +gh pr create # or open the PR via the GitHub web UI ``` -That's it. The CI bot handles the rest. +That's it. 
CI validates the result automatically; merging the PR publishes it to the leaderboard. + +> _Prefer not to use git?_ Open a [Community Submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md), paste your `result.json`, and the CI bot will draft the PR on your behalf. --- @@ -87,6 +93,22 @@ models: `configs/models_local.yaml` is gitignored. Once configured, you don't need `--model-path` on the command line. +### Per-runner config overrides (optional) + +If you want to permanently change a runner's defaults (e.g. raise +`max_num_seqs`, enable `enforce_eager`, set `tensor_parallel_size`) without +adding flags to every invocation, drop a yaml at +`configs/runner_configs/runner_{runner_id}.yaml`, where `{runner_id}` is the runner folder name. The file is +**gitignored** — only `*.yaml.example` companions are checked into the +repo. That makes the override strictly local to your machine and keeps +the canonical defaults intact for everyone else. + +```bash +cp configs/runner_configs/runner_nvidia_vllm_47f5d58e.yaml.example \ + configs/runner_configs/runner_nvidia_vllm_47f5d58e.yaml +# edit freely — your benchmarks now pick up the overrides automatically +``` + --- ## Running the benchmark @@ -238,7 +260,7 @@ Add **speculative** (~3 min extra on Suite A, ~24 min extra on Suite D) or **bur --- -## Submitting your results +## Submitting a result ### Accuracy gate (automatic) @@ -271,28 +293,42 @@ framework, same precision, same inference stack. **Resuming an interrupted run:** Re-running the same command resumes from where it stopped. Completed steps are skipped automatically. -### Step 1: Open a GitHub Issue +### Recommended: open a pull request -Go to [Issues → New → Community Submission](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md). +After a successful run, validate locally and open a PR: -Paste the full contents of your `result.json` into the code block and submit. 
+```bash +# Validate the produced files against the schemas (the same check CI runs). +python runners/validate_submission.py \ + results/community/{run_name}/result.json + +# Stage just the new result and env file. +git checkout -b submit/{run_name} +git add results/community/{run_name}/ +git commit -m "results: {chip} on suite_A" +git push origin submit/{run_name} + +# Open the PR — either via the GitHub web UI or: +gh pr create --fill +``` -> **The CI bot validates your result automatically** — recommend to run -> `validate_submission.py` locally first. If validation fails, the bot -> comments on your issue explaining what to fix. +What gets committed is *only* the new files under `results/community/{run_name}/`: +your `result.json`, `env_info.json`, and (optionally) `samples.jsonl` — which is gitignored, so stage it explicitly with `git add -f`. Nothing +else in the repo should change. -> **Why paste instead of attach?** The CI bot reads `result.json` directly -> from the issue body. File attachments are not accessible to GitHub Actions. +CI then re-runs the schema validator and the runner-folder integrity check. +When both pass and a contributor reviews the diff, the PR is merged and your +result shows up on the leaderboard on the next site build. -### Step 2: Done +### Alternative: open a submission issue (no git required) -The CI bot will: -1. Validate your `result.json` against the schema -2. Open a PR with your result files -3. Comment on your issue with a link to the PR +If you'd rather not use git, paste your `result.json` into a +[Community Submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md). +A bot will validate the JSON, draft a PR with the files in the right place, +and link it back to your issue. You don't need to touch git or fork the repo. -Your result appears on the **Community** tab after the maintainer reviews -and merges the PR — usually within a day or two. +> **Why paste instead of attach?** The bot reads `result.json` directly from +> the issue body. File attachments are not accessible to GitHub Actions. 
--- @@ -300,10 +336,13 @@ and merges the PR — usually within a day or two. | Tier | How to get it | Leaderboard placement | |------|--------------|----------------------| -| **community** | Submit via GitHub Issue, passes CI validation | Community tab | -| **verified** | Maintainer reproduced your result within 5% | Main leaderboard | +| **community** | Submit a PR (or issue → bot-drafted PR) and pass CI validation | Community tab | +| **verified** | Independently reproduced on the same hardware/runner within 5% | Main leaderboard | -To request verification, comment on your submission issue. +To promote a community result to **verified**, anyone with the same hardware +and runner can run the same suite and open a follow-up PR that lands the +reproduction in `results/verified/`. Maintainers do not gate this — every +verified result is itself reproducible by definition. --- @@ -329,11 +368,29 @@ If your local copy was downloaded at a different revision, add a note in --- -## Adding support for a new platform - -Create a new runner folder under `runners/` by subclassing `BenchmarkRunner`. -See [DEVELOPMENT.md](DEVELOPMENT.md) for the full implementation guide including -how to compute your runner's hash ID. +## Adding a new runner + +A "runner" here is a Python class that wraps an inference framework (vLLM, +SGLang, mlx-lm, …) and exposes the AccelMark standard interface. Adding +one for an **existing** platform (NVIDIA, AMD, Ascend, Apple, Google TPU, +Moore Threads, …) does not require touching any shared file. The full +walk-through lives in [`runners/README.md`](runners/README.md); the short +version is: + +1. Copy `runners/template/runner.py` into a temporary folder and fill in + the three required methods (`load_model`, `inference_fn_offline`, + `release_resources`) plus `inference_fn_streaming` if your framework + has a streaming API. +2. Compute the hash and rename the folder: + `python runners/hash_runner.py runners/tmp/` + produces e.g. 
`nvidia_myframework_3f8a2c1d`. +3. Write `meta.json` next to it, including `suite_support` — that field + is **how the top-level `README.md` table picks up your runner**. You + never edit `README.md` yourself. +4. Add a `requirements.txt`. +5. Validate: `python runners/validate_runners.py --dir runners/`. +6. Regenerate the README matrix locally: + `python tools/generate_platforms_matrix.py`. ```python # runners/your_platform_{hash8}/runner.py @@ -347,7 +404,7 @@ class MyFrameworkRunner(BenchmarkRunner): SUPPORTS_ONLINE = True SUPPORTS_MULTI_CHIP = True # set False if no tensor parallelism - def load_model(self, model_path: str, suite: dict, parallelism: dict) -> None: + def load_model(self, model_path: str, parallelism: dict) -> None: tp_size = parallelism["tensor_parallel_size"] self.model = MyFramework.load(model_path, tp=tp_size) @@ -383,14 +440,37 @@ if __name__ == "__main__": All orchestration (result building, accuracy reuse, Suite E, etc.) is inherited from `BenchmarkRunner` automatically. -**Checklist for a new platform PR:** +**Checklist for a new-runner PR (existing platform):** - [ ] Runner folder named `{platform}_{name}_{hash8}` with correct hash - [ ] `runner.py` subclasses `BenchmarkRunner` and passes `runners/validate_runners.py` -- [ ] `meta.json` present and valid (see `runners/meta.schema.json`) +- [ ] `meta.json` present and valid (see `runners/meta.schema.json`), with + `suite_support` declared for every suite your runner can or cannot run - [ ] `requirements.txt` included -- [ ] At least one reference result in `results/community/` -- [ ] `runners/collect_env.py` updated to detect your hardware (see [DEVELOPMENT.md](DEVELOPMENT.md)) -- [ ] `README.md` supported platforms table updated +- [ ] `tools/generate_platforms_matrix.py --check` passes locally (CI also + enforces this) +- [ ] At least one reference result in `results/community/` (validated by CI) + +### Adding a new accelerator family + +If you are bringing up a **new platform** (e.g. 
a vendor not yet in +`schema/platforms.json`), the only additional file you need to ship is + +``` +runners/platforms/{platform}.py +``` + +which exports module-level `collect()`, `detect_runtime_version()` and a +few optional helpers. The collector at `runners/collect_env.py` +auto-discovers it; no change to that file is required. See +[`runners/README.md`](runners/README.md#adding-a-new-accelerator-family) +for the full protocol and a worked example. + +Optional polish steps when the new platform stabilises: + +- Add an entry to `schema/platforms.json` so the README matrix renders a + pretty hardware label and stable sort order. Until then, the matrix + renders the bare identifier and `validate_runners.py` emits a + non-fatal warning prompting this follow-up. See [DEVELOPMENT.md](DEVELOPMENT.md) for the full implementation reference. @@ -404,9 +484,10 @@ If a result looks wrong: 2. Include: the submission name, what looks wrong, and ideally your own run on the same hardware as evidence -Maintainers will investigate. If confirmed suspicious, the result's `meta.flagged` -field will be set to a reason string and it will appear with a ⚠️ badge on the -leaderboard. +The community discusses the report in the issue. If the consensus is that +the result is suspicious, a PR sets `meta.flagged` on that result to a +reason string and the entry shows up with a ⚠️ badge on the leaderboard. +Anyone can open that follow-up PR. 
--- diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 22d183c9..343a3132 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -714,7 +714,7 @@ Common patterns: # Resolve model path (checks models_local.yaml) path = br._resolve_model_path(model_id, args.model_path) -# Parse scenarios config (handles legacy flat array and new dict format) +# Parse scenarios config — expects {"default": [...], "extra": [...]} default, extra = br._parse_scenarios_config(suite) # Merge scenario results after running offline+online+interactive @@ -897,7 +897,7 @@ Key constraints: | `meta.run_id` | string\|null | 8-char hex hash of hardware+software+suite+submitter. Deterministic — same config always produces same `run_id`. Used for duplicate detection. | | `meta.run_name` | string\|null | Full directory name: `{chip}x{count}_{suite}_{runner}_{run_id}`. Used as the output directory name. | | `meta.time` | string\|null | Benchmark start time HH:MM:SS. | -| `meta.flagged` | string\|null | Null for normal results. Maintainer sets to a reason string if result is suspicious — triggers ⚠️ badge on leaderboard. | +| `meta.flagged` | string\|null | Null for normal results. Set to a reason string if community review concludes the result is suspicious (via a follow-up PR) — triggers ⚠️ badge on leaderboard. | These fields are optional in the schema for backward compatibility with older results. New benchmark runs populate all four automatically. 
diff --git a/README.md b/README.md index 88fbf406..ea9e2b65 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,27 @@ -# ⚡ AccelMark - -**Open benchmark leaderboard for AI accelerators on LLM workloads.** - -[![Live Leaderboard](https://img.shields.io/badge/leaderboard-live-brightgreen)](https://juhaoliang1997.github.io/AccelMark) -[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) -[![Contributions welcome](https://img.shields.io/badge/contributions-welcome-orange.svg)](CONTRIBUTING.md) - -[**→ Live Leaderboard**](https://juhaoliang1997.github.io/AccelMark) · [Contributing](CONTRIBUTING.md) · [Suites](suites/README.md) · [Development](DEVELOPMENT.md) +

+ + + AccelMark + +

+ +

+ Open benchmark leaderboard for AI accelerators on LLM workloads. +

+ +

+ Live Leaderboard + License: MIT + Contributions welcome +

+ +

+ → Live Leaderboard · + Contributing · + Suites · + Discussions · + Development +

--- @@ -36,11 +51,14 @@ cp configs/submitter.yaml.example configs/submitter.yaml # 3. Run the benchmark (~11 min on A100) python run.py --runner nvidia_vllm_47f5d58e --suite suite_A -# 4. Submit — open a GitHub Issue and paste your result.json -# https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md +# 4. Submit your result — open a pull request: +# git checkout -b submit/ +# cp results/your-result.json results/community//result.json +# git add results/ env_info.json && git commit -m "results: " +# gh pr create # or open via the GitHub web UI ``` -See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide. +See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide. If you'd rather skip the PR workflow, [open a submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md) instead and a bot will draft the PR for you. --- @@ -64,19 +82,26 @@ See [suites/README.md](suites/README.md) for full specs, time budgets, SLA defin ## Supported platforms -Reference runners live under `runners/` (see each folder’s `meta.json`). Checkmarks mark suites **implemented and runnable** with that runner in this repository. +Reference runners live under `runners/` (see each folder’s `meta.json`). The table below is **auto-generated** from each runner's `meta.json` — never hand-edited. Add a runner, declare its `suite_support` in `meta.json`, and the matrix updates on its own. 
+ | Hardware | Runner folder | Framework | A | B | C | D | E | F | G | -|----------|---------------|-----------|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +|---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | -| NVIDIA GPU | `nvidia_sglang_6da83845` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | -| AMD GPU | `amd_vllm_rocm_5355c2c6` | vLLM (ROCm) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | -| Huawei Ascend NPU | `ascend_vllm_ascend_605db33a` | vLLM-Ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | -| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu (JAX/XLA) | ✓ | — | — | ✓ | — | ✓ | — | +| AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | +| Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | +| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — | + +_Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._ + + +> Regenerate locally with `python tools/generate_platforms_matrix.py`. CI runs `--check` and fails the PR if the README and runner metadata disagree. Other stacks (TensorRT-LLM, MindIE, mlx-lm, etc.) can be added as new runner folders; see the contributor guide. -Adding a new platform? See [CONTRIBUTING.md#adding-support-for-a-new-platform](CONTRIBUTING.md#adding-support-for-a-new-platform). +Adding a new runner? See [CONTRIBUTING.md#adding-a-new-runner](CONTRIBUTING.md#adding-a-new-runner). Adding a new accelerator family? See [`runners/README.md`](runners/README.md#adding-a-new-accelerator-family). --- @@ -84,53 +109,10 @@ Adding a new platform? 
See [CONTRIBUTING.md#adding-support-for-a-new-platform](C | Tier | How | Where | |------|-----|-------| -| **community** | Submit via GitHub Issue, passes CI validation | Community tab | -| **verified** | Independently reproduced by maintainer within 5% | Main leaderboard | - -Community results are fully visible and comparable — they just haven't been independently reproduced yet. +| **community** | Submitted by anyone via PR (or issue → bot-drafted PR) and passes CI validation | Community tab | +| **verified** | Independently reproduced on the same hardware/runner and matches the original within 5% | Main leaderboard | ---- - -## Repository structure - -``` -AccelMark/ -├── suites/ # Suite definitions — see suites/README.md -├── runners/ # Platform benchmark runners -│ ├── benchmark_runner.py # Shared base class — all orchestration logic -│ ├── collect_env.py # Hardware/software detection → env_info.json -│ ├── validate_submission.py -│ ├── validate_runners.py -│ ├── protocol.py # RunnerProtocol interface (serve layer) -│ ├── template/ # Annotated starter template for new runners -│ └── nvidia_vllm_{hash8}/ # Example: NVIDIA vLLM runner -│ ├── runner.py -│ ├── meta.json -│ └── requirements.txt -├── loadgen/ # Shared request sending and timing logic -│ ├── loadgen.py # Core timing engine — do not modify per-platform -│ └── types.py # InferenceResult, SampleRecord -├── serve/ # OpenAI-compatible inference server -│ ├── server.py # FastAPI app — wraps any runner as an HTTP API -│ └── adapter.py # OpenAI request/response models -├── schema/ # JSON schemas, accuracy subset, cloud pricing -│ ├── result.schema.json -│ ├── accuracy_subset.jsonl # immutable -│ └── cloud_pricing.json -├── results/ # Benchmark results -│ ├── verified/ # Maintainer-reproduced results -│ └── community/ # Community-submitted results -├── leaderboard/ # Static leaderboard site (GitHub Pages) -│ ├── generate.py # Reads results/, writes leaderboard.js + api/ -│ └── site/ -│ ├── index.html -│ └── 
leaderboard.js # Auto-generated — do not edit manually -├── run.py # Unified entry point — benchmark and serve -├── CONTRIBUTING.md -├── DEVELOPMENT.md -└── configs/ # Local config — gitignored - └── submitter.yaml.example -``` +Community results are fully visible and comparable — they just haven't been independently reproduced yet. Anyone with the listed hardware can promote a community result to verified by submitting a reproduction PR. --- @@ -138,11 +120,15 @@ AccelMark/ The most valuable contribution is running the benchmark on hardware not yet in the leaderboard. -- **Submit a result** → [Community Submission guide](CONTRIBUTING.md) +- **Submit a result** → [Submitting a result](CONTRIBUTING.md#submitting-a-result) +- **Add a new runner** → [Adding a new runner](CONTRIBUTING.md#adding-a-new-runner) +- **Add a new accelerator family** → [Platform plug-in guide](runners/README.md#adding-a-new-accelerator-family) - **Report a bug** → [Open an issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=bug_report.md) -- **Add platform support** → [Platform guide](CONTRIBUTING.md#adding-support-for-a-new-platform) +- **Ask a question / share results** → [Discussions](https://github.com/JuhaoLiang1997/AccelMark/discussions) - **Extend the leaderboard** → [Development guide](DEVELOPMENT.md) +> _Optional:_ AccelMark also ships a small voice-driven launcher for the [OpenClaw](https://clawhub.ai) ecosystem — see [`openclaw_skill/`](openclaw_skill/README.md). It's not required to run, contribute, or submit results. 
+ --- ## Citation @@ -152,7 +138,7 @@ If you use AccelMark results in research, please cite: ```bibtex @misc{accelmark2026, title = {AccelMark: Open Benchmark Leaderboard for AI Accelerators on LLM Workloads}, - author = {Liang, Juhao}, + author = {Liang, Juhao and {The AccelMark Contributors}}, year = {2026}, url = {https://github.com/JuhaoLiang1997/AccelMark} } diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..67be232b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,70 @@ +# Security Policy + +## Scope + +AccelMark is a benchmarking framework. The "interesting" security surface +is correspondingly small, but two areas matter: + +1. **Code that runs locally on contributor / maintainer machines.** + The repository ships Python that reads model files, parses third-party + tool output (`nvidia-smi`, `rocm-smi`, `npu-smi`, `mthreads-gmi`, etc.), + reads YAML configuration, and runs inference frameworks (vLLM, SGLang, + mlx-lm, …) under their own dependency stacks. A malicious config, + meta.json, or runner.py landing in `main` could compromise anyone who + pulls and runs the repo. + +2. **Submitted results.** + `results/community/**` is community-contributed JSON. A malicious + `result.json` cannot execute code on its own, but it can poison the + leaderboard if the validator can be bypassed. Bugs in + `runners/validate_submission.py` that allow obviously-fake results to + merge are treated as security issues. + +Issues outside of those two surfaces (in particular: bugs that produce wrong +benchmark *numbers* without a reproducibility problem) are normal bugs and +should be reported via a regular GitHub issue. + +## Supported versions + +AccelMark is pre-1.0 and ships from `main`. The latest commit on `main` is +the only "supported" version; we backport fixes to release tags only after +1.0. 
+ +## Reporting a vulnerability + +**Please do not open a public GitHub issue for a security report.** + +Use GitHub's [private security advisory][advisory] form on this repository. +A maintainer will respond within **7 days** acknowledging the report and +providing an initial assessment. We aim to publish a fix and credit the +reporter within **30 days** of acknowledgement; if a fix is going to take +longer we will say so in the response. + +[advisory]: https://github.com/JuhaoLiang1997/AccelMark/security/advisories/new + +When reporting, please include: + +* The version (commit SHA on `main`, or release tag). +* A minimal reproduction — config files, the exact command, and the + observed behaviour. For supply-chain reports, the offending dependency + and version. +* Your assessment of the impact (e.g. "arbitrary file read at runner + startup", "validator accepts result with mismatched chip name", …). + +We do not currently run a paid bug bounty, but we are happy to credit +reporters in the release notes for the fix. + +## What is *not* a vulnerability + +For clarity, the following are explicitly out of scope: + +* **Results you disagree with.** Use the *Challenge a Result* GitHub + issue template; this is a leaderboard-policy matter, not a security one. +* **A runner that performs poorly on your hardware.** Open a regular issue + or PR. +* **Resource exhaustion when running a benchmark you started yourself.** + Benchmarks intentionally saturate the device; OOM and similar are + expected operating conditions. +* **Dependencies of a runner being slow / outdated.** The runner author + pins versions in `requirements.txt`; submit a PR for a new runner with + updated pins (immutability rule — see `runners/README.md`). 
diff --git a/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example b/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example deleted file mode 100644 index 352bf16a..00000000 --- a/configs/runner_configs/runner_amd_vllm_rocm_523da458.yaml.example +++ /dev/null @@ -1,31 +0,0 @@ -# AccelMark runner config — amd_vllm_rocm_d65c6686 (vLLM-ROCm on AMD) -# -# Copy this file to runner_amd_vllm_rocm_d65c6686.yaml (remove .example suffix) -# and edit as needed for your hardware. -# -# Merge priority: CLI flags > suite-specific > global defaults > runner defaults - -# ── Global defaults ──────────────────────────────────────────────────────────── - -# Tensor parallel size — number of GPUs to use (default: 1) -tensor_parallel_size: 1 - -# Disable HIP graph compilation (ROCm equivalent of enforce_eager). -enforce_eager: false - -# Maximum number of sequences in a batch (default: 512). -max_num_seqs: 512 - -# Fraction of GPU memory for the KV cache (default: 0.90). -gpu_memory_utilization: 0.90 - -# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs(). -# engine_kwargs: -# swap_space: 8 - -# ── Suite-specific overrides ─────────────────────────────────────────────────── - -suites: - suite_D: - max_num_seqs: 64 - gpu_memory_utilization: 0.85 diff --git a/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example b/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example deleted file mode 100644 index 6e1ac0ba..00000000 --- a/configs/runner_configs/runner_ascend_vllm_ascend_605db33a.yaml.example +++ /dev/null @@ -1,36 +0,0 @@ -# AccelMark runner config — ascend_vllm_ascend_605db33a (vllm-ascend on Ascend NPU) -# -# Copy this file to runner_ascend_vllm_ascend_605db33a.yaml (remove .example suffix) -# and edit as needed for your hardware. 
-# -# Merge priority: CLI flags > suite-specific > global defaults > runner defaults - -# ── Global defaults ──────────────────────────────────────────────────────────── - -# Tensor parallel size — number of NPUs to use (default: 1) -tensor_parallel_size: 1 - -# Disable graph compilation (default: false). -# Set to true if you encounter CANN graph compilation errors on your NPU model. -enforce_eager: false - -# Maximum number of sequences in a batch (default: 512). -# Reduce on lower-memory NPUs: 128 for 32 GB, 64 for 16 GB or less. -max_num_seqs: 512 - -# Fraction of NPU memory reserved for the KV cache (default: 0.90). -# Reduce if you get NPU OOM errors: try 0.85 for tighter memory budgets. -# vllm-ascend exposes this as gpu_memory_utilization (same parameter name as vLLM). -gpu_memory_utilization: 0.90 - -# Pass-through kwargs forwarded directly to vllm-ascend LLM() / AsyncEngineArgs(). -# engine_kwargs: -# block_size: 16 - -# ── Suite-specific overrides ─────────────────────────────────────────────────── - -suites: - suite_D: - # Long-context suite — reduce batch size and reserve more NPU memory. - max_num_seqs: 64 - gpu_memory_utilization: 0.85 diff --git a/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example b/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example deleted file mode 100644 index cd1fbcf9..00000000 --- a/configs/runner_configs/runner_nvidia_sglang_9f42fabb.yaml.example +++ /dev/null @@ -1,29 +0,0 @@ -# AccelMark runner config — nvidia_sglang_df27d2dd (SGLang on NVIDIA) -# -# Copy this file to runner_nvidia_sglang_df27d2dd.yaml (remove .example suffix) -# and edit as needed for your hardware. -# -# Merge priority: CLI flags > suite-specific > global defaults > runner defaults - -# ── Global defaults ──────────────────────────────────────────────────────────── - -# Tensor parallel size — number of GPUs to use (default: 1) -tensor_parallel_size: 1 - -# Disable CUDA graph. Equivalent to vLLM's enforce_eager. 
-# Set to true if you encounter CUDA graph errors on older hardware. -disable_cuda_graph: false - -# Fraction of GPU memory for the KV cache (SGLang: mem_fraction_static, default: 0.88) -mem_fraction_static: 0.88 - -# Pass-through kwargs forwarded directly to sglang.Engine() / sglang.AsyncEngine(). -# See SGLang docs for valid keys. -# engine_kwargs: -# chunked_prefill_size: 512 - -# ── Suite-specific overrides ─────────────────────────────────────────────────── - -suites: - suite_D: - mem_fraction_static: 0.80 diff --git a/docs/assets/logo-wordmark-dark.svg b/docs/assets/logo-wordmark-dark.svg new file mode 100644 index 00000000..ddc810da --- /dev/null +++ b/docs/assets/logo-wordmark-dark.svg @@ -0,0 +1,23 @@ + + AccelMark + AccelMark wordmark (dark theme): a lightning bolt over a speedometer arc, next to the project name. + + + + + + + + + + + + + + + + + + + AccelMark + diff --git a/docs/assets/logo-wordmark.svg b/docs/assets/logo-wordmark.svg new file mode 100644 index 00000000..985b1690 --- /dev/null +++ b/docs/assets/logo-wordmark.svg @@ -0,0 +1,23 @@ + + AccelMark + AccelMark wordmark (light theme): a lightning bolt over a speedometer arc, next to the project name. + + + + + + + + + + + + + + + + + + + AccelMark + diff --git a/docs/assets/logo.svg b/docs/assets/logo.svg new file mode 100644 index 00000000..bb6d81eb --- /dev/null +++ b/docs/assets/logo.svg @@ -0,0 +1,18 @@ + + AccelMark + Lightning bolt over a speedometer arc - the AccelMark mark for accelerator benchmarking. + + + + + + + + + + + + + + + diff --git a/leaderboard/site/favicon.svg b/leaderboard/site/favicon.svg new file mode 100644 index 00000000..bb6d81eb --- /dev/null +++ b/leaderboard/site/favicon.svg @@ -0,0 +1,18 @@ + + AccelMark + Lightning bolt over a speedometer arc - the AccelMark mark for accelerator benchmarking. 
+ + + + + + + + + + + + + + + diff --git a/leaderboard/site/index.html b/leaderboard/site/index.html index caea8541..a890e5e2 100644 --- a/leaderboard/site/index.html +++ b/leaderboard/site/index.html @@ -3,6 +3,7 @@ + AccelMark — AI Accelerator Leaderboard