Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions .github/benchmark/oot_benchmark_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,25 @@
"excluded_input_output_pairs": ["1024x8192"]
},
{
"display": "Qwen3-Next-80B-A3B-Instruct-FP8 TP8",
"dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8",
"display": "Qwen3-Next-80B-A3B-Instruct-FP8 TP1",
"dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-tp1",
Comment thread
zejunchen-zejun marked this conversation as resolved.
"source_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
"path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
"prefix": "qwen3-next-80b-a3b-instruct-fp8",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
"prefix": "qwen3-next-80b-a3b-instruct-fp8-tp1",
"extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 16384 --max-model-len 16384",
"bench_args": "",
"runner": "atom-mi355-8gpu.predownload",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0"
},
{
"display": "Qwen3-Next-80B-A3B-Instruct-FP8 TP4",
"dashboard_model": "Qwen3-Next-80B-A3B-Instruct-FP8-tp4",
"source_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
"path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
"prefix": "qwen3-next-80b-a3b-instruct-fp8-tp4",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
"bench_args": "",
"runner": "atom-mi355-8gpu.predownload",
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1"
"env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0"
}
]
101 changes: 101 additions & 0 deletions .github/benchmark/oot_models_accuracy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
[
{
"model_name": "Qwen3-235B-A22B-Instruct-2507-FP8 TP8+EP8",
"model_path": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8",
"extraArgs": "--tensor-parallel-size 8 --enable-expert-parallel",
"env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.87,
"accuracy_baseline": 0.87,
"accuracy_baseline_model": "Qwen/Qwen3-235B-A22B-Instruct-2507"
},
{
"model_name": "Qwen3-Next-80B-A3B-Instruct-FP8 TP4",
"model_path": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
"extraArgs": "--tensor-parallel-size 4",
"env_vars": "ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": 0.83,
"accuracy_baseline_model": "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
},
{
"model_name": "Qwen3.5-397B-A17B-FP8 TP8",
"model_path": "Qwen/Qwen3.5-397B-A17B-FP8",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": 0.83,
"accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B-FP8"
},
{
"model_name": "Qwen3.5-397B-A17B TP8",
"model_path": "Qwen/Qwen3.5-397B-A17B",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "ATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.83,
"accuracy_baseline": 0.83,
"accuracy_baseline_model": "Qwen/Qwen3.5-397B-A17B"
},
{
"model_name": "Kimi-K2-Thinking-MXFP4 TP8",
"model_path": "amd/Kimi-K2-Thinking-MXFP4",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.9,
"accuracy_baseline": 0.9,
"accuracy_baseline_model": "amd/Kimi-K2-Thinking-MXFP4"
},
{
"model_name": "DeepSeek-R1-FP8 TP8",
"model_path": "deepseek-ai/DeepSeek-R1-0528",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": 0.9553,
"accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528"
},
{
"model_name": "DeepSeek-R1-0528-MXFP4 TP8",
"model_path": "amd/DeepSeek-R1-0528-MXFP4",
"extraArgs": "--tensor-parallel-size 8",
"env_vars": "",
"runner": "linux-atom-mi35x-8",
"test_level": "nightly",
"accuracy_threshold": 0.93,
"accuracy_baseline": 0.93,
"accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528"
},
{
"model_name": "gpt-oss-120b TP1",
"model_path": "openai/gpt-oss-120b",
"extraArgs": "--tensor-parallel-size 1",
"env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1",
"runner": "linux-atom-mi35x-1",
"test_level": "nightly",
"accuracy_threshold": 0.38,
"accuracy_baseline": 0.38,
"accuracy_baseline_model": "openai/gpt-oss-120b"
},
{
"model_name": "gpt-oss-120b TP2",
"model_path": "openai/gpt-oss-120b",
"extraArgs": "--tensor-parallel-size 2",
"env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1",
"runner": "linux-atom-mi35x-4",
"test_level": "nightly",
"accuracy_threshold": 0.38,
"accuracy_baseline": 0.38,
"accuracy_baseline_model": "openai/gpt-oss-120b"
}
]
27 changes: 19 additions & 8 deletions .github/dashboard/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,8 @@ <h1>Benchmark Dashboard</h1>
/* ================================================================
1. CONSTANTS & COLOR PALETTE
================================================================ */
// HSL-based palette: each model gets a distinct hue, bar alpha varies by concurrency
// HSL-based palette: each model gets a distinct base hue; backend offsets keep
// ATOM and ATOM-vLLM visually related but not identical.
const MODEL_HUES = {
'DeepSeek-R1-0528': 210, // blue
'DeepSeek-R1-0528-mtp3': 175, // cyan/teal — distinct from base DeepSeek
Expand All @@ -317,16 +318,26 @@ <h1>Benchmark Dashboard</h1>
'Meta-Llama-3-8B-Instruct': 270, // purple
'Qwen3-235B-A22B-Instruct-2507-FP8': 50, // gold
};
const BACKEND_HUE_OFFSETS = {
'ATOM': 0,
'ATOM-vLLM': 28,
};
const FALLBACK_HUES = [45, 330, 190, 30]; // yellow, pink, teal, amber
let fallbackHueIdx = 0;
function getModelHue(backend, model) {
const backendModelKey = backend ? `${backend}::${model}` : model;
const normalizedBackend = backend === 'OOT' ? 'ATOM-vLLM' : backend;
const backendModelKey = normalizedBackend ? `${normalizedBackend}::${model}` : model;
if (MODEL_HUES[backendModelKey] != null) return MODEL_HUES[backendModelKey];
if (MODEL_HUES[model] != null) return MODEL_HUES[model];
const h = FALLBACK_HUES[fallbackHueIdx % FALLBACK_HUES.length];
MODEL_HUES[backendModelKey] = h;
fallbackHueIdx++;
return h;
let baseHue = MODEL_HUES[model];
if (baseHue == null) {
baseHue = FALLBACK_HUES[fallbackHueIdx % FALLBACK_HUES.length];
MODEL_HUES[model] = baseHue;
fallbackHueIdx++;
}
const offset = BACKEND_HUE_OFFSETS[normalizedBackend] ?? 0;
const derivedHue = normalizedBackend ? (baseHue + offset) % 360 : baseHue;
if (normalizedBackend) MODEL_HUES[backendModelKey] = derivedHue;
return derivedHue;
}
function getModelColor(backend, model) {
const h = getModelHue(backend, model);
Expand Down Expand Up @@ -2123,7 +2134,7 @@ <h3 style="color:var(--text-primary);margin-bottom:var(--gap-md)">Model Accuracy
<div class="table-watermark">
<table class="perf-table">
<thead><tr>
<th>Mode</th><th>Model</th>
<th>Backend</th><th>Model</th>
<th class="num">flexible-extract</th><th class="num">strict-match</th>
<th class="num">Baseline</th><th class="num">Recovery</th>
<th class="num">Threshold</th><th class="num">Margin</th><th>Status</th>
Expand Down
26 changes: 7 additions & 19 deletions .github/scripts/atom_oot_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ set -euo pipefail
# accuracy - run gsm8k accuracy test and save result JSON
#
# MODE:
# ci - DeepSeek-R1 FP8, gpt-oss-120b, Kimi-K2 TP4, Qwen3.5-35B-A3B-FP8
# full - all OOT-supported models
# ci - workflow-provided OOT CI model entry
# full - workflow-provided OOT full-validation model entry
#
# Optional model_name can be used to run a single model in full mode.
# Optional model_name can be used to run a single model when a caller passes
# multiple explicit entries.
Comment thread
zejunchen-zejun marked this conversation as resolved.

TYPE=${1:-launch}
MODE=${2:-ci}
Expand Down Expand Up @@ -49,29 +50,16 @@ EXPLICIT_MODEL_PATH=${OOT_MODEL_PATH:-}
EXPLICIT_EXTRA_ARGS=${OOT_EXTRA_ARGS:-}
LAST_VLLM_LOG_LINE=0

# Model format: MODEL_NAME|MODEL_PATH|EXTRA_ARGS
# CI mode requires OOT_MODEL_NAME, OOT_MODEL_PATH (and optionally OOT_EXTRA_ARGS)
# to be set via the workflow matrix. Full mode uses the built-in list below.
FULL_MODE_MODELS=(
"Qwen3 MoE|Qwen/Qwen3-235B-A22B-Instruct-2507-FP8|--tensor-parallel-size 8 --enable-expert-parallel"
"DeepSeek-R1 FP8|deepseek-ai/DeepSeek-R1-0528|--tensor-parallel-size 8"
"DeepSeek-R1 MXFP4|amd/DeepSeek-R1-0528-MXFP4|--tensor-parallel-size 8"
"GPT-OSS|openai/gpt-oss-120b|--tensor-parallel-size 1"
"Kimi-K2|amd/Kimi-K2-Thinking-MXFP4|--tensor-parallel-size 4"
)

declare -a ACTIVE_MODELS=()
if [[ -n "${EXPLICIT_MODEL_NAME}" || -n "${EXPLICIT_MODEL_PATH}" || -n "${EXPLICIT_EXTRA_ARGS}" ]]; then
if [[ -z "${EXPLICIT_MODEL_NAME}" || -z "${EXPLICIT_MODEL_PATH}" ]]; then
echo "OOT_MODEL_NAME and OOT_MODEL_PATH must both be set when using explicit model overrides."
exit 2
fi
ACTIVE_MODELS=("${EXPLICIT_MODEL_NAME}|${EXPLICIT_MODEL_PATH}|${EXPLICIT_EXTRA_ARGS}")
elif [[ "$MODE" == "ci" ]]; then
echo "CI mode requires OOT_MODEL_NAME and OOT_MODEL_PATH env vars from the workflow matrix."
exit 2
else
ACTIVE_MODELS=("${FULL_MODE_MODELS[@]}")
echo "${MODE} mode requires OOT_MODEL_NAME and OOT_MODEL_PATH env vars from the workflow."
exit 2
fi

resolve_model_path() {
Expand Down Expand Up @@ -174,7 +162,7 @@ launch_one_model() {
if [[ -n "${OOT_ENV_VARS:-}" ]]; then
while IFS= read -r _env_line; do
[[ -n "${_env_line}" ]] && export "${_env_line}" && echo "Exported: ${_env_line}"
done <<< "$(echo -e "${OOT_ENV_VARS}")"
done <<< "$(printf '%b' "${OOT_ENV_VARS}")"
fi
rm -rf /root/.cache

Expand Down
17 changes: 12 additions & 5 deletions .github/workflows/atom-vllm-oot-benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@ on:
description: "Qwen3.5-397B-A17B-FP8 TP8"
type: boolean
default: false
qwen3-next-80b-a3b-instruct-fp8:
description: "Qwen3-Next-80B-A3B-Instruct-FP8 TP8"
qwen3-next-80b-a3b-instruct-fp8-tp1:
description: "Qwen3-Next-80B-A3B-Instruct-FP8 TP1"
type: boolean
default: false
qwen3-next-80b-a3b-instruct-fp8-tp4:
description: "Qwen3-Next-80B-A3B-Instruct-FP8 TP4"
type: boolean
default: false
oot_image:
Expand Down Expand Up @@ -213,7 +217,8 @@ jobs:
ENABLE_KIMI_K2_TP4: ${{ inputs.kimi-k2-thinking-mxfp4-tp4 }}
ENABLE_KIMI_K2_TP8: ${{ inputs.kimi-k2-thinking-mxfp4-tp8 }}
ENABLE_QWEN3_5_397B_A17B_FP8: ${{ inputs.qwen3-5-397b-a17b-fp8 }}
ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8: ${{ inputs.qwen3-next-80b-a3b-instruct-fp8 }}
ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8_TP1: ${{ inputs.qwen3-next-80b-a3b-instruct-fp8-tp1 }}
ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8_TP4: ${{ inputs.qwen3-next-80b-a3b-instruct-fp8-tp4 }}
run: |
MODELS_JSON="$(jq -c '
map(select(
Expand All @@ -223,7 +228,8 @@ jobs:
or (.prefix == "kimi-k2-thinking-mxfp4-tp4" and env.ENABLE_KIMI_K2_TP4 == "true")
or (.prefix == "kimi-k2-thinking-mxfp4-tp8" and env.ENABLE_KIMI_K2_TP8 == "true")
or (.prefix == "qwen3-5-397b-a17b-fp8" and env.ENABLE_QWEN3_5_397B_A17B_FP8 == "true")
or (.prefix == "qwen3-next-80b-a3b-instruct-fp8" and env.ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8 == "true")
or (.prefix == "qwen3-next-80b-a3b-instruct-fp8-tp1" and env.ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8_TP1 == "true")
or (.prefix == "qwen3-next-80b-a3b-instruct-fp8-tp4" and env.ENABLE_QWEN3_NEXT_80B_A3B_INSTRUCT_FP8_TP4 == "true")
))
' .github/benchmark/oot_benchmark_models.json)"
echo "models_json=${MODELS_JSON}" >> "$GITHUB_OUTPUT"
Expand Down Expand Up @@ -440,7 +446,8 @@ jobs:
kimi-k2-thinking-mxfp4-tp4) echo "enabled=${{ inputs.kimi-k2-thinking-mxfp4-tp4 }}" >> "$GITHUB_OUTPUT" ;;
kimi-k2-thinking-mxfp4-tp8) echo "enabled=${{ inputs.kimi-k2-thinking-mxfp4-tp8 }}" >> "$GITHUB_OUTPUT" ;;
qwen3-5-397b-a17b-fp8) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8 }}" >> "$GITHUB_OUTPUT" ;;
qwen3-next-80b-a3b-instruct-fp8) echo "enabled=${{ inputs.qwen3-next-80b-a3b-instruct-fp8 }}" >> "$GITHUB_OUTPUT" ;;
qwen3-next-80b-a3b-instruct-fp8-tp1) echo "enabled=${{ inputs.qwen3-next-80b-a3b-instruct-fp8-tp1 }}" >> "$GITHUB_OUTPUT" ;;
qwen3-next-80b-a3b-instruct-fp8-tp4) echo "enabled=${{ inputs.qwen3-next-80b-a3b-instruct-fp8-tp4 }}" >> "$GITHUB_OUTPUT" ;;
*) echo "enabled=true" >> "$GITHUB_OUTPUT" ;;
esac

Expand Down
Loading
Loading