From 934cae450b0c7294e07697321e7eb75ad6e4c833 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Tue, 24 Mar 2026 17:14:32 +0800
Subject: [PATCH 1/9] [plugin][oot benchmark] refine the OOT benchmark workflow

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json   |  10 +-
 .../workflows/atom-vllm-oot-benchmark.yaml    | 132 +++++++-----------
 2 files changed, 58 insertions(+), 84 deletions(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index e2a54a609..3808e5896 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -7,7 +7,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "DeepSeek-R1 MXFP4",
@@ -17,7 +17,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "gpt-oss-120b",
@@ -27,7 +27,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "linux-atom-mi355-1",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "Kimi-K2-Thinking-MXFP4 TP4",
@@ -38,7 +38,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "Kimi-K2-Thinking-MXFP4 TP8",
@@ -49,6 +49,6 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   }
 ]
diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index 321b5d1c4..5c844c07a 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -43,7 +43,7 @@ on:
           Example (single set): 1024,1024,128,0.8
           Example (multiple sets): 1024,1024,128,0.8;2048,1024,256,0.7"
         type: string
-        default: "1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8"
+        default: "1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;1024,8192,128,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8"
 
 env:
   BASE_IMAGE: ${{ inputs.base_image || 'rocm/atom-dev:latest' }}
@@ -62,7 +62,7 @@ jobs:
       - name: Parse parameter lists
         id: parse-param-lists
         run: |
-          PARAM_LISTS="${{ inputs.param_lists || '1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8' }}"
+          PARAM_LISTS="${{ inputs.param_lists || '1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;1024,8192,128,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8' }}"
           echo "Using param_lists: ${PARAM_LISTS}"
           printf 'param_lists=%s\n' "${PARAM_LISTS}" >> "$GITHUB_OUTPUT"
           IFS=';' read -ra SETS <<< "${PARAM_LISTS}"
@@ -165,7 +165,7 @@ jobs:
           docker push "${{ steps.meta.outputs.oot_image_tag }}"
 
   benchmark:
-    name: OOT ${{ matrix.model.display }}
+    name: OOT ${{ matrix.model.display }} ${{ matrix.params.input_length }}/${{ matrix.params.output_length }} c=${{ matrix.params.concurrency }}
     needs: [parse-param-lists, load-models, build-oot-image]
     if: >-
       always()
@@ -176,10 +176,10 @@ jobs:
       fail-fast: false
       matrix:
         model: ${{ fromJson(needs.load-models.outputs.models_json) }}
+        params: ${{ fromJson(needs.parse-param-lists.outputs.matrix_json) }}
     runs-on: ${{ matrix.model.runner }}
     timeout-minutes: 240
     env:
-      PARAM_LISTS: ${{ needs.parse-param-lists.outputs.param_lists }}
       MODEL_NAME: ${{ matrix.model.display }}
       DASHBOARD_MODEL_NAME: ${{ matrix.model.dashboard_model || '' }}
       MODEL_SOURCE_PATH: ${{ matrix.model.source_path || matrix.model.path }}
@@ -187,6 +187,11 @@ jobs:
       OOT_EXTRA_ARGS: ${{ matrix.model.extra_args }}
       BENCH_EXTRA_ARGS: ${{ matrix.model.bench_args }}
       RESULT_PREFIX: ${{ matrix.model.prefix }}
+      ISL: ${{ matrix.params.input_length }}
+      OSL: ${{ matrix.params.output_length }}
+      CONC: ${{ matrix.params.concurrency }}
+      RANDOM_RANGE_RATIO: ${{ matrix.params.random_range_ratio }}
+      RESULT_FILENAME: ${{ matrix.model.prefix }}-${{ matrix.params.input_length }}-${{ matrix.params.output_length }}-${{ matrix.params.concurrency }}-${{ matrix.params.random_range_ratio }}
       CONTAINER_NAME: atom_vllm_oot_benchmark_${{ strategy.job-index }}
       CONTAINER_RESULT_DIR: /tmp/oot-benchmark-results
       CONTAINER_BENCH_SERVING_DIR: /tmp/oot-benchmark/bench_serving
@@ -369,86 +374,55 @@ jobs:
             TRUST_REMOTE_CODE_ARG="--trust-remote-code"
           fi
 
-          FAIL_COUNT=0
-          shopt -s nullglob
-          rm -f "${RESULT_PREFIX}"-*.json
-          IFS=';' read -ra SETS <<< "${PARAM_LISTS}"
-          for SET in "${SETS[@]}"; do
-            IFS=',' read -ra PARAMS <<< "$SET"
-            ISL="${PARAMS[0]}"
-            OSL="${PARAMS[1]}"
-            CONC="${PARAMS[2]}"
-            RANDOM_RANGE_RATIO="${PARAMS[3]}"
-            RESULT_FILENAME="${RESULT_PREFIX}-${ISL}-${OSL}-${CONC}-${RANDOM_RANGE_RATIO}"
-
-            echo "=== Benchmark config: ${MODEL_NAME} ISL=${ISL} OSL=${OSL} CONC=${CONC} RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} ==="
-
-            if ! docker exec \
-              -e ISL="${ISL}" \
-              -e OSL="${OSL}" \
-              -e CONC="${CONC}" \
-              -e RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO}" \
-              -e RESULT_FILENAME="${RESULT_FILENAME}" \
-              -e BENCH_EXTRA_ARGS="${BENCH_EXTRA_ARGS}" \
-              "$CONTAINER_NAME" bash -lc "
-                set -euo pipefail
-                rm -rf \"${CONTAINER_RESULT_DIR}\"
-                mkdir -p \"${CONTAINER_RESULT_DIR}\"
-                PYTHONDONTWRITEBYTECODE=1 python \"${CONTAINER_BENCH_SERVING_DIR}/benchmark_serving.py\" \
-                  --model=\"${OOT_RESOLVED_MODEL_PATH:-$MODEL_PATH}\" \
-                  --backend=vllm \
-                  --base-url=http://127.0.0.1:8000 \
-                  --dataset-name=random \
-                  --random-input-len=\"${ISL}\" \
-                  --random-output-len=\"${OSL}\" \
-                  --random-range-ratio \"${RANDOM_RANGE_RATIO}\" \
-                  --num-prompts=\"$(( CONC * 10 ))\" \
-                  --max-concurrency=\"${CONC}\" \
-                  ${TRUST_REMOTE_CODE_ARG} \
-                  --num-warmups=\"$(( 2 * CONC ))\" \
-                  --request-rate=inf \
-                  --ignore-eos \
-                  --save-result \
-                  --percentile-metrics=\"ttft,tpot,itl,e2el\" \
-                  --result-dir=\"${CONTAINER_RESULT_DIR}\" \
-                  --result-filename=\"${RESULT_FILENAME}.json\" \
-                  ${BENCH_EXTRA_ARGS:-}
-              "; then
-              echo "::warning::Benchmark failed for ${MODEL_NAME} ISL=${ISL} OSL=${OSL} CONC=${CONC}"
-              FAIL_COUNT=$((FAIL_COUNT + 1))
-              continue
-            fi
+          echo "=== Benchmark config: ${MODEL_NAME} ISL=${ISL} OSL=${OSL} CONC=${CONC} RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO} ==="
 
-            if ! docker exec \
-              -e RESULT_PATH="${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" \
-              -e ISL="${ISL}" \
-              -e OSL="${OSL}" \
-              -e EXTRA_ARGS_TEXT="${OOT_EXTRA_ARGS}" \
-              -e DASHBOARD_MODEL_NAME="${DASHBOARD_MODEL_NAME}" \
-              "$CONTAINER_NAME" python3 -c "import json, os, re; result_path = os.environ['RESULT_PATH']; data = json.load(open(result_path, encoding='utf-8')); data['random_input_len'] = int(os.environ['ISL']); data['random_output_len'] = int(os.environ['OSL']); data['benchmark_backend'] = 'ATOM-vLLM'; display_name = os.environ.get('DASHBOARD_MODEL_NAME', ''); display_name and data.__setitem__('benchmark_model_name', display_name); tp_match = re.search(r'--tensor-parallel-size\\s+(\\d+)', os.environ.get('EXTRA_ARGS_TEXT', '')); tp_match and data.__setitem__('tensor_parallel_size', int(tp_match.group(1))); json.dump(data, open(result_path, 'w', encoding='utf-8'), indent=2)"; then
-              echo "::warning::Failed to post-process ${RESULT_FILENAME}.json"
-              FAIL_COUNT=$((FAIL_COUNT + 1))
-              continue
-            fi
+          docker exec \
+            -e ISL="${ISL}" \
+            -e OSL="${OSL}" \
+            -e CONC="${CONC}" \
+            -e RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO}" \
+            -e RESULT_FILENAME="${RESULT_FILENAME}" \
+            -e BENCH_EXTRA_ARGS="${BENCH_EXTRA_ARGS}" \
+            "$CONTAINER_NAME" bash -lc "
+              set -euo pipefail
+              rm -rf \"${CONTAINER_RESULT_DIR}\"
+              mkdir -p \"${CONTAINER_RESULT_DIR}\"
+              PYTHONDONTWRITEBYTECODE=1 python \"${CONTAINER_BENCH_SERVING_DIR}/benchmark_serving.py\" \
+                --model=\"${OOT_RESOLVED_MODEL_PATH:-$MODEL_PATH}\" \
+                --backend=vllm \
+                --base-url=http://127.0.0.1:8000 \
+                --dataset-name=random \
+                --random-input-len=\"${ISL}\" \
+                --random-output-len=\"${OSL}\" \
+                --random-range-ratio \"${RANDOM_RANGE_RATIO}\" \
+                --num-prompts=\"$(( CONC * 10 ))\" \
+                --max-concurrency=\"${CONC}\" \
+                ${TRUST_REMOTE_CODE_ARG} \
+                --num-warmups=\"$(( 2 * CONC ))\" \
+                --request-rate=inf \
+                --ignore-eos \
+                --save-result \
+                --percentile-metrics=\"ttft,tpot,itl,e2el\" \
+                --result-dir=\"${CONTAINER_RESULT_DIR}\" \
+                --result-filename=\"${RESULT_FILENAME}.json\" \
+                ${BENCH_EXTRA_ARGS:-}
+            "
 
-            if ! docker cp "${CONTAINER_NAME}:${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" "./${RESULT_FILENAME}.json"; then
-              echo "::warning::Failed to copy ${RESULT_FILENAME}.json from container"
-              FAIL_COUNT=$((FAIL_COUNT + 1))
-              continue
-            fi
-          done
+          docker exec \
+            -e RESULT_PATH="${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" \
+            -e ISL="${ISL}" \
+            -e OSL="${OSL}" \
+            -e EXTRA_ARGS_TEXT="${OOT_EXTRA_ARGS}" \
+            -e DASHBOARD_MODEL_NAME="${DASHBOARD_MODEL_NAME}" \
+            "$CONTAINER_NAME" python3 -c "import json, os, re; result_path = os.environ['RESULT_PATH']; data = json.load(open(result_path, encoding='utf-8')); data['random_input_len'] = int(os.environ['ISL']); data['random_output_len'] = int(os.environ['OSL']); data['benchmark_backend'] = 'ATOM-vLLM'; display_name = os.environ.get('DASHBOARD_MODEL_NAME', ''); display_name and data.__setitem__('benchmark_model_name', display_name); tp_match = re.search(r'--tensor-parallel-size\\s+(\\d+)', os.environ.get('EXTRA_ARGS_TEXT', '')); tp_match and data.__setitem__('tensor_parallel_size', int(tp_match.group(1))); json.dump(data, open(result_path, 'w', encoding='utf-8'), indent=2)"
 
-          if [ "${FAIL_COUNT}" -gt 0 ]; then
-            echo "::warning::${FAIL_COUNT} benchmark config(s) failed for ${MODEL_NAME}"
-          fi
+          docker cp "${CONTAINER_NAME}:${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" "./${RESULT_FILENAME}.json"
 
       - name: Collect benchmark result
         if: steps.check.outputs.enabled == 'true'
         run: |
-          shopt -s nullglob
-          results=( "${RESULT_PREFIX}"-*.json )
-          if [ "${#results[@]}" -eq 0 ]; then
-            echo "ERROR: No benchmark result files were generated for ${MODEL_NAME}."
+          if [ ! -f "${RESULT_FILENAME}.json" ]; then
+            echo "ERROR: Benchmark result file ${RESULT_FILENAME}.json was not generated for ${MODEL_NAME}."
             exit 1
           fi
 
@@ -456,8 +430,8 @@ jobs:
         if: steps.check.outputs.enabled == 'true'
         uses: actions/upload-artifact@v7
         with:
-          name: oot-benchmark-${{ env.RESULT_PREFIX }}
-          path: ${{ env.RESULT_PREFIX }}-*.json
+          name: oot-benchmark-${{ env.RESULT_FILENAME }}
+          path: ${{ env.RESULT_FILENAME }}.json
 
       - name: Clean up OOT benchmark container
         if: always() && steps.check.outputs.enabled == 'true'

From 27fd782621e8aa22ca202237e5ea5d71d935d044 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Wed, 25 Mar 2026 17:19:12 +0800
Subject: [PATCH 2/9] add Qwen3.5 model; change to manual trigger; align env
 and arguments; default choice boxes to false

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json   |  20 ++-
 .../workflows/atom-vllm-oot-benchmark.yaml    | 137 +++++-------------
 2 files changed, 50 insertions(+), 107 deletions(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index 3808e5896..9a0e3fe9b 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -7,7 +7,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
     "display": "DeepSeek-R1 MXFP4",
@@ -17,7 +17,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
     "display": "gpt-oss-120b",
@@ -27,7 +27,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "linux-atom-mi355-1",
-    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "Kimi-K2-Thinking-MXFP4 TP4",
@@ -38,7 +38,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
     "display": "Kimi-K2-Thinking-MXFP4 TP8",
@@ -49,6 +49,16 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "VLLM_ROCM_USE_AITER=1\nAITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
+  },
+  {
+    "display": "Qwen3.5-397B-A17B-FP8",
+    "source_path": "Qwen/Qwen3.5-397B-A17B-FP8",
+    "path": "/models/Qwen3.5-397B-A17B-FP8",
+    "prefix": "qwen3-5-397b-a17b-fp8",
+    "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
+    "bench_args": "",
+    "runner": "atom-mi355-8gpu.predownload",
+    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_DISABLE_VLLM_PLUGIN_ATTENTION=1\nATOM_USE_CUSTOM_ALL_GATHER=0"
   }
 ]
diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index 5c844c07a..cb137c209 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -10,46 +10,40 @@ on:
       deepseek-r1-fp8:
         description: "Benchmark DeepSeek-R1 FP8"
         type: boolean
-        default: true
+        default: false
       deepseek-r1-mxfp4:
         description: "Benchmark DeepSeek-R1 MXFP4"
         type: boolean
-        default: true
+        default: false
       gpt-oss-120b:
         description: "Benchmark gpt-oss-120b"
         type: boolean
-        default: true
-      kimi-k2-thinking-mxfp4:
-        description: "Benchmark Kimi-K2-Thinking-MXFP4 (TP4 and TP8)"
+        default: false
+      kimi-k2-thinking-mxfp4-tp4:
+        description: "Benchmark Kimi-K2-Thinking-MXFP4 TP4"
         type: boolean
-        default: true
-      base_image:
-        description: "ATOM base image used to build the OOT benchmark image"
-        type: string
-        default: "rocm/atom-dev:latest"
-      vllm_commit:
-        description: "vLLM commit used by the OOT benchmark image"
-        type: string
-        default: "b31e9326a7d9394aab8c767f8ebe225c65594b60"
-      vllm_version:
-        description: "vLLM version label used in the benchmark image tag"
+        default: false
+      kimi-k2-thinking-mxfp4-tp8:
+        description: "Benchmark Kimi-K2-Thinking-MXFP4 TP8"
+        type: boolean
+        default: false
+      qwen3-5-397b-a17b-fp8:
+        description: "Benchmark Qwen3.5-397B-A17B-FP8"
+        type: boolean
+        default: false
+      oot_image:
+        description: "OOT benchmark image to pull directly"
         type: string
-        default: "0.17"
+        default: "rocm/atom-dev:vllm-latest"
       param_lists:
         description: |
           "Benchmark parameter lists.
           Input as a single or multiple sets (comma-separated, semicolon between sets),
           format: input_length,output_length,concurrency,random_range_ratio.
-          Example (single set): 1024,1024,128,0.8
-          Example (multiple sets): 1024,1024,128,0.8;2048,1024,256,0.7"
+          Example (single set): 1024,1024,64,0.8
+          Example (multiple sets): 1024,1024,64,0.8;2048,1024,32,0.7"
         type: string
-        default: "1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;1024,8192,128,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8"
-
-env:
-  BASE_IMAGE: ${{ inputs.base_image || 'rocm/atom-dev:latest' }}
-  GITHUB_REPO_URL: https://github.com/ROCm/ATOM.git
-  GITHUB_COMMIT_SHA: ${{ github.sha }}
-  VALIDATION_IMAGE_REPO: rocm/atom-dev
+        default: "1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8"
 
 jobs:
   parse-param-lists:
@@ -62,7 +56,7 @@ jobs:
       - name: Parse parameter lists
         id: parse-param-lists
         run: |
-          PARAM_LISTS="${{ inputs.param_lists || '1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,1024,128,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;1024,8192,128,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8;8192,1024,128,0.8' }}"
+          PARAM_LISTS="${{ inputs.param_lists || '1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8' }}"
           echo "Using param_lists: ${PARAM_LISTS}"
           printf 'param_lists=%s\n' "${PARAM_LISTS}" >> "$GITHUB_OUTPUT"
           IFS=';' read -ra SETS <<< "${PARAM_LISTS}"
@@ -74,6 +68,13 @@ jobs:
             OUTPUT_LEN="${PARAMS[1]}"
             CONCURRENCY="${PARAMS[2]}"
             RANDOM_RANGE_RATIO="${PARAMS[3]}"
+            case "${CONCURRENCY}" in
+              4|8|16|32|64) ;;
+              *)
+                echo "Unsupported concurrency: ${CONCURRENCY}. Allowed values: 4,8,16,32,64"
+                exit 1
+                ;;
+            esac
             MATRIX_JSON="${MATRIX_JSON}${SEP}{\"input_length\":${INPUT_LEN},\"output_length\":${OUTPUT_LEN},\"concurrency\":${CONCURRENCY},\"random_range_ratio\":${RANDOM_RANGE_RATIO}}"
             SEP=","
           done
@@ -94,84 +95,13 @@ jobs:
       - id: load
         run: echo "models_json=$(jq -c . .github/benchmark/oot_benchmark_models.json)" >> "$GITHUB_OUTPUT"
 
-  build-oot-image:
-    name: Build OOT benchmark image
-    runs-on: build-only-atom
-    outputs:
-      oot_image_tag: ${{ steps.meta.outputs.oot_image_tag }}
-    steps:
-      - name: Checkout ATOM repo
-        uses: actions/checkout@v6
-
-      - name: Docker Login
-        run: |
-          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin
-
-      - name: Generate OOT base Dockerfile
-        run: |
-          cat <<EOF > Dockerfile.mod
-          FROM ${{ env.BASE_IMAGE }}
-          RUN pip install -U lm-eval[api]
-          RUN pip show lm-eval || true
-          RUN pip install hf_transfer
-          RUN pip show hf_transfer || true
-          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
-          RUN pip uninstall -y amd-aiter
-          RUN pip install --upgrade "pybind11>=3.0.1"
-          RUN pip show pybind11
-          RUN rm -rf /app/aiter-test
-          RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
-              cd /app/aiter-test && \\
-              git checkout HEAD && \\
-              git submodule sync && git submodule update --init --recursive && \\
-              MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
-          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
-          RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
-          RUN pip uninstall -y atom
-          RUN rm -rf /app/ATOM
-          RUN git clone ${{ env.GITHUB_REPO_URL }} /app/ATOM && \\
-              cd /app/ATOM && \\
-              git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
-              pip install -e .
-          RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
-          EOF
-
-      - name: Build OOT base image
-        run: |
-          docker build --pull --network=host \
-            --no-cache \
-            -t atom_oot_base:benchmark \
-            -f Dockerfile.mod .
-
-      - name: Build OOT vLLM image from current commit
-        id: meta
-        run: |
-          SHORT_SHA="${GITHUB_COMMIT_SHA::12}"
-          OOT_IMAGE_TAG="${VALIDATION_IMAGE_REPO}:oot-vllm-benchmark-v${{ inputs.vllm_version || '0.17' }}-${SHORT_SHA}-${{ github.run_id }}"
-          docker build --network=host \
-            --no-cache \
-            -t "${OOT_IMAGE_TAG}" \
-            --target atom_oot \
-            --build-arg OOT_BASE_IMAGE="atom_oot_base:benchmark" \
-            --build-arg MAX_JOBS=64 \
-            --build-arg VLLM_COMMIT="${{ inputs.vllm_commit || 'b31e9326a7d9394aab8c767f8ebe225c65594b60' }}" \
-            --build-arg INSTALL_LM_EVAL=1 \
-            --build-arg INSTALL_FASTSAFETENSORS=1 \
-            -f docker/Dockerfile .
-          echo "oot_image_tag=${OOT_IMAGE_TAG}" >> "$GITHUB_OUTPUT"
-
-      - name: Push OOT benchmark image
-        run: |
-          docker push "${{ steps.meta.outputs.oot_image_tag }}"
-
   benchmark:
     name: OOT ${{ matrix.model.display }} ${{ matrix.params.input_length }}/${{ matrix.params.output_length }} c=${{ matrix.params.concurrency }}
-    needs: [parse-param-lists, load-models, build-oot-image]
+    needs: [parse-param-lists, load-models]
     if: >-
       always()
       && needs.parse-param-lists.result == 'success'
       && needs.load-models.result == 'success'
-      && needs.build-oot-image.result == 'success'
     strategy:
       fail-fast: false
       matrix:
@@ -195,7 +125,7 @@ jobs:
       CONTAINER_NAME: atom_vllm_oot_benchmark_${{ strategy.job-index }}
       CONTAINER_RESULT_DIR: /tmp/oot-benchmark-results
       CONTAINER_BENCH_SERVING_DIR: /tmp/oot-benchmark/bench_serving
-      OOT_IMAGE_TAG: ${{ needs.build-oot-image.outputs.oot_image_tag }}
+      OOT_IMAGE_TAG: ${{ inputs.oot_image || 'rocm/atom-dev:vllm-latest' }}
       BENCH_SERVING_REPO_URL: https://github.com/kimbochen/bench_serving.git
     steps:
       - name: Check if model is enabled
@@ -205,7 +135,9 @@ jobs:
             deepseek-r1-fp8) echo "enabled=${{ inputs.deepseek-r1-fp8 }}" >> "$GITHUB_OUTPUT" ;;
             deepseek-r1-mxfp4) echo "enabled=${{ inputs.deepseek-r1-mxfp4 }}" >> "$GITHUB_OUTPUT" ;;
             gpt-oss-120b) echo "enabled=${{ inputs.gpt-oss-120b }}" >> "$GITHUB_OUTPUT" ;;
-            kimi-k2-thinking-mxfp4-tp4|kimi-k2-thinking-mxfp4-tp8) echo "enabled=${{ inputs.kimi-k2-thinking-mxfp4 }}" >> "$GITHUB_OUTPUT" ;;
+            kimi-k2-thinking-mxfp4-tp4) echo "enabled=${{ inputs.kimi-k2-thinking-mxfp4-tp4 }}" >> "$GITHUB_OUTPUT" ;;
+            kimi-k2-thinking-mxfp4-tp8) echo "enabled=${{ inputs.kimi-k2-thinking-mxfp4-tp8 }}" >> "$GITHUB_OUTPUT" ;;
+            qwen3-5-397b-a17b-fp8) echo "enabled=${{ inputs.qwen3-5-397b-a17b-fp8 }}" >> "$GITHUB_OUTPUT" ;;
             *) echo "enabled=true" >> "$GITHUB_OUTPUT" ;;
           esac
 
@@ -233,9 +165,10 @@ jobs:
         if: steps.check.outputs.enabled == 'true'
         run: echo "HF_TOKEN=${HF_TOKEN:-${{ secrets.AMD_HF_TOKEN }}}" >> "$GITHUB_ENV"
 
-      - name: Pull built OOT image
+      - name: Pull OOT benchmark image
         if: steps.check.outputs.enabled == 'true'
         run: |
+          echo "Pulling OOT benchmark image: ${OOT_IMAGE_TAG}"
           docker pull "${OOT_IMAGE_TAG}"
 
       - name: Prepare model cache mount

From 5f9cf75fed9acea65004e4d44b1bd7814fa3d655 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Wed, 25 Mar 2026 17:29:28 +0800
Subject: [PATCH 3/9] set 4 GPU machine for Kimi-K2 TP4

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index 9a0e3fe9b..da388c2ed 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -37,7 +37,7 @@
     "prefix": "kimi-k2-thinking-mxfp4-tp4",
     "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
-    "runner": "atom-mi355-8gpu.predownload",
+    "runner": "linux-atom-mi355-4",
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {

From f236d2c35984f1221c3f0e2e0ef5794614c723ea Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Wed, 25 Mar 2026 17:40:47 +0800
Subject: [PATCH 4/9] if the model has not been chosen, the gpu runner will not
 be dispatched

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .../workflows/atom-vllm-oot-benchmark.yaml    | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index cb137c209..a14645b8d 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -90,10 +90,36 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       models_json: ${{ steps.load.outputs.models_json }}
+      has_enabled_models: ${{ steps.load.outputs.has_enabled_models }}
     steps:
       - uses: actions/checkout@v6
       - id: load
-        run: echo "models_json=$(jq -c . .github/benchmark/oot_benchmark_models.json)" >> "$GITHUB_OUTPUT"
+        env:
+          ENABLE_DEEPSEEK_R1_FP8: ${{ inputs.deepseek-r1-fp8 }}
+          ENABLE_DEEPSEEK_R1_MXFP4: ${{ inputs.deepseek-r1-mxfp4 }}
+          ENABLE_GPT_OSS_120B: ${{ inputs.gpt-oss-120b }}
+          ENABLE_KIMI_K2_TP4: ${{ inputs.kimi-k2-thinking-mxfp4-tp4 }}
+          ENABLE_KIMI_K2_TP8: ${{ inputs.kimi-k2-thinking-mxfp4-tp8 }}
+          ENABLE_QWEN3_5_397B_A17B_FP8: ${{ inputs.qwen3-5-397b-a17b-fp8 }}
+        run: |
+          MODELS_JSON="$(jq -c '
+            map(select(
+              (.prefix == "deepseek-r1-fp8" and env.ENABLE_DEEPSEEK_R1_FP8 == "true")
+              or (.prefix == "deepseek-r1-mxfp4" and env.ENABLE_DEEPSEEK_R1_MXFP4 == "true")
+              or (.prefix == "gpt-oss-120b" and env.ENABLE_GPT_OSS_120B == "true")
+              or (.prefix == "kimi-k2-thinking-mxfp4-tp4" and env.ENABLE_KIMI_K2_TP4 == "true")
+              or (.prefix == "kimi-k2-thinking-mxfp4-tp8" and env.ENABLE_KIMI_K2_TP8 == "true")
+              or (.prefix == "qwen3-5-397b-a17b-fp8" and env.ENABLE_QWEN3_5_397B_A17B_FP8 == "true")
+            ))
+          ' .github/benchmark/oot_benchmark_models.json)"
+          echo "models_json=${MODELS_JSON}" >> "$GITHUB_OUTPUT"
+          if [ "${MODELS_JSON}" = "[]" ]; then
+            echo "has_enabled_models=false" >> "$GITHUB_OUTPUT"
+            echo "No models selected for OOT benchmark."
+          else
+            echo "has_enabled_models=true" >> "$GITHUB_OUTPUT"
+            echo "Selected models: ${MODELS_JSON}"
+          fi
 
   benchmark:
     name: OOT ${{ matrix.model.display }} ${{ matrix.params.input_length }}/${{ matrix.params.output_length }} c=${{ matrix.params.concurrency }}
@@ -102,6 +128,7 @@ jobs:
       always()
       && needs.parse-param-lists.result == 'success'
       && needs.load-models.result == 'success'
+      && needs.load-models.outputs.has_enabled_models == 'true'
     strategy:
       fail-fast: false
       matrix:

From 62437ef37bbd70b6b196fa334efdf50945c940e0 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Wed, 25 Mar 2026 17:56:40 +0800
Subject: [PATCH 5/9] run gpt-oss-120b and Kimi-K2 TP4 benchmarks on the atom-mi355-8gpu.predownload runner

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index da388c2ed..c7ee28daf 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -26,7 +26,7 @@
     "prefix": "gpt-oss-120b",
     "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
-    "runner": "linux-atom-mi355-1",
+    "runner": "atom-mi355-8gpu.predownload",
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
@@ -37,7 +37,7 @@
     "prefix": "kimi-k2-thinking-mxfp4-tp4",
     "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
-    "runner": "linux-atom-mi355-4",
+    "runner": "atom-mi355-8gpu.predownload",
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {

From f8c77fb270b23867bc7452afc45aa48313eca583 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Wed, 25 Mar 2026 17:58:40 +0800
Subject: [PATCH 6/9] remove redundant env flag for gptoss

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index c7ee28daf..a2781102c 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -27,7 +27,7 @@
     "extra_args": "--trust-remote-code --tensor-parallel-size 1 --max-num-batched-tokens 16384 --max-model-len 16384",
     "bench_args": "",
     "runner": "atom-mi355-8gpu.predownload",
-    "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
+    "env_vars": "ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1"
   },
   {
     "display": "Kimi-K2-Thinking-MXFP4 TP4",

From 7a11c93c806380b587636a9585075afc82459e5a Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Thu, 26 Mar 2026 11:50:00 +0800
Subject: [PATCH 7/9] add specific branch trigger OOT benchmark for acceptance
 test when upgrading vLLM

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/scripts/oot_benchmark_to_dashboard.py |  12 +
 .../workflows/atom-vllm-oot-benchmark.yaml    | 361 +++++++++++++++++-
 2 files changed, 353 insertions(+), 20 deletions(-)

diff --git a/.github/scripts/oot_benchmark_to_dashboard.py b/.github/scripts/oot_benchmark_to_dashboard.py
index ba5c009ba..5e5f92fab 100644
--- a/.github/scripts/oot_benchmark_to_dashboard.py
+++ b/.github/scripts/oot_benchmark_to_dashboard.py
@@ -50,6 +50,15 @@ def append_metric(
     entries.append(entry)
 
 
+def is_dashboard_publish_allowed(payload: dict) -> bool:
+    publish_flag = payload.get("dashboard_publish_allowed")
+    if publish_flag is None:
+        return True
+    if isinstance(publish_flag, bool):
+        return publish_flag
+    return str(publish_flag).strip().lower() not in {"0", "false", "no"}
+
+
 def build_entries(result_dir: Path, run_url: str | None) -> list[dict]:
     entries: list[dict] = []
 
@@ -62,6 +71,9 @@ def build_entries(result_dir: Path, run_url: str | None) -> list[dict]:
         except (OSError, UnicodeDecodeError, json.JSONDecodeError):
             continue
 
+        if not is_dashboard_publish_allowed(payload):
+            continue
+
         if "output_throughput" not in payload:
             continue
 
diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index a14645b8d..0ba8beb9d 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -1,7 +1,7 @@
 name: ATOM vLLM OOT Benchmark
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.repository }}-${{ github.ref_name }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
 on:
@@ -32,7 +32,7 @@ on:
         type: boolean
         default: false
       oot_image:
-        description: "OOT benchmark image to pull directly"
+        description: "Prebuilt OOT benchmark image to pull when no custom rebuild is needed"
         type: string
         default: "rocm/atom-dev:vllm-latest"
       param_lists:
@@ -46,6 +46,111 @@ on:
         default: "1024,1024,4,0.8;1024,1024,8,0.8;1024,1024,16,0.8;1024,1024,32,0.8;1024,1024,64,0.8;1024,8192,4,0.8;1024,8192,8,0.8;1024,8192,16,0.8;1024,8192,32,0.8;1024,8192,64,0.8;8192,1024,4,0.8;8192,1024,8,0.8;8192,1024,16,0.8;8192,1024,32,0.8;8192,1024,64,0.8"
 
 jobs:
+  resolve-atom-source:
+    name: Resolve ATOM benchmark source
+    runs-on: ubuntu-latest
+    outputs:
+      atom_repository: ${{ steps.resolve.outputs.atom_repository }}
+      atom_ref: ${{ steps.resolve.outputs.atom_ref }}
+      normalized_ref: ${{ steps.resolve.outputs.normalized_ref }}
+      rebuild_oot_image: ${{ steps.resolve.outputs.rebuild_oot_image }}
+      prebuilt_oot_image: ${{ steps.resolve.outputs.prebuilt_oot_image }}
+      selected_vllm_commit: ${{ steps.resolve.outputs.selected_vllm_commit }}
+      selected_vllm_version: ${{ steps.resolve.outputs.selected_vllm_version }}
+      oot_base_image: ${{ steps.resolve.outputs.oot_base_image }}
+      oot_image_source: ${{ steps.resolve.outputs.oot_image_source }}
+      publish_to_dashboard: ${{ steps.resolve.outputs.publish_to_dashboard }}
+    steps:
+      - name: Checkout selected branch
+        uses: actions/checkout@v6
+
+      - name: Resolve benchmark source
+        id: resolve
+        env:
+          INPUT_OOT_IMAGE: ${{ inputs.oot_image || '' }}
+        run: |
+          set -euo pipefail
+
+          ATOM_REPOSITORY="${GITHUB_REPOSITORY}"
+          ATOM_REF="${GITHUB_REF_NAME}"
+          PREBUILT_OOT_IMAGE="${INPUT_OOT_IMAGE:-rocm/atom-dev:vllm-latest}"
+          OOT_BASE_IMAGE="rocm/atom-dev:latest"
+
+          NORMALIZED_REF="${ATOM_REF#refs/heads/}"
+          NORMALIZED_REF="${NORMALIZED_REF#refs/tags/}"
+
+          REBUILD_OOT_IMAGE=false
+          if [[ "${NORMALIZED_REF}" != "main" ]]; then
+            REBUILD_OOT_IMAGE=true
+          fi
+
+          OOT_IMAGE_SOURCE="prebuilt"
+          if [[ "${REBUILD_OOT_IMAGE}" == "true" ]]; then
+            OOT_IMAGE_SOURCE="rebuild"
+          fi
+
+          PUBLISH_TO_DASHBOARD=false
+          if [[ "${REBUILD_OOT_IMAGE}" != "true" && "${NORMALIZED_REF}" == "main" ]]; then
+            PUBLISH_TO_DASHBOARD=true
+          fi
+
+          mapfile -t VLLM_META < <(python3 - <<'PY'
+          import re
+          from pathlib import Path
+
+          text = Path(".github/workflows/docker-release.yaml").read_text(encoding="utf-8")
+          commit_match = re.search(r'^\s*VLLM_COMMIT:\s*"([^"]+)"', text, re.MULTILINE)
+          version_match = re.search(r'^\s*VLLM_VERSION:\s*"([^"]+)"', text, re.MULTILINE)
+
+          if not commit_match or not version_match:
+              raise SystemExit("Failed to read VLLM_COMMIT/VLLM_VERSION from .github/workflows/docker-release.yaml")
+
+          print(commit_match.group(1))
+          print(version_match.group(1))
+          PY
+          )
+
+          SELECTED_VLLM_COMMIT="${VLLM_META[0]}"
+          SELECTED_VLLM_VERSION="${VLLM_META[1]}"
+
+          {
+            echo "atom_repository=${ATOM_REPOSITORY}"
+            echo "atom_ref=${ATOM_REF}"
+            echo "normalized_ref=${NORMALIZED_REF}"
+            echo "rebuild_oot_image=${REBUILD_OOT_IMAGE}"
+            echo "prebuilt_oot_image=${PREBUILT_OOT_IMAGE}"
+            echo "selected_vllm_commit=${SELECTED_VLLM_COMMIT}"
+            echo "selected_vllm_version=${SELECTED_VLLM_VERSION}"
+            echo "oot_base_image=${OOT_BASE_IMAGE}"
+            echo "oot_image_source=${OOT_IMAGE_SOURCE}"
+            echo "publish_to_dashboard=${PUBLISH_TO_DASHBOARD}"
+          } >> "$GITHUB_OUTPUT"
+
+          printf '### OOT benchmark source\n- Repository: `%s`\n- Ref: `%s`\n- Image mode: `%s`\n' \
+            "${ATOM_REPOSITORY}" \
+            "${ATOM_REF}" \
+            "${OOT_IMAGE_SOURCE}" >> "$GITHUB_STEP_SUMMARY"
+
+          if [[ "${REBUILD_OOT_IMAGE}" == "true" ]]; then
+            printf -- '- Base image: `%s`\n- vLLM version: `%s`\n- vLLM commit: `%s`\n- vLLM source: `.github/workflows/docker-release.yaml`\n' \
+              "${OOT_BASE_IMAGE}" \
+              "${SELECTED_VLLM_VERSION}" \
+              "${SELECTED_VLLM_COMMIT}" >> "$GITHUB_STEP_SUMMARY"
+          else
+            printf -- '- Prebuilt image: `%s`\n- Expected vLLM version: `%s`\n- Expected vLLM commit: `%s`\n' \
+              "${PREBUILT_OOT_IMAGE}" \
+              "${SELECTED_VLLM_VERSION}" \
+              "${SELECTED_VLLM_COMMIT}" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          printf -- '- Upload to dashboard: `%s`\n' \
+            "${PUBLISH_TO_DASHBOARD}" >> "$GITHUB_STEP_SUMMARY"
+
+          if [[ "${PUBLISH_TO_DASHBOARD}" != "true" ]]; then
+            printf '\nNon-main or overridden ATOM sources keep benchmark results in artifacts and run summaries only.\n' \
+              >> "$GITHUB_STEP_SUMMARY"
+          fi
+
   parse-param-lists:
     name: Parse parameter lists
     runs-on: ubuntu-latest
@@ -121,13 +226,118 @@ jobs:
             echo "Selected models: ${MODELS_JSON}"
           fi
 
+  build-oot-image:
+    name: Build custom OOT benchmark image
+    needs: [resolve-atom-source]
+    if: needs.resolve-atom-source.outputs.rebuild_oot_image == 'true'
+    runs-on: build-only-atom
+    outputs:
+      oot_image_tag: ${{ steps.image-meta.outputs.oot_image_tag }}
+      atom_source_sha: ${{ steps.source-meta.outputs.atom_source_sha }}
+    env:
+      ATOM_SOURCE_REPOSITORY: ${{ needs.resolve-atom-source.outputs.atom_repository }}
+      ATOM_SOURCE_REF: ${{ needs.resolve-atom-source.outputs.atom_ref }}
+      OOT_BASE_IMAGE: ${{ needs.resolve-atom-source.outputs.oot_base_image }}
+      VLLM_COMMIT: ${{ needs.resolve-atom-source.outputs.selected_vllm_commit }}
+      VLLM_VERSION: ${{ needs.resolve-atom-source.outputs.selected_vllm_version }}
+      OOT_IMAGE_REPO: rocm/atom-dev
+    steps:
+      - name: Checkout benchmark ATOM source
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ env.ATOM_SOURCE_REPOSITORY }}
+          ref: ${{ env.ATOM_SOURCE_REF }}
+          fetch-depth: 1
+
+      - name: Record source metadata
+        id: source-meta
+        run: |
+          ATOM_SOURCE_SHA="$(git rev-parse HEAD)"
+          echo "atom_source_sha=${ATOM_SOURCE_SHA}" >> "$GITHUB_OUTPUT"
+          printf '### Building custom OOT image\n- Repository: `%s`\n- Ref: `%s`\n- Commit: `%s`\n- Base image: `%s`\n- vLLM version: `%s`\n- vLLM commit: `%s`\n' \
+            "${ATOM_SOURCE_REPOSITORY}" \
+            "${ATOM_SOURCE_REF}" \
+            "${ATOM_SOURCE_SHA}" \
+            "${OOT_BASE_IMAGE}" \
+            "${VLLM_VERSION}" \
+            "${VLLM_COMMIT}" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Docker Login
+        run: |
+          echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin
+
+      - name: Generate custom OOT base Dockerfile
+        run: |
+          cat > Dockerfile.oot-base <<EOF
+          FROM ${OOT_BASE_IMAGE}
+          RUN pip install -U lm-eval[api]
+          RUN pip show lm-eval || true
+          RUN pip install hf_transfer
+          RUN pip show hf_transfer || true
+          RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true
+          RUN pip uninstall -y amd-aiter
+          RUN pip install --upgrade "pybind11>=3.0.1"
+          RUN pip show pybind11
+          RUN rm -rf /app/aiter-test
+          RUN git clone --depth 1 https://github.com/ROCm/aiter.git /app/aiter-test && \\
+              cd /app/aiter-test && \\
+              git checkout HEAD && \\
+              git submodule sync && git submodule update --init --recursive && \\
+              MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 python3 setup.py develop
+          RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
+          RUN echo "=== ATOM version BEFORE uninstall ===" && pip show atom || true
+          RUN pip uninstall -y atom || true
+          RUN rm -rf /app/ATOM
+          COPY . /app/ATOM
+          RUN cd /app/ATOM && pip install -e .
+          RUN echo "=== ATOM version AFTER installation ===" && pip show atom || true
+          EOF
+
+      - name: Build custom OOT base image
+        run: |
+          docker build --pull --network=host \
+            --no-cache \
+            -t atom_oot_base:ci \
+            -f Dockerfile.oot-base .
+
+      - name: Build custom OOT vLLM image
+        id: image-meta
+        run: |
+          SHORT_SHA="$(git rev-parse --short HEAD)"
+          SHORT_VLLM="$(printf '%s' "${VLLM_COMMIT}" | cut -c1-12)"
+          SAFE_VLLM_VERSION="$(printf '%s' "${VLLM_VERSION}" | tr -c 'A-Za-z0-9_.-' '-')"  # Docker tags only allow [A-Za-z0-9_.-]; map everything else (e.g. '/', ':', '+') to '-'
+          OOT_IMAGE_TAG="${OOT_IMAGE_REPO}:oot-benchmark-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${SHORT_SHA}-${SAFE_VLLM_VERSION}-${SHORT_VLLM}"  # unique per run/attempt/source/vLLM revision
+          docker build --network=host \
+            --no-cache \
+            -t "${OOT_IMAGE_TAG}" \
+            --target atom_oot \
+            --build-arg OOT_BASE_IMAGE="atom_oot_base:ci" \
+            --build-arg MAX_JOBS=64 \
+            --build-arg VLLM_COMMIT="${VLLM_COMMIT}" \
+            --build-arg INSTALL_LM_EVAL=1 \
+            --build-arg INSTALL_FASTSAFETENSORS=1 \
+            -f docker/Dockerfile .
+          echo "oot_image_tag=${OOT_IMAGE_TAG}" >> "$GITHUB_OUTPUT"  # consumed by the push/cleanup steps and the benchmark job
+
+      - name: Push custom OOT image
+        run: |
+          docker push "${{ steps.image-meta.outputs.oot_image_tag }}"
+
+      - name: Clean up build images
+        if: always()
+        run: |
+          docker rmi "${{ steps.image-meta.outputs.oot_image_tag }}" || true
+          docker rmi atom_oot_base:ci || true
+
   benchmark:
     name: OOT ${{ matrix.model.display }} ${{ matrix.params.input_length }}/${{ matrix.params.output_length }} c=${{ matrix.params.concurrency }}
-    needs: [parse-param-lists, load-models]
+    needs: [resolve-atom-source, parse-param-lists, load-models, build-oot-image]
     if: >-
       always()
+      && needs.resolve-atom-source.result == 'success'
       && needs.parse-param-lists.result == 'success'
       && needs.load-models.result == 'success'
+      && (needs.build-oot-image.result == 'success' || needs.build-oot-image.result == 'skipped')
       && needs.load-models.outputs.has_enabled_models == 'true'
     strategy:
       fail-fast: false
@@ -152,8 +362,14 @@ jobs:
       CONTAINER_NAME: atom_vllm_oot_benchmark_${{ strategy.job-index }}
       CONTAINER_RESULT_DIR: /tmp/oot-benchmark-results
       CONTAINER_BENCH_SERVING_DIR: /tmp/oot-benchmark/bench_serving
-      OOT_IMAGE_TAG: ${{ inputs.oot_image || 'rocm/atom-dev:vllm-latest' }}
+      OOT_IMAGE_TAG: ${{ needs.build-oot-image.outputs.oot_image_tag || needs.resolve-atom-source.outputs.prebuilt_oot_image }}
+      OOT_IMAGE_SOURCE: ${{ needs.resolve-atom-source.outputs.oot_image_source }}
       BENCH_SERVING_REPO_URL: https://github.com/kimbochen/bench_serving.git
+      ATOM_SOURCE_REPOSITORY: ${{ needs.resolve-atom-source.outputs.atom_repository }}
+      ATOM_SOURCE_REF: ${{ needs.resolve-atom-source.outputs.atom_ref }}
+      VLLM_COMMIT_USED: ${{ needs.resolve-atom-source.outputs.selected_vllm_commit }}
+      VLLM_VERSION_USED: ${{ needs.resolve-atom-source.outputs.selected_vllm_version }}
+      PUBLISH_TO_DASHBOARD: ${{ needs.resolve-atom-source.outputs.publish_to_dashboard }}
     steps:
       - name: Check if model is enabled
         id: check
@@ -179,9 +395,20 @@ jobs:
           docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
           docker run --rm -v "${GITHUB_WORKSPACE:-$PWD}":/workspace -w /workspace --privileged rocm/pytorch:latest bash -lc "shopt -s dotglob && ls -la /workspace/ && rm -rf /workspace/*" || true
 
-      - name: Checkout ATOM repo
+      - name: Checkout benchmark ATOM source
         if: steps.check.outputs.enabled == 'true'
         uses: actions/checkout@v6
+        with:
+          repository: ${{ env.ATOM_SOURCE_REPOSITORY }}
+          ref: ${{ needs.build-oot-image.outputs.atom_source_sha || github.sha }}
+          fetch-depth: 1
+
+      - name: Record benchmark source revision
+        if: steps.check.outputs.enabled == 'true'
+        run: |
+          SOURCE_SHA="$(git rev-parse HEAD)"
+          echo "ATOM_SOURCE_SHA=${SOURCE_SHA}" >> "$GITHUB_ENV"
+          echo "Benchmarking ${ATOM_SOURCE_REPOSITORY}@${ATOM_SOURCE_REF} (${SOURCE_SHA}) with ${OOT_IMAGE_SOURCE} image ${OOT_IMAGE_TAG}"
 
       - name: Docker Login
         if: steps.check.outputs.enabled == 'true'
@@ -368,13 +595,56 @@ jobs:
                 ${BENCH_EXTRA_ARGS:-}
             "
 
-          docker exec \
+          docker exec -i \
             -e RESULT_PATH="${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" \
             -e ISL="${ISL}" \
             -e OSL="${OSL}" \
             -e EXTRA_ARGS_TEXT="${OOT_EXTRA_ARGS}" \
             -e DASHBOARD_MODEL_NAME="${DASHBOARD_MODEL_NAME}" \
-            "$CONTAINER_NAME" python3 -c "import json, os, re; result_path = os.environ['RESULT_PATH']; data = json.load(open(result_path, encoding='utf-8')); data['random_input_len'] = int(os.environ['ISL']); data['random_output_len'] = int(os.environ['OSL']); data['benchmark_backend'] = 'ATOM-vLLM'; display_name = os.environ.get('DASHBOARD_MODEL_NAME', ''); display_name and data.__setitem__('benchmark_model_name', display_name); tp_match = re.search(r'--tensor-parallel-size\\s+(\\d+)', os.environ.get('EXTRA_ARGS_TEXT', '')); tp_match and data.__setitem__('tensor_parallel_size', int(tp_match.group(1))); json.dump(data, open(result_path, 'w', encoding='utf-8'), indent=2)"
+            -e ATOM_SOURCE_REPOSITORY="${ATOM_SOURCE_REPOSITORY}" \
+            -e ATOM_SOURCE_REF="${ATOM_SOURCE_REF}" \
+            -e ATOM_SOURCE_SHA="${ATOM_SOURCE_SHA}" \
+            -e VLLM_COMMIT_USED="${VLLM_COMMIT_USED}" \
+            -e VLLM_VERSION_USED="${VLLM_VERSION_USED}" \
+            -e OOT_IMAGE_SOURCE="${OOT_IMAGE_SOURCE}" \
+            -e PUBLISH_TO_DASHBOARD="${PUBLISH_TO_DASHBOARD}" \
+            "$CONTAINER_NAME" python3 - <<'PY'
+          import json
+          import os
+          import re
+
+          result_path = os.environ["RESULT_PATH"]
+          with open(result_path, encoding="utf-8") as f:
+              data = json.load(f)
+
+          data["random_input_len"] = int(os.environ["ISL"])
+          data["random_output_len"] = int(os.environ["OSL"])
+          data["benchmark_backend"] = "ATOM-vLLM"
+
+          display_name = os.environ.get("DASHBOARD_MODEL_NAME", "")
+          if display_name:
+              data["benchmark_model_name"] = display_name
+
+          tp_match = re.search(
+              r"--tensor-parallel-size\s+(\d+)",
+              os.environ.get("EXTRA_ARGS_TEXT", ""),
+          )
+          if tp_match:
+              data["tensor_parallel_size"] = int(tp_match.group(1))
+
+          data["atom_source_repository"] = os.environ.get("ATOM_SOURCE_REPOSITORY", "")
+          data["atom_source_ref"] = os.environ.get("ATOM_SOURCE_REF", "")
+          data["atom_source_sha"] = os.environ.get("ATOM_SOURCE_SHA", "")
+          data["vllm_commit"] = os.environ.get("VLLM_COMMIT_USED", "")
+          data["vllm_version"] = os.environ.get("VLLM_VERSION_USED", "")
+          data["oot_image_source"] = os.environ.get("OOT_IMAGE_SOURCE", "")
+          data["dashboard_publish_allowed"] = (
+              os.environ.get("PUBLISH_TO_DASHBOARD", "false").lower() == "true"
+          )
+
+          with open(result_path, "w", encoding="utf-8") as f:
+              json.dump(data, f, indent=2)
+          PY
 
           docker cp "${CONTAINER_NAME}:${CONTAINER_RESULT_DIR}/${RESULT_FILENAME}.json" "./${RESULT_FILENAME}.json"
 
@@ -399,11 +669,12 @@ jobs:
           docker exec "$CONTAINER_NAME" bash -lc "if [ -f /tmp/vllm_oot.pid ]; then kill \$(cat /tmp/vllm_oot.pid) || true; fi" || true
           docker stop "$CONTAINER_NAME" || true
           docker rm "$CONTAINER_NAME" || true
+          docker rmi "$OOT_IMAGE_TAG" || true
 
   summarize-benchmark-result:
     if: always()
     name: Summarize OOT benchmark result
-    needs: [benchmark]
+    needs: [resolve-atom-source, benchmark]
     runs-on: ubuntu-latest
     outputs:
       has_regression: ${{ steps.check-regression.outputs.has_regression }}
@@ -437,22 +708,64 @@ jobs:
         if: steps.check-results.outputs.has_results == 'true'
         id: baseline
         run: |
-          PREV_RUN_ID=$(gh run list \
+          mapfile -t CANDIDATE_RUN_IDS < <(gh run list \
             --workflow="ATOM vLLM OOT Benchmark" \
             --branch=main \
             --event=workflow_dispatch \
             --status=success \
-            --limit=1 \
+            --limit=10 \
             --json databaseId \
-            --jq '.[0].databaseId // empty')
+            --jq '.[].databaseId // empty')
+
+          BASELINE_DIR=""
+          for PREV_RUN_ID in "${CANDIDATE_RUN_IDS[@]}"; do
+            if [ -z "$PREV_RUN_ID" ] || [ "$PREV_RUN_ID" = "${{ github.run_id }}" ]; then
+              continue
+            fi
+
+            echo "Checking baseline candidate run #$PREV_RUN_ID"
+            rm -rf /tmp/baseline_candidate /tmp/baseline
+            mkdir -p /tmp/baseline_candidate
+
+            if ! gh run download "$PREV_RUN_ID" --dir /tmp/baseline_candidate; then
+              echo "::warning::Failed to download baseline artifacts from run #$PREV_RUN_ID"
+              continue
+            fi
+
+            if python3 - <<'PY'  # heredoc body and PY terminator sit at the run-block base indent so bash sees PY at column 0
+          import json
+          from pathlib import Path
+
+          valid_payload_found = False
+          for path in Path("/tmp/baseline_candidate").rglob("*.json"):
+              if path.name == "regression_report.json":  # report artifact, not a benchmark result
+                  continue
+              try:
+                  payload = json.loads(path.read_text(encoding="utf-8"))
+              except (OSError, UnicodeDecodeError, json.JSONDecodeError):
+                  continue
+              if "output_throughput" not in payload:  # not a benchmark result payload
+                  continue
+              valid_payload_found = True
+              if payload.get("dashboard_publish_allowed", True) is False:
+                  raise SystemExit(1)  # any non-dashboard result disqualifies the whole run as a baseline
+
+          raise SystemExit(0 if valid_payload_found else 1)  # accept only if at least one benchmark payload was found
+          PY
+            then
+              mv /tmp/baseline_candidate /tmp/baseline
+              BASELINE_DIR="/tmp/baseline"
+              echo "Using baseline from run #$PREV_RUN_ID"
+              break
+            fi
+
+            echo "Skipping run #$PREV_RUN_ID because it contains custom/non-dashboard benchmark results."
+          done
 
-          if [ -n "$PREV_RUN_ID" ] && [ "$PREV_RUN_ID" != "${{ github.run_id }}" ]; then
-            echo "Downloading baseline from run #$PREV_RUN_ID"
-            mkdir -p /tmp/baseline
-            gh run download "$PREV_RUN_ID" --dir /tmp/baseline || echo "::warning::Failed to download baseline artifacts"
-            echo "baseline_dir=/tmp/baseline" >> "$GITHUB_OUTPUT"
+          if [ -n "$BASELINE_DIR" ]; then
+            echo "baseline_dir=${BASELINE_DIR}" >> "$GITHUB_OUTPUT"
           else
-            echo "No previous successful manual run found"
+            echo "No previous successful dashboard-eligible manual run found"
             echo "baseline_dir=" >> "$GITHUB_OUTPUT"
           fi
         env:
@@ -488,8 +801,16 @@ jobs:
           name: oot-benchmark-regression-report
           path: regression_report.json
 
+      - name: Note dashboard upload skipped
+        if: steps.check-results.outputs.has_results == 'true' && needs.resolve-atom-source.outputs.publish_to_dashboard != 'true'
+        run: |
+          printf '### Dashboard upload skipped\nResults from `%s@%s` using `%s` OOT image mode are intentionally kept out of the benchmark dashboard.\n' \
+            "${{ needs.resolve-atom-source.outputs.atom_repository }}" \
+            "${{ needs.resolve-atom-source.outputs.atom_ref }}" \
+            "${{ needs.resolve-atom-source.outputs.oot_image_source }}" >> "$GITHUB_STEP_SUMMARY"
+
       - name: Transform results for benchmark dashboard
-        if: steps.check-results.outputs.has_results == 'true'
+        if: steps.check-results.outputs.has_results == 'true' && needs.resolve-atom-source.outputs.publish_to_dashboard == 'true'
         run: |
           python3 .github/scripts/oot_benchmark_to_dashboard.py \
             . \
@@ -497,7 +818,7 @@ jobs:
             --run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
 
       - name: Store benchmark result to dashboard
-        if: steps.check-results.outputs.has_results == 'true'
+        if: steps.check-results.outputs.has_results == 'true' && needs.resolve-atom-source.outputs.publish_to_dashboard == 'true'
         uses: benchmark-action/github-action-benchmark@v1
         with:
           tool: customBiggerIsBetter
@@ -512,7 +833,7 @@ jobs:
           github-token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Deploy custom dashboard to gh-pages
-        if: steps.check-results.outputs.has_results == 'true'
+        if: steps.check-results.outputs.has_results == 'true' && needs.resolve-atom-source.outputs.publish_to_dashboard == 'true'
         run: |
           git config user.name "github-actions[bot]"
           git config user.email "github-actions[bot]@users.noreply.github.com"

From 8c8833ca9facdc2a8eafe8ea63984df618532082 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Thu, 26 Mar 2026 12:03:06 +0800
Subject: [PATCH 8/9] add TP size to model labels and report that oot_image is ignored on non-main branches

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/benchmark/oot_benchmark_models.json   |  8 +++----
 .../workflows/atom-vllm-oot-benchmark.yaml    | 21 +++++++++++--------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json
index a2781102c..6469e9b15 100644
--- a/.github/benchmark/oot_benchmark_models.json
+++ b/.github/benchmark/oot_benchmark_models.json
@@ -1,6 +1,6 @@
 [
   {
-    "display": "DeepSeek-R1 FP8",
+    "display": "DeepSeek-R1 FP8 TP8",
     "source_path": "deepseek-ai/DeepSeek-R1-0528",
     "path": "/models/DeepSeek-R1-0528",
     "prefix": "deepseek-r1-fp8",
@@ -10,7 +10,7 @@
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
-    "display": "DeepSeek-R1 MXFP4",
+    "display": "DeepSeek-R1 MXFP4 TP8",
     "source_path": "amd/DeepSeek-R1-0528-MXFP4",
     "path": "/models/DeepSeek-R1-0528-MXFP4",
     "prefix": "deepseek-r1-mxfp4",
@@ -20,7 +20,7 @@
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
-    "display": "gpt-oss-120b",
+    "display": "gpt-oss-120b TP1",
     "source_path": "amd/gpt-oss-120b-w-mxfp4-a-fp8",
     "path": "/models/gpt-oss-120b",
     "prefix": "gpt-oss-120b",
@@ -52,7 +52,7 @@
     "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
   },
   {
-    "display": "Qwen3.5-397B-A17B-FP8",
+    "display": "Qwen3.5-397B-A17B-FP8 TP8",
     "source_path": "Qwen/Qwen3.5-397B-A17B-FP8",
     "path": "/models/Qwen3.5-397B-A17B-FP8",
     "prefix": "qwen3-5-397b-a17b-fp8",
diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index 0ba8beb9d..8a2245214 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -8,31 +8,31 @@ on:
   workflow_dispatch:
     inputs:
       deepseek-r1-fp8:
-        description: "Benchmark DeepSeek-R1 FP8"
+        description: "DeepSeek-R1 FP8 TP8"
         type: boolean
         default: false
       deepseek-r1-mxfp4:
-        description: "Benchmark DeepSeek-R1 MXFP4"
+        description: "DeepSeek-R1 MXFP4 TP8"
         type: boolean
         default: false
       gpt-oss-120b:
-        description: "Benchmark gpt-oss-120b"
+        description: "gpt-oss-120b TP1"
         type: boolean
         default: false
       kimi-k2-thinking-mxfp4-tp4:
-        description: "Benchmark Kimi-K2-Thinking-MXFP4 TP4"
+        description: "Kimi-K2-Thinking-MXFP4 TP4"
         type: boolean
         default: false
       kimi-k2-thinking-mxfp4-tp8:
-        description: "Benchmark Kimi-K2-Thinking-MXFP4 TP8"
+        description: "Kimi-K2-Thinking-MXFP4 TP8"
         type: boolean
         default: false
       qwen3-5-397b-a17b-fp8:
-        description: "Benchmark Qwen3.5-397B-A17B-FP8"
+        description: "Qwen3.5-397B-A17B-FP8 TP8"
         type: boolean
         default: false
       oot_image:
-        description: "Prebuilt OOT benchmark image to pull when no custom rebuild is needed"
+        description: "Prebuilt OOT benchmark image used only for main-branch runs; ignored for non-main branches because a custom image is rebuilt from the selected branch"
         type: string
         default: "rocm/atom-dev:vllm-latest"
       param_lists:
@@ -132,10 +132,13 @@ jobs:
             "${OOT_IMAGE_SOURCE}" >> "$GITHUB_STEP_SUMMARY"
 
           if [[ "${REBUILD_OOT_IMAGE}" == "true" ]]; then
-            printf -- '- Base image: `%s`\n- vLLM version: `%s`\n- vLLM commit: `%s`\n- vLLM source: `.github/workflows/docker-release.yaml`\n' \
+            printf -- '- Base image: `%s`\n- vLLM version: `%s`\n- vLLM commit: `%s`\n- vLLM source: `.github/workflows/docker-release.yaml`\n- Ignored prebuilt image input: `%s`\n' \
               "${OOT_BASE_IMAGE}" \
               "${SELECTED_VLLM_VERSION}" \
-              "${SELECTED_VLLM_COMMIT}" >> "$GITHUB_STEP_SUMMARY"
+              "${SELECTED_VLLM_COMMIT}" \
+              "${PREBUILT_OOT_IMAGE}" >> "$GITHUB_STEP_SUMMARY"
+            printf -- '\nBecause branch `%s` is not `main`, a custom OOT docker is rebuilt from the selected branch and the prebuilt image input is ignored.\n' \
+              "${ATOM_REF}" >> "$GITHUB_STEP_SUMMARY"
           else
             printf -- '- Prebuilt image: `%s`\n- Expected vLLM version: `%s`\n- Expected vLLM commit: `%s`\n' \
               "${PREBUILT_OOT_IMAGE}" \

From 80bff24ac293a715909e2fe30995d4bef63d8278 Mon Sep 17 00:00:00 2001
From: zejunchen-zejun <zejun.chen@amd.com>
Date: Thu, 26 Mar 2026 12:13:56 +0800
Subject: [PATCH 9/9] refine the docker remove logic and rebuild logic

Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
---
 .github/workflows/atom-vllm-oot-benchmark.yaml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/atom-vllm-oot-benchmark.yaml b/.github/workflows/atom-vllm-oot-benchmark.yaml
index 8a2245214..ea601ac9c 100644
--- a/.github/workflows/atom-vllm-oot-benchmark.yaml
+++ b/.github/workflows/atom-vllm-oot-benchmark.yaml
@@ -231,8 +231,11 @@ jobs:
 
   build-oot-image:
     name: Build custom OOT benchmark image
-    needs: [resolve-atom-source]
-    if: needs.resolve-atom-source.outputs.rebuild_oot_image == 'true'
+    needs: [resolve-atom-source, load-models]
+    if: >-
+      needs.resolve-atom-source.outputs.rebuild_oot_image == 'true'
+      && needs.load-models.result == 'success'
+      && needs.load-models.outputs.has_enabled_models == 'true'
     runs-on: build-only-atom
     outputs:
       oot_image_tag: ${{ steps.image-meta.outputs.oot_image_tag }}
@@ -672,7 +675,11 @@ jobs:
           docker exec "$CONTAINER_NAME" bash -lc "if [ -f /tmp/vllm_oot.pid ]; then kill \$(cat /tmp/vllm_oot.pid) || true; fi" || true
           docker stop "$CONTAINER_NAME" || true
           docker rm "$CONTAINER_NAME" || true
-          docker rmi "$OOT_IMAGE_TAG" || true
+          if [[ "${OOT_IMAGE_SOURCE}" == "rebuild" ]]; then
+            docker rmi "$OOT_IMAGE_TAG" || true
+          else
+            echo "Keeping prebuilt OOT image cached on runner: ${OOT_IMAGE_TAG}"
+          fi
 
   summarize-benchmark-result:
     if: always()