45 commits
cfcaa84  feat: switch evals to 8k1k and restart server with native max context (Oseltamivir, Mar 7, 2026)
8d98b94  fix: cap gen_kwargs max_tokens to leave room for prompt (Oseltamivir, Mar 7, 2026)
4452868  fix: reserve 30% of context for prompt in eval gen_kwargs (Oseltamivir, Mar 7, 2026)
83cd7de  change eval calling (Oseltamivir, Mar 9, 2026)
161e7f6  fix: eval-only server wait and PEP 668 pip install (Oseltamivir, Mar 9, 2026)
73b6846  change gsm8k to 8-shot (Oseltamivir, Mar 9, 2026)
83e185d  refactor: decouple eval concurrency, cap gen tokens, fix eval config (Oseltamivir, Mar 10, 2026)
aad5336  decouple (Oseltamivir, Mar 10, 2026)
375f14e  fix: uninstall torchvision before lm_eval to fix ATOM container impor… (Oseltamivir, Mar 11, 2026)
1cdc72a  add gpqa (Oseltamivir, Mar 11, 2026)
49b0b90  fix: pass multiple eval tasks as separate args for older lm-eval compat (Oseltamivir, Mar 11, 2026)
cc59d50  fix: run eval tasks sequentially for cross-version lm-eval compat (Oseltamivir, Mar 11, 2026)
c8b5858  fix: use directory-based task discovery for multi-eval in single lm_e… (Oseltamivir, Mar 11, 2026)
ffee6c5  gsm8k only (Oseltamivir, Mar 12, 2026)
c577ca2  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 12, 2026)
dc25ccd  fix: cap max_gen_tokens to server's max_model_len to avoid request re… (Oseltamivir, Mar 12, 2026)
285b662  fix: add eval context length override to remaining 18 scripts (Oseltamivir, Mar 12, 2026)
2d9d7ba  fix: default EVAL_TASKS_DIR to utils/evals directory, not single yaml… (Oseltamivir, Mar 13, 2026)
1ca2173  fix: reduce eval context multiplier to 5x and increase request timeou… (Oseltamivir, Mar 13, 2026)
7458eea  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 13, 2026)
6999263  test other prompt (Oseltamivir, Mar 15, 2026)
ba45203  pr (Oseltamivir, Mar 15, 2026)
826035c  test evals (Oseltamivir, Mar 15, 2026)
34bc7c4  resolve claude issues (Oseltamivir, Mar 15, 2026)
4978aed  torchvision (Oseltamivir, Mar 16, 2026)
c038e1b  make stuff neater, ready for merge (Oseltamivir, Mar 16, 2026)
29d69fa  resolve issues, add --no-evals, change default to flag-less (Oseltamivir, Mar 18, 2026)
327fd6d  ctxt len (Oseltamivir, Mar 18, 2026)
6350b6b  resolve claude (Oseltamivir, Mar 18, 2026)
9ae1ae4  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 18, 2026)
dec6f60  h200 change (Oseltamivir, Mar 18, 2026)
b08e063  final touches (Oseltamivir, Mar 19, 2026)
f04881d  test normal perf-changelog (Oseltamivir, Mar 19, 2026)
86764fa  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 19, 2026)
c17619f  test normal perf-changelog (Oseltamivir, Mar 20, 2026)
d30f807  all evals (Oseltamivir, Mar 20, 2026)
766a742  remove pycache (Oseltamivir, Mar 20, 2026)
5d5dd7b  argmax error (Oseltamivir, Mar 20, 2026)
f54b09c  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 20, 2026)
b9551f6  merge main (Oseltamivir, Mar 21, 2026)
f0fff18  blocking rm (Oseltamivir, Mar 21, 2026)
8bf41f0  Merge branch 'main' into eval-8k1k-server-restart (Oseltamivir, Mar 21, 2026)
38b80a8  standardize (Oseltamivir, Mar 22, 2026)
5beca55  reduce ctxt OOM (Oseltamivir, Mar 22, 2026)
4dcfc92  reduce ctxt OOM (Oseltamivir, Mar 22, 2026)
55 changes: 38 additions & 17 deletions .github/workflows/benchmark-tmpl.yml
@@ -54,6 +54,11 @@ on:
type: boolean
required: true
default: false
eval-only:
description: "Run only evals (skip throughput benchmark)"
type: boolean
required: false
default: false
random-range-ratio:
required: false
type: string
@@ -83,6 +88,8 @@ env:
SPEC_DECODING: ${{ inputs.spec-decoding }}
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
PYTHONDONTWRITEBYTECODE: '1'

permissions:
contents: read
@@ -91,7 +98,7 @@ jobs:
benchmark:
runs-on: ${{ inputs.runner }}
timeout-minutes: 300
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.run-eval && ' | eval' || '' }}"
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
steps:
- name: Resource cleanup (pre-run)
run: &resource-cleanup |
@@ -145,28 +152,42 @@ jobs:
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV

bash ./runners/launch_${RUNNER_NAME%%_*}.sh
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break

if [ "${{ inputs.eval-only }}" = "true" ]; then
echo "Eval-only mode: skipping benchmark result file check"
# Verify eval produced results
if ! ls results*.json 1>/dev/null 2>&1; then
echo "Eval-only run failed: no results*.json files found." >&2
exit 1
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done
# Verify eval scores meet minimum threshold (85%)
python3 utils/evals/validate_scores.py
else
FOUND_RESULT_FILE=
for i in {1..10}; do
if [ -f "$RESULT_FILENAME.json" ]; then
FOUND_RESULT_FILE=true
break
fi
echo "Waiting for result file... (attempt $i)"
sleep 1
done

if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
if [ -z "$FOUND_RESULT_FILE" ]; then
echo "Run failed: Benchmark result $RESULT_FILENAME.json not found." >&2
exit 1
fi
fi
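The non-eval branch above polls a bounded number of times before declaring failure. A minimal Python sketch of the same bounded-poll idea (`wait_for_result` is a hypothetical helper for illustration, not part of the repo):

```python
import pathlib
import time

def wait_for_result(path, attempts=10, delay=1.0):
    """Poll for the benchmark result file a bounded number of times,
    mirroring the shell loop: succeed as soon as the file appears,
    fail after the final attempt."""
    target = pathlib.Path(path)
    for attempt in range(1, attempts + 1):
        if target.is_file():
            return True
        print(f"Waiting for result file... (attempt {attempt})")
        time.sleep(delay)
    return False
```

The bounded retry keeps a hung server launch from stalling the job indefinitely while still tolerating a short delay between server exit and result-file flush.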

- name: Process result
if: ${{ !inputs.eval-only }}
env:
RUNNER_TYPE: ${{ inputs.runner }}
run: |
python3 utils/process_result.py

- name: Upload result
if: ${{ !inputs.eval-only }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: bmk_${{ env.RESULT_FILENAME }}
@@ -176,31 +197,31 @@ jobs:
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: server_logs_${{ env.RESULT_FILENAME }}
name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
path: server.log
if-no-files-found: ignore

- name: Upload GPU metrics
if: always()
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: gpu_metrics_${{ env.RESULT_FILENAME }}
name: ${{ inputs.eval-only && 'eval_gpu_metrics_' || 'gpu_metrics_' }}${{ env.RESULT_FILENAME }}
path: gpu_metrics.csv
if-no-files-found: ignore

- name: Upload eval results (if any)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: eval_${{ env.EXP_NAME }}_${{ env.RESULT_FILENAME }}
path: |
meta_env.json
results*.json
sample*.jsonl
if-no-files-found: ignore
if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}

- name: Cleanup eval outputs (post-upload)
if: ${{ env.RUN_EVAL == 'true' }}
if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
run: |
rm -f meta_env.json || true
Comment on lines 197 to 226
Contributor

🟡 The Cleanup eval outputs step removes meta_env.json and results*.json but never removes sample*.jsonl files, which are also moved into the workspace CWD by append_lm_eval_summary and then uploaded as artifacts. On persistent self-hosted GPU runners, these files accumulate across runs, causing subsequent artifact uploads to mix stale sample data from prior runs with current results. Fix: add 'rm -f sample*.jsonl || true' to the cleanup step.

Extended reasoning...

Analysis of bug_004: Cleanup step omits sample*.jsonl files

What the bug is and how it manifests

The 'Cleanup eval outputs (post-upload)' step in benchmark-tmpl.yml (lines 223-228) removes meta_env.json and results*.json after uploading eval artifacts, but it does NOT remove sample*.jsonl files. These sample files are explicitly included in the artifact upload path (lines 217-220) and are moved into the workspace CWD during benchmark execution. Because persistent self-hosted GPU runners reuse the same workspace directory across jobs, any sample*.jsonl files left behind will accumulate across runs.

The specific code path that triggers it

  1. run_lm_eval is invoked with --log_samples (benchmark_lib.sh line 681), causing lm-eval to write samples_{task}_*.jsonl files into a temporary output directory.
  2. append_lm_eval_summary uses find "${out_dir}" -type f -name "*.json*" (benchmark_lib.sh line 755) -- the pattern *.json* matches both .json and .jsonl extensions -- and moves all matched files into the workspace CWD.
  3. The upload step (lines 212-221) correctly includes sample*.jsonl in its path block.
  4. The cleanup step (lines 223-228) only runs 'rm -f meta_env.json' and 'rm -f results*.json', leaving sample*.jsonl files on disk.

Why existing code does not prevent it

There is no pre-run workspace sweep for sample*.jsonl. The resource cleanup steps (pre-run and post-run) only handle Docker containers and SLURM jobs, not leftover workspace files. The comment on line 227 says 'Remove any eval results JSONs that were moved into workspace' but the implementation only removes .json files, missing the .jsonl sample files that were also moved in by the same .json glob.

What the impact would be

On persistent self-hosted GPU runners, each eval run that produces sample logs will leave sample*.jsonl files in the workspace. After N runs, N sets of sample files accumulate. The next artifact upload will pick up all of them (since sample*.jsonl is a glob), bundling stale samples from previous runs into the current run's eval artifact. This corrupts eval artifact provenance -- a reviewer examining eval samples cannot tell which run they belong to.

How to fix it

Add one line to the cleanup step:

- name: Cleanup eval outputs (post-upload)
  if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }}
  run: |
    rm -f meta_env.json || true
    rm -f results*.json || true
    rm -f sample*.jsonl || true   # add this line

Step-by-step proof

  1. Run A executes with run-eval: true. run_lm_eval --log_samples writes samples_mmlu_run_a.jsonl to a temp output dir.
  2. append_lm_eval_summary runs find matching .json, moves samples_mmlu_run_a.jsonl to workspace CWD.
  3. Upload step uploads meta_env.json, results_mmlu.json, and samples_mmlu_run_a.jsonl -- correct.
  4. Cleanup step removes meta_env.json and results_mmlu.json but NOT samples_mmlu_run_a.jsonl.
  5. Run B starts on the same runner. Workspace still contains samples_mmlu_run_a.jsonl from Run A.
  6. Run B produces samples_mmlu_run_b.jsonl (its own results).
  7. Upload step now uploads BOTH samples_mmlu_run_a.jsonl (stale, from Run A) and samples_mmlu_run_b.jsonl (current) -- artifact is contaminated with stale data from prior run.
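The accumulation described in the proof can be reproduced with a small Python simulation (illustrative only; the `run_eval` helper and the sample file names are hypothetical stand-ins for the workflow's behavior):

```python
import pathlib
import tempfile

def upload_glob(workspace):
    # Mirrors the artifact upload path: the sample*.jsonl glob in the workspace CWD
    return sorted(p.name for p in workspace.glob("sample*.jsonl"))

def cleanup(workspace, remove_samples):
    # Current cleanup step removes meta_env.json and results*.json only;
    # remove_samples=True models the proposed extra `rm -f sample*.jsonl`.
    for p in workspace.glob("results*.json"):
        p.unlink()
    (workspace / "meta_env.json").unlink(missing_ok=True)
    if remove_samples:
        for p in workspace.glob("sample*.jsonl"):
            p.unlink()

def run_eval(workspace, run_id, remove_samples):
    # Each eval run drops its own sample file into the persistent workspace,
    # then uploads whatever the glob matches, then cleans up.
    (workspace / f"samples_mmlu_{run_id}.jsonl").write_text("{}")
    uploaded = upload_glob(workspace)
    cleanup(workspace, remove_samples)
    return uploaded

ws = pathlib.Path(tempfile.mkdtemp())
run_eval(ws, "run_a", remove_samples=False)
print(run_eval(ws, "run_b", remove_samples=False))  # stale run_a file included
```

With `remove_samples=True`, the second run uploads only its own sample file, which is the behavior the one-line fix restores.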

# Remove any eval results JSONs that were moved into workspace
40 changes: 37 additions & 3 deletions .github/workflows/e2e-tests.yml
@@ -37,6 +37,7 @@ jobs:
outputs:
single-node-config: ${{ steps.get-jobs.outputs.single-node-config }}
multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }}
eval-config: ${{ steps.get-jobs.outputs.eval-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
@@ -53,10 +54,12 @@
pip install pydantic
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('run-eval', False)]))")
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))")
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT

test-sweep-multi-node:
needs: get-jobs
@@ -123,7 +126,38 @@ jobs:
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: ${{ matrix.config.run-eval }}
run-eval: false
ref: ${{ inputs.ref }}

test-sweep-evals:
needs: get-jobs
if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: eval /
strategy:
fail-fast: false
matrix:
config: ${{ fromJson(needs.get-jobs.outputs.eval-config) }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: true
eval-only: true
ref: ${{ inputs.ref }}

collect-results:
@@ -135,7 +169,7 @@
result-prefix: "bmk"

collect-evals:
needs: [test-sweep-multi-node, test-sweep-single-node]
needs: [test-sweep-evals]
if: ${{ always() }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit
49 changes: 36 additions & 13 deletions .github/workflows/run-sweep.yml
@@ -183,6 +183,36 @@ jobs:
secrets: inherit
with: *single-node-inputs

sweep-evals:
needs: setup
if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: eval /
strategy:
fail-fast: false
matrix:
config: ${{ fromJson(needs.setup.outputs.search-space-config).evals }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
spec-decoding: ${{ matrix.config.spec-decoding }}
disagg: ${{ matrix.config.disagg }}
run-eval: true
eval-only: true

collect-results:
needs:
[
@@ -201,16 +231,7 @@
result-prefix: "bmk"

collect-evals:
needs:
[
sweep-single-node-1k1k,
sweep-single-node-1k8k,
sweep-single-node-8k1k,
sweep-multi-node-1k1k,
sweep-multi-node-1k8k,
sweep-multi-node-8k1k,
setup,
]
needs: [sweep-evals, setup]
if: ${{ always() && needs.setup.result != 'skipped' }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit
Comment on lines 233 to 237
Contributor

🔴 The collect-evals job in run-sweep.yml (line 235) uses if: ${{ always() && needs.setup.result != 'skipped' }} which does not check whether sweep-evals was skipped. When a PR adds no 8k1k single-node eval configs, sweep-evals is skipped and no eval_* artifacts are uploaded, but collect-evals still runs and collect-evals.yml calls actions/download-artifact@v8 without error-on-missing-artifacts: false, causing a spurious CI failure. The same issue exists in e2e-tests.yml where collect-evals uses if: ${{ always() }} with no check on test-sweep-evals.result. Fix: add && needs.sweep-evals.result != 'skipped' to the condition in run-sweep.yml, and similarly gate collect-evals in e2e-tests.yml on test-sweep-evals.result != 'skipped'.

Extended reasoning...

What the bug is and how it manifests

In run-sweep.yml line 235, the collect-evals job was updated to depend on [sweep-evals, setup] instead of all the sweep jobs. However, its condition remains if: ${{ always() && needs.setup.result != 'skipped' }}. The sweep-evals job itself has a guard:

if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && ... != 'null' }}

When a PR only adds multi-node configs, or only adds non-8k1k single-node configs, mark_eval_entries() in generate_sweep_configs.py (which exclusively marks 8k1k entries) produces an empty evals list, process_changelog.py generates no eval entries, and sweep-evals is skipped. Because collect-evals only checks needs.setup.result != 'skipped' and not needs.sweep-evals.result != 'skipped', collect-evals still runs.

The specific code path

  1. PR is opened that adds only multi-node or non-8k1k benchmark configs
  2. process_changelog.py runs with --evals-only, produces an empty list
  3. sweep-evals evaluates its condition: evals == [] → skipped
  4. No eval_* artifacts are uploaded to the workflow run
  6. collect-evals evaluates always() && needs.setup.result != 'skipped' → true (setup ran successfully)
  6. collect-evals runs and invokes collect-evals.yml
  7. Inside collect-evals.yml, actions/download-artifact@v8 with pattern eval_* finds no matching artifacts
  8. In actions/download-artifact v4+, the default behavior is to fail when no matching artifacts are found (unlike v3 which silently succeeded)
  9. The step exits with a non-zero code → spurious CI failure for an otherwise-valid PR

Why existing code doesn't prevent it

The condition needs.setup.result != 'skipped' guards against the case where setup itself is skipped (e.g. if the workflow was cancelled before it started), but it does nothing to guard against sweep-evals being skipped due to an empty eval matrix. The always() function forces the job to run regardless of upstream job statuses, so even if sweep-evals is skipped, collect-evals runs. actions/download-artifact@v8 does not have error-on-missing-artifacts: false set in collect-evals.yml, so it fails hard when no eval_* artifacts exist.

Impact

Any PR that does not produce 8k1k single-node eval configs will experience a spurious CI failure in the collect-evals job, despite the actual benchmark and eval logic working correctly. Additionally, since collect-evals.result is 'failure' (not 'skipped'), the trigger-ingest job (line 295-318 in run-sweep.yml) may still fire because its condition checks collect-evals.result != 'skipped', potentially triggering an ingest with no eval data.

The same issue exists in e2e-tests.yml where collect-evals uses if: ${{ always() }} with no guard on test-sweep-evals.result.

How to fix

In run-sweep.yml, change line 235 from:

if: ${{ always() && needs.setup.result != 'skipped' }}

to:

if: ${{ always() && needs.setup.result != 'skipped' && needs.sweep-evals.result != 'skipped' }}

In e2e-tests.yml, change:

if: ${{ always() }}

to:

if: ${{ always() && needs.test-sweep-evals.result != 'skipped' }}

Alternatively, add error-on-missing-artifacts: false to the actions/download-artifact@v8 call in collect-evals.yml and add null-handling in collect_eval_results.py for the case of no artifacts.

Step-by-step proof

  1. A PR is submitted that adds only dsr1-fp4-mi355x-sglang (which has ISL=1024, OSL=1024 — a 1k1k config, not 8k1k)
  2. process_changelog.py runs generate_sweep_configs.py --evals-only for this config
  3. mark_eval_entries() finds no entries with isl=8192, osl=1024 → returns empty list
  4. sweep-evals condition: evals == [] → skipped, no artifacts uploaded
  5. collect-evals condition: always() && setup.result != 'skipped' → true → runs
  6. actions/download-artifact@v8 searches for eval_* artifacts → finds none → exits 1
  7. collect-evals fails, PR shows a red CI check for an otherwise-valid benchmark PR
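The gating problem and the proposed fix can be modeled directly (a sketch only; job results are represented as plain strings, and `guard_on_evals` toggles the proposed extra condition):

```python
def collect_evals_should_run(setup_result, sweep_evals_result, guard_on_evals):
    """Model collect-evals' `if:` gate under always().

    Current condition:  always() && needs.setup.result != 'skipped'
    Proposed fix adds:  && needs.sweep-evals.result != 'skipped'
    """
    runs = setup_result != "skipped"
    if guard_on_evals:
        runs = runs and sweep_evals_result != "skipped"
    return runs

# PR with no 8k1k configs: sweep-evals is skipped, no eval_* artifacts exist.
print(collect_evals_should_run("success", "skipped", guard_on_evals=False))  # True (spurious run)
print(collect_evals_should_run("success", "skipped", guard_on_evals=True))   # False (correctly skipped)
```

Under the fix, collect-evals only fires when sweep-evals actually produced artifacts, so the download step never fails on an empty `eval_*` pattern.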

@@ -221,10 +242,12 @@
runs-on: ubuntu-latest
steps:
- name: Extract and save changelog metadata
env:
CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }}
run: |
echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json
cat <<'CONFIGEOF' > _full_config.json
${{ needs.setup.outputs.search-space-config }}
CONFIGEOF
jq '.changelog_metadata' _full_config.json > changelog_metadata.json
rm -f _full_config.json

- name: Upload changelog artifact
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0