From d0c3088628bb49945b42755c4ac2d56b18538786 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:37:40 -0400
Subject: [PATCH 1/2] Add RTFx tracking and validation to all benchmark
 workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes:
1. Add RTFx metric extraction to qwen3-asr-benchmark.yml
   - Extract medianRTFx and overallRTFx from benchmark results
   - Display metrics in PR comments

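2. Add RTFx validation to all 6 benchmark workflows
   - Fail with exit 1 if RTFx is 0, N/A, or empty (asr, parakeet-eou and
     qwen3 compare against "0.00"/"N/A"; diarizer, sortformer and vad
     compare against "0"/empty)
   - Prevents silent benchmark failures from being reported as success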

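3. Fix PR comment posting with if: always()
   - Prefix the Comment PR step condition with "always() &&" in the asr,
     parakeet-eou and qwen3 workflows (the diarizer and sortformer comment
     steps already use "if: always()")
   - Ensures PR comments post even when validation fails
   - Allows users to see what went wrong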

Workflows updated:
- qwen3-asr-benchmark.yml: Validate 2 RTFx metrics + add tracking
- asr-benchmark.yml: Validate 6 RTFx metrics (v2/v3 × clean/other/streaming)
- diarizer-benchmark.yml: Validate 1 RTFx metric
- parakeet-eou-benchmark.yml: Validate 1 RTFx metric
- sortformer-benchmark.yml: Validate 1 RTFx metric
- vad-benchmark.yml: Validate 2 RTFx metrics (MUSAN + VOiCES)

An RTFx of 0 means at least one of the following happened:
- Benchmark didn't run properly
- Audio duration was 0
- Processing failed silently
- Metric extraction failed

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 .github/workflows/asr-benchmark.yml          | 41 ++++++++++++++++++--
 .github/workflows/diarizer-benchmark.yml     |  7 ++++
 .github/workflows/parakeet-eou-benchmark.yml |  9 ++++-
 .github/workflows/qwen3-asr-benchmark.yml    | 35 ++++++++++++++++-
 .github/workflows/sortformer-benchmark.yml   |  7 ++++
 .github/workflows/vad-benchmark.yml          | 26 +++++++++++++
 6 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/asr-benchmark.yml b/.github/workflows/asr-benchmark.yml
index 302577ffb..bf201c414 100644
--- a/.github/workflows/asr-benchmark.yml
+++ b/.github/workflows/asr-benchmark.yml
@@ -198,9 +198,38 @@ jobs:
           echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT
           echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT
 
+          # Validate RTFx values - 0 indicates benchmark failure
+          if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "N/A" ]; then
+            echo "⚠️ test-clean RTFx is 0 or N/A - benchmark may have failed"
+            CLEAN_RTFX_FAILED=1
+          fi
+          if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then
+            echo "⚠️ test-clean (v2) RTFx is 0 or N/A - benchmark may have failed"
+            CLEAN_V2_RTFX_FAILED=1
+          fi
+          if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "N/A" ]; then
+            echo "⚠️ test-other RTFx is 0 or N/A - benchmark may have failed"
+            OTHER_RTFX_FAILED=1
+          fi
+          if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then
+            echo "⚠️ test-other (v2) RTFx is 0 or N/A - benchmark may have failed"
+            OTHER_V2_RTFX_FAILED=1
+          fi
+          if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "N/A" ]; then
+            echo "⚠️ streaming RTFx is 0 or N/A - benchmark may have failed"
+            STREAMING_RTFX_FAILED=1
+          fi
+          if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then
+            echo "⚠️ streaming (v2) RTFx is 0 or N/A - benchmark may have failed"
+            STREAMING_V2_RTFX_FAILED=1
+          fi
+
           # Report failures summary
           if [ ! -z "$CLEAN_FAILED" ] || [ ! -z "$OTHER_FAILED" ] || [ ! -z "$STREAMING_FAILED" ] || \
-             [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ]; then
+             [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ] || \
+             [ ! -z "$CLEAN_RTFX_FAILED" ] || [ ! -z "$CLEAN_V2_RTFX_FAILED" ] || \
+             [ ! -z "$OTHER_RTFX_FAILED" ] || [ ! -z "$OTHER_V2_RTFX_FAILED" ] || \
+             [ ! -z "$STREAMING_RTFX_FAILED" ] || [ ! -z "$STREAMING_V2_RTFX_FAILED" ]; then
             echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> $GITHUB_OUTPUT
             echo "⚠️ Some benchmarks failed:"
             [ ! -z "$CLEAN_FAILED" ] && echo "  - test-clean benchmark failed"
@@ -209,14 +238,20 @@ jobs:
             [ ! -z "$CLEAN_V2_FAILED" ] && echo "  - test-clean (v2) benchmark failed"
             [ ! -z "$OTHER_V2_FAILED" ] && echo "  - test-other (v2) benchmark failed"
             [ ! -z "$STREAMING_V2_FAILED" ] && echo "  - streaming (v2) benchmark failed"
-            # Don't exit with error to allow PR comment to be posted
+            [ ! -z "$CLEAN_RTFX_FAILED" ] && echo "  - test-clean RTFx is 0"
+            [ ! -z "$CLEAN_V2_RTFX_FAILED" ] && echo "  - test-clean (v2) RTFx is 0"
+            [ ! -z "$OTHER_RTFX_FAILED" ] && echo "  - test-other RTFx is 0"
+            [ ! -z "$OTHER_V2_RTFX_FAILED" ] && echo "  - test-other (v2) RTFx is 0"
+            [ ! -z "$STREAMING_RTFX_FAILED" ] && echo "  - streaming RTFx is 0"
+            [ ! -z "$STREAMING_V2_RTFX_FAILED" ] && echo "  - streaming (v2) RTFx is 0"
+            exit 1
           else
             echo "BENCHMARK_STATUS=SUCCESS" >> $GITHUB_OUTPUT
             echo "✅ All benchmarks completed successfully"
           fi
 
       - name: Comment PR
-        if: github.event_name == 'pull_request'
+        if: always() && github.event_name == 'pull_request'
         continue-on-error: true
         uses: actions/github-script@v7
         with:
diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml
index 43251f671..0775ad844 100644
--- a/.github/workflows/diarizer-benchmark.yml
+++ b/.github/workflows/diarizer-benchmark.yml
@@ -115,6 +115,13 @@ jobs:
           echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> $GITHUB_OUTPUT
           echo "INFERENCE_TIME=${INFERENCE_TIME}" >> $GITHUB_OUTPUT
 
+          # Validate RTFx - 0 indicates benchmark failure
+          if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then
+            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
+            echo "RTFx value: $RTF"
+            exit 1
+          fi
+
       - name: Comment PR with Benchmark Results
         if: always()
         uses: actions/github-script@v7
diff --git a/.github/workflows/parakeet-eou-benchmark.yml b/.github/workflows/parakeet-eou-benchmark.yml
index bef59e306..0b3bb62ba 100644
--- a/.github/workflows/parakeet-eou-benchmark.yml
+++ b/.github/workflows/parakeet-eou-benchmark.yml
@@ -104,8 +104,15 @@ jobs:
           echo "MAX_FILES=$MAX_FILES" >> $GITHUB_OUTPUT
           echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT
 
+          # Validate RTFx - 0 or N/A indicates benchmark failure
+          if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed"
+            echo "RTFx value: $RTFx"
+            exit 1
+          fi
+
       - name: Comment PR
-        if: github.event_name == 'pull_request'
+        if: always() && github.event_name == 'pull_request'
         continue-on-error: true
         uses: actions/github-script@v7
         with:
diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml
index 074bede82..ac1e15580 100644
--- a/.github/workflows/qwen3-asr-benchmark.yml
+++ b/.github/workflows/qwen3-asr-benchmark.yml
@@ -69,17 +69,44 @@ jobs:
             echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT
           fi
 
+          # Extract RTFx metrics if results file exists
+          if [ -f qwen3_results_int8.json ]; then
+            MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null)
+            OVERALL_RTFx=$(jq -r '.summary.overallRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null)
+
+            [ "$MEDIAN_RTFx" != "null" ] && [ "$MEDIAN_RTFx" != "N/A" ] && MEDIAN_RTFx=$(printf "%.2f" "$MEDIAN_RTFx") || MEDIAN_RTFx="N/A"
+            [ "$OVERALL_RTFx" != "null" ] && [ "$OVERALL_RTFx" != "N/A" ] && OVERALL_RTFx=$(printf "%.2f" "$OVERALL_RTFx") || OVERALL_RTFx="N/A"
+
+            echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT
+            echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT
+
+            # Fail if RTFx is 0 or N/A - indicates benchmark failure
+            if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ]; then
+              echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results"
+              echo "Median RTFx: $MEDIAN_RTFx"
+              echo "Overall RTFx: $OVERALL_RTFx"
+              exit 1
+            fi
+          else
+            echo "❌ CRITICAL: Results file not found - benchmark failed"
+            echo "MEDIAN_RTFx=N/A" >> $GITHUB_OUTPUT
+            echo "OVERALL_RTFx=N/A" >> $GITHUB_OUTPUT
+            exit 1
+          fi
+
           EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s
           echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT
 
       - name: Comment PR
-        if: github.event_name == 'pull_request'
+        if: always() && github.event_name == 'pull_request'
         continue-on-error: true
         uses: actions/github-script@v7
         with:
           script: |
             const status = '${{ steps.smoketest.outputs.SMOKE_STATUS }}';
             const emoji = status === 'PASSED' ? '✅' : '❌';
+            const medianRTFx = '${{ steps.smoketest.outputs.MEDIAN_RTFx }}';
+            const overallRTFx = '${{ steps.smoketest.outputs.OVERALL_RTFx }}';
 
             const body = `## Qwen3-ASR int8 Smoke Test ${emoji}
 
@@ -91,6 +118,12 @@ jobs:
             | Transcription pipeline | ${emoji} |
             | Decoder size | 571 MB (vs 1.1 GB f32) |
 
+            ### Performance Metrics
+            | Metric | CI Value | Expected on Apple Silicon |
+            |--------|----------|--------------------------|
+            | Median RTFx | ${medianRTFx}x | ~2.5x |
+            | Overall RTFx | ${overallRTFx}x | ~2.5x |
+
             <sub>Runtime: ${{ steps.smoketest.outputs.EXECUTION_TIME }}</sub>
 
             <sub>**Note:** CI VM lacks physical GPU — CoreML MLState (macOS 15) KV cache produces degraded results on virtualized runners. On Apple Silicon: ~1.3% WER / 2.5x RTFx.</sub>
diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml
index 2f9edd701..a3e04d662 100644
--- a/.github/workflows/sortformer-benchmark.yml
+++ b/.github/workflows/sortformer-benchmark.yml
@@ -115,6 +115,13 @@ jobs:
           echo "DETECTED=${DETECTED}" >> $GITHUB_OUTPUT
           echo "GROUND_TRUTH=${GROUND_TRUTH}" >> $GITHUB_OUTPUT
 
+          # Validate RTFx - 0 indicates benchmark failure
+          if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then
+            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
+            echo "RTFx value: $RTF"
+            exit 1
+          fi
+
       - name: Comment PR with Benchmark Results
         if: always()
         uses: actions/github-script@v7
diff --git a/.github/workflows/vad-benchmark.yml b/.github/workflows/vad-benchmark.yml
index 3a75a0e60..22806c1fc 100644
--- a/.github/workflows/vad-benchmark.yml
+++ b/.github/workflows/vad-benchmark.yml
@@ -74,6 +74,32 @@ jobs:
             --threshold 0.5 \
             --output voices_vad_results.json
 
+      - name: Validate RTFx metrics
+        run: |
+          # Validate MUSAN RTFx
+          if [ -f musan_vad_results.json ]; then
+            MUSAN_RTFx=$(jq -r '.rtfx // 0' musan_vad_results.json)
+            if [ "$MUSAN_RTFx" = "0" ] || [ -z "$MUSAN_RTFx" ]; then
+              echo "❌ CRITICAL: MUSAN RTFx is 0 or empty - benchmark failed"
+              exit 1
+            fi
+          else
+            echo "❌ CRITICAL: musan_vad_results.json not found"
+            exit 1
+          fi
+
+          # Validate VOiCES RTFx
+          if [ -f voices_vad_results.json ]; then
+            VOICES_RTFx=$(jq -r '.rtfx // 0' voices_vad_results.json)
+            if [ "$VOICES_RTFx" = "0" ] || [ -z "$VOICES_RTFx" ]; then
+              echo "❌ CRITICAL: VOiCES RTFx is 0 or empty - benchmark failed"
+              exit 1
+            fi
+          else
+            echo "❌ CRITICAL: voices_vad_results.json not found"
+            exit 1
+          fi
+
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4

From eea5376e9b660c2d9b5927742c9a55275cd17058 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:58:10 -0400
Subject: [PATCH 2/2] Fix EXECUTION_TIME calculation and improve validation
 consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes issues identified in review:

1. Move EXECUTION_TIME calculation before RTFx validation (qwen3)
   - Previously exit 1 prevented EXECUTION_TIME from being set
   - Now PR comments show proper runtime even when validation fails

2. Standardize RTFx error messages to "❌ CRITICAL:" across all workflows
   - Changed "⚠️" to "❌ CRITICAL:" in asr-benchmark.yml, matching the
     format the other five workflows already use
   - Per-metric RTFx validation failures now use the same format

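3. Add more zero format checks (0, 0.0, 0.00)
   - Catches edge cases like "0" or "0.0" in addition to "0.00"
   - More robust string comparison for RTFx validation
   - Applies to asr, parakeet-eou and qwen3 only; this patch does not
     touch diarizer, sortformer or vad, which still compare against "0"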

Workflows updated:
- qwen3-asr-benchmark.yml: Move EXECUTION_TIME before validation, add zero variants
- asr-benchmark.yml: Standardize error messages, add zero variants
- parakeet-eou-benchmark.yml: Add zero variants and empty check

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 .github/workflows/asr-benchmark.yml          | 24 ++++++++++----------
 .github/workflows/parakeet-eou-benchmark.yml |  2 +-
 .github/workflows/qwen3-asr-benchmark.yml    | 12 ++++++----
 3 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/asr-benchmark.yml b/.github/workflows/asr-benchmark.yml
index bf201c414..dcad9c70b 100644
--- a/.github/workflows/asr-benchmark.yml
+++ b/.github/workflows/asr-benchmark.yml
@@ -199,28 +199,28 @@ jobs:
           echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT
 
           # Validate RTFx values - 0 indicates benchmark failure
-          if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "N/A" ]; then
-            echo "⚠️ test-clean RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "0.0" ] || [ "$CLEAN_RTFx" = "0" ] || [ "$CLEAN_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: test-clean RTFx is 0 or N/A - benchmark failed"
             CLEAN_RTFX_FAILED=1
           fi
-          if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then
-            echo "⚠️ test-clean (v2) RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "0.0" ] || [ "$CLEAN_V2_RTFx" = "0" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: test-clean (v2) RTFx is 0 or N/A - benchmark failed"
             CLEAN_V2_RTFX_FAILED=1
           fi
-          if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "N/A" ]; then
-            echo "⚠️ test-other RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "0.0" ] || [ "$OTHER_RTFx" = "0" ] || [ "$OTHER_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: test-other RTFx is 0 or N/A - benchmark failed"
             OTHER_RTFX_FAILED=1
           fi
-          if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then
-            echo "⚠️ test-other (v2) RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "0.0" ] || [ "$OTHER_V2_RTFx" = "0" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: test-other (v2) RTFx is 0 or N/A - benchmark failed"
             OTHER_V2_RTFX_FAILED=1
           fi
-          if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "N/A" ]; then
-            echo "⚠️ streaming RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "0.0" ] || [ "$STREAMING_RTFx" = "0" ] || [ "$STREAMING_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: streaming RTFx is 0 or N/A - benchmark failed"
             STREAMING_RTFX_FAILED=1
           fi
-          if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then
-            echo "⚠️ streaming (v2) RTFx is 0 or N/A - benchmark may have failed"
+          if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "0.0" ] || [ "$STREAMING_V2_RTFx" = "0" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then
+            echo "❌ CRITICAL: streaming (v2) RTFx is 0 or N/A - benchmark failed"
             STREAMING_V2_RTFX_FAILED=1
           fi
 
diff --git a/.github/workflows/parakeet-eou-benchmark.yml b/.github/workflows/parakeet-eou-benchmark.yml
index 0b3bb62ba..b5a23d4df 100644
--- a/.github/workflows/parakeet-eou-benchmark.yml
+++ b/.github/workflows/parakeet-eou-benchmark.yml
@@ -105,7 +105,7 @@ jobs:
           echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT
 
           # Validate RTFx - 0 or N/A indicates benchmark failure
-          if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "N/A" ]; then
+          if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "0.0" ] || [ "$RTFx" = "0" ] || [ "$RTFx" = "N/A" ] || [ -z "$RTFx" ]; then
             echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed"
             echo "RTFx value: $RTFx"
             exit 1
diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml
index ac1e15580..a23795369 100644
--- a/.github/workflows/qwen3-asr-benchmark.yml
+++ b/.github/workflows/qwen3-asr-benchmark.yml
@@ -69,6 +69,10 @@ jobs:
             echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT
           fi
 
+          # Calculate execution time before validation (needed for PR comment)
+          EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s
+          echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT
+
           # Extract RTFx metrics if results file exists
           if [ -f qwen3_results_int8.json ]; then
             MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null)
@@ -80,8 +84,9 @@ jobs:
             echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT
             echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT
 
-            # Fail if RTFx is 0 or N/A - indicates benchmark failure
-            if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ]; then
+            # Validate RTFx - fail if 0 or N/A (indicates benchmark failure)
+            if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$MEDIAN_RTFx" = "0" ] || [ "$MEDIAN_RTFx" = "0.0" ] || \
+               [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "0" ] || [ "$OVERALL_RTFx" = "0.0" ]; then
               echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results"
               echo "Median RTFx: $MEDIAN_RTFx"
               echo "Overall RTFx: $OVERALL_RTFx"
@@ -94,9 +99,6 @@ jobs:
             exit 1
           fi
 
-          EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s
-          echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT
-
       - name: Comment PR
         if: always() && github.event_name == 'pull_request'
         continue-on-error: true