From 9addf6179104bd3e0600d72c9ffdb0e60aeee3a9 Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Tue, 16 Jun 2026 17:08:41 -0400
Subject: [PATCH 1/8] feat(guardrails): Benchmark analysis in-progress

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                     |  84 +++++-
 .../benchmarks/configs/mock_llm/README.md     |  21 ++
 .../benchmarks/configs/mock_llm/app-llm.env   |  19 ++
 .../configs/mock_llm/content-safety-llm.env   |  19 ++
 .../results/local-baseline-2026-06-16.md      | 154 ++++++++++
 .../benchmarks/aiperf_runner.py               |  12 +-
 .../benchmarks/analyze.py                     | 270 ++++++++++++++++++
 .../benchmarks/constants.py                   |  11 +
 .../benchmarks/paths.py                       |  26 ++
 .../nemo_guardrails_plugin/benchmarks/run.py  | 238 +++++++++++----
 .../benchmarks/seeding.py                     |  26 ++
 11 files changed, 826 insertions(+), 54 deletions(-)
 create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
 create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
 create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env
 create mode 100644 plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md
 create mode 100644 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f8643c9cb5..40d8689002 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1072,10 +1072,21 @@ jobs:
           path: web/packages/studio/playwright-report/
 
   benchmark-guardrails:
-    name: Guardrails plugin benchmark
+    # Run the two benchmark variants as parallel matrix jobs, each with its
+    # own NMP instance. This isolates them from each other (no shared mocks,
+    # no cross-talk on :8080) and roughly halves wall-clock vs. the previous
+    # single-job sequential layout. The `benchmark-guardrails-analyze` job
+    # below merges both artifacts and prints the with-vs-without comparison.
+    name: Guardrails plugin benchmark (${{ matrix.variant }})
     if: github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    strategy:
+      # Don't cancel the other variant if one fails; the partial artifact
+      # is still useful for diagnosing what went wrong.
+      fail-fast: false
+      matrix:
+        variant: [with-guardrails, without-guardrails]
     steps:
       - name: Checkout nemo-platform
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
@@ -1102,7 +1113,13 @@ jobs:
           PYTORCH_DEPS: cpu
       - name: Run benchmark sweep
         working-directory: nemo-platform
-        run: make benchmark-guardrails
+        # Pin both variants to the same `--run-id` so when the analyze job
+        # downloads both artifacts into one `runs/` parent, they merge into
+        # a single run directory the analyzer can read normally.
+        run: |
+          make benchmark-guardrails BENCHMARK_ARGS="\
+            --variant ${{ matrix.variant }} \
+            --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}"
         env:
           NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails
           _TYPER_FORCE_DISABLE_TERMINAL: "1"
@@ -1110,7 +1127,67 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: benchmark-guardrails-results
+          # Per-variant artifact name; GHA disallows two artifacts with the
+          # same name in one workflow run.
+          name: benchmark-guardrails-results-${{ matrix.variant }}
+          retention-days: 30
+          path: |
+            nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+
+  benchmark-guardrails-analyze:
+    # Joins the two parallel matrix jobs above, downloads both artifacts into
+    # one merged `runs/<id>/` tree, and prints the with-vs-without
+    # comparison table. No regression gate yet: this exists to produce
+    # CI-collected numbers we can use to seed a baseline file later.
+    name: Guardrails benchmark analysis
+    needs: [benchmark-guardrails]
+    if: github.event_name == 'workflow_dispatch' && !cancelled()
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout nemo-platform
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          path: nemo-platform
+      - name: Download with-guardrails artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: benchmark-guardrails-results-with-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Download without-guardrails artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: benchmark-guardrails-results-without-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Install uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        with:
+          working-directory: nemo-platform
+          python-version: "3.11"
+          enable-cache: true
+      - name: Bootstrap Python environment
+        working-directory: nemo-platform
+        run: make bootstrap-python
+        env:
+          PYTORCH_DEPS: cpu
+      - name: Print benchmark comparison
+        working-directory: nemo-platform
+        # Both matrix jobs above use the same `--run-id`, so there should be
+        # exactly one merged run directory under `runs/`. `ls -td ... | head -1`
+        # is defensive in case that ever stops being true (e.g. an artifact
+        # ever leaks in from a previous workflow attempt).
+        run: |
+          RUN_DIR=$(ls -td plugins/nemo-guardrails/benchmarks/artifacts/runs/*/ | head -1)
+          echo "Analyzing run directory: $RUN_DIR"
+          uv run --frozen --package nemo-guardrails-plugin --extra bench \
+            python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR"
+      - name: Upload merged benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          # Single merged artifact so collecting baseline samples is a matter
+          # of downloading one thing per workflow run rather than two.
+          name: benchmark-guardrails-results-merged
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
@@ -1262,6 +1339,7 @@ jobs:
       - web-studio-deps
       - web-studio-e2e
       - benchmark-guardrails
+      - benchmark-guardrails-analyze
       - opa-policy-test
     if: always()
     runs-on: ubuntu-latest
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
new file mode 100644
index 0000000000..1cd5cc3dd4
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
@@ -0,0 +1,21 @@
+# Mock LLM configurations
+
+These `.env` files configure the upstream `benchmark.mock_llm_server.run_server`
+(from the `NeMo-Guardrails` checkout) for the IGW guardrails benchmark.
+
+We keep our own copies (instead of pointing at the upstream checkout's
+`benchmark/mock_llm_server/configs/`) so:
+
+- We can change mock latency without touching the upstream repo. The original
+  motivation was tuning `E2E_LATENCY_*` to isolate NMP+middleware overhead
+  from mandatory NIM sleep (see the benchmark README for the full rationale).
+- The exact mock behavior we benchmarked against is versioned alongside the
+  results, so historical numbers stay reproducible even if upstream changes
+  its defaults.
+
+Initial contents are a verbatim copy of the upstream files:
+
+- `app-llm.env`            ← upstream `meta-llama-3.3-70b-instruct.env`
+- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
+
+Update either file to change mock behavior for the next benchmark run.
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
new file mode 100644
index 0000000000..0cec095b38
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
@@ -0,0 +1,19 @@
+MODEL="meta/llama-3.3-70b-instruct"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
+SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=4.0
+E2E_LATENCY_MAX_SECONDS=4.0
+E2E_LATENCY_MEAN_SECONDS=4.0
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.3
+TTFT_MAX_SECONDS=0.3
+TTFT_MEAN_SECONDS=0.3
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env
new file mode 100644
index 0000000000..9e2467b1db
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env
@@ -0,0 +1,19 @@
+MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
+SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=0.5
+E2E_LATENCY_MAX_SECONDS=0.5
+E2E_LATENCY_MEAN_SECONDS=0.5
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.2
+TTFT_MAX_SECONDS=0.2
+TTFT_MEAN_SECONDS=0.2
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
diff --git a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md
new file mode 100644
index 0000000000..509c9e8c62
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md
@@ -0,0 +1,154 @@
+# Local baseline results — 2026-06-16
+
+Three back-to-back runs of `make benchmark-guardrails` on a local MacBook Pro
+(Apple Silicon), no other heavy workloads running. Goal: characterize the
+run-to-run variance of the new with-guardrails / without-guardrails harness so
+we can decide what's gateable in CI.
+
+## Hardware / setup
+
+- Host: MacBook Pro, Apple Silicon, on AC power
+- NMP, mocks, shim: all on localhost
+- Mock LLM config: in-repo defaults (`plugins/nemo-guardrails/benchmarks/configs/mock_llm/`)
+  - app LLM: 4.0s e2e latency, std 0
+  - content-safety LLM: 0.5s e2e latency, std 0
+- AIPerf sweep: concurrency `[1, 2, 4, 8, 16, 32, 64]`, `benchmark_duration: 60s`,
+  `warmup_request_count: 10`, non-streaming chat completions
+- Mock workers: 4 (default)
+- Three runs in the same afternoon, NMP data dir reused across runs
+
+## Run inventory
+
+| Run | Run dir | Notes |
+|---|---|---|
+| 1 | `20260616_123851` | first run after the with/without harness change |
+| 2 | `20260616_145058` | identical config |
+| 3 | `20260616_152834` | identical config |
+
+All three runs completed with 7/7 sweeps passing per variant, exit code 0.
+
+## Δp50 (with-guardrails − without-guardrails), milliseconds
+
+This is the headline metric: how much wall-clock time the guardrails middleware
+adds on top of the bare NMP+IGW path, including the two content-safety LLM
+round-trips that the rails cause but don't do themselves.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64    |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|--------:|
+| Run 1   | 1029 | 1071 | 1068 | 1104 | 1145 | 1260 |    778  |
+| Run 2   | 1027 | 1062 | 1096 | 1105 | 1226 | 1256 |  -2896  |
+| Run 3   | 1030 | 1062 | 1079 | 1070 | 1118 | 1201 |  -2077  |
+| **mean**| **1029** | **1065** | **1081** | **1093** | **1163** | **1239** | **−1398** |
+| range   |    3 |    9 |   28 |   35 |  108 |   59 |   3674  |
+| range % | 0.3% | 0.8% | 2.6% | 3.2% | 9.3% | 4.8% |   n/a   |
+
+## with-guardrails p50 (absolute), milliseconds
+
+Useful as a sanity check that nothing catastrophic shifted in the absolute
+numbers — even if Δp50 stays steady, both variants could slow down together.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Run 1   | 5049 | 5101 | 5114 | 5152 | 5201 | 5318 | 6164 |
+| Run 2   | 5048 | 5093 | 5125 | 5137 | 5255 | 5279 | 5614 |
+| Run 3   | 5050 | 5094 | 5123 | 5146 | 5163 | 5250 | 5486 |
+| **mean**| **5049** | **5096** | **5121** | **5145** | **5206** | **5282** | **5755** |
+| range   |    2 |    8 |   11 |   15 |   92 |   68 |  678 |
+| range % | 0.0% | 0.2% | 0.2% | 0.3% | 1.8% | 1.3% | 11.8%|
+
+## without-guardrails p50 (absolute), milliseconds
+
+For completeness. This is the variant that's wildly unstable at c=64.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Run 1   | 4020 | 4030 | 4045 | 4048 | 4056 | 4058 | 5386 |
+| Run 2   | 4020 | 4031 | 4029 | 4032 | 4029 | 4023 | 8510 |
+| Run 3   | 4020 | 4032 | 4044 | 4076 | 4045 | 4049 | 7563 |
+| **mean**| **4020** | **4031** | **4039** | **4052** | **4043** | **4043** | **7153** |
+| range   |    0 |    2 |   16 |   44 |   27 |   35 | 3124 |
+
+The app mock sleeps for exactly 4.0s. The ~20–80 ms above 4000 across c=1–c=32
+is pure NMP+IGW+shim overhead. At c=64 the mock saturates (4 workers × 1 req/4s
+= 4 RPS ceiling, vs. 64 requested in-flight) and requests queue.
+
+## p90 — informational only
+
+p90 is much noisier than p50 across runs. Not gateable with three samples.
+
+### Δp90, milliseconds
+
+| Run   | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64  |
+|-------|-----:|-----:|-----:|-----:|-----:|-----:|------:|
+| Run 1 | 1039 | 1099 | 1162 | 1025 |  911 |  604 | 3009  |
+| Run 2 | 1028 | 1115 | 1160 | 1262 |  783 |  641 | 1015  |
+| Run 3 | 1023 | 1076 | 1189 | 1085 | 1209 |   18 | 1998  |
+
+## Observations
+
+### What's stable enough to gate on
+
+**c=1, 2, 4, 8.** The Δp50 ranges are 3–35 ms, well under any tolerance we'd
+realistically write. The absolute with-guardrails p50 is even tighter (2–15 ms
+across three runs). This is the regime where the harness is genuinely measuring
+what we want: NMP+middleware overhead on top of fixed-latency mocks.
+
+### What's borderline
+
+**c=16.** Δp50 range is 9.3%. Gateable with a generous tolerance (~10%+) but
+adds limited signal beyond c=8.
+
+### What's not gateable
+
+**c=32.** ~5% Δp50 range. Still bounded, but the run-to-run distance is
+several times larger than at c=1–c=8 and the absolute numbers wobble too.
+
+**c=64.** Unusable. Δp50 swings from +778 to −2896 across three runs.
+Root cause is the app mock's 4-worker saturation at this load level: the
+without-guardrails path fires app requests as fast as it can and the mock queues
+unpredictably. The with-guardrails path's CS-mock work paces requests enough to
+hide most of this. This is a test-rig artifact, not an NMP behavior.
+
+### Side observation: middleware overhead is small
+
+Of the ~1029 ms Δp50 at c=1:
+- ~1000 ms is the two content-safety mock round-trips (0.5s each, mandatory).
+- ~29 ms is the middleware's *own* work (rails orchestration, request/response
+  shaping, etc.) plus bare NMP+IGW overhead delta vs. without-guardrails.
+
+The without-guardrails baseline of ~4020 ms at c=1 against a 4000 ms mock means
+**bare NMP+IGW+shim overhead is ~20 ms** at idle.
+
+## Recommendation for the CI gate
+
+Based on the variance data above:
+
+| Concurrency | Gate Δp50? | Gate absolute with-guardrails p50? | Notes |
+|---|---|---|---|
+| 1  | yes | yes | tightest signal |
+| 2  | yes | yes | |
+| 4  | yes | yes | |
+| 8  | yes | yes | |
+| 16 | informational | informational | record but don't fail |
+| 32 | informational | informational | record but don't fail |
+| 64 | exclude | exclude | mock saturation, not gateable |
+
+Proposed tolerance bands (`max(absolute_ms, relative_%)`):
+- Δp50: `max(±100 ms, ±5%)`
+- with-guardrails p50: `max(±150 ms, ±3%)`
+
+Both bands are ~3× the observed local run-to-run range, leaving headroom for
+CI hardware noise being noisier than a quiet laptop.
+
+## Open questions / followups
+
+- **Local baselines won't transfer to CI hardware.** These numbers should seed
+  the baseline file but be replaced once we have N runs from the actual CI
+  runner class.
+- **Three samples is a small N.** Worth one more local run (Run 4) before we
+  treat the means above as canonical, but the c=1–c=8 numbers are unlikely
+  to budge meaningfully.
+- **c=64 instability is downstream of NMP.** Hypothesis: app mock's 4 workers
+  saturate at concurrency 64 (4 RPS ceiling on 4.0s sleep). Easy to test by
+  running with `--mock-workers 16`. Not blocking the gate work since c=64 is
+  excluded anyway.
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
index f7ff6729c1..5cbfe70071 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
@@ -60,12 +60,15 @@ def prepare_runtime_aiperf_config(
     template_path: Path,
     runtime_config_path: Path,
     aiperf_output_dir: Path,
+    model_ref: str | None = None,
 ) -> dict[str, Any]:
     """Materialize the AIPerf config this run will use.
 
     Reads the checked-in ``template_path`` config, overrides its
-    ``output_base_dir`` to point inside the current run's directory, and writes
-    the result to ``runtime_config_path``. AIPerf is later invoked with
+    ``output_base_dir`` to point inside the current run's directory, optionally
+    overrides ``base_config.model`` (so the same template can target multiple
+    VirtualModels in one harness invocation), and writes the result to
+    ``runtime_config_path``. AIPerf is later invoked with
     ``--config-file <runtime_config_path>`` so every artifact lands under a
     separate per-run directory.
 
@@ -82,6 +85,11 @@ def prepare_runtime_aiperf_config(
     # Point AIPerf's output_base_dir at this run's directory so its results
     # nest under our per-run artifacts tree.
     config["output_base_dir"] = str(aiperf_output_dir)
+    if model_ref is not None:
+        base_config = config.get("base_config")
+        if not isinstance(base_config, dict):
+            raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}")
+        base_config["model"] = model_ref
     runtime_config_path.parent.mkdir(parents=True, exist_ok=True)
     runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")
 
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
new file mode 100644
index 0000000000..6262159b8a
--- /dev/null
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -0,0 +1,270 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Post-run analyzer for the nemo-guardrails IGW benchmark.
+
+Reads the ``profile_export_aiperf.csv`` files produced by both benchmark
+variants in a single run directory and prints a side-by-side latency table:
+``with``, ``without``, and the ``Δ`` per concurrency level.
+
+The delta is what answers the question "how much latency does the guardrails
+middleware add", since the only thing that differs between the two variants
+is whether the middleware is attached to the targeted VirtualModel.
+
+Used two ways:
+
+* Standalone:
+  ``python -m nemo_guardrails_plugin.benchmarks.analyze <run-dir>``
+* Auto-invoked from ``run.py`` at the end of a sweep that ran both variants.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import logging
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+from nemo_guardrails_plugin.benchmarks.constants import (
+    VARIANT_WITH_GUARDRAILS,
+    VARIANT_WITHOUT_GUARDRAILS,
+)
+
+log = logging.getLogger(__name__)
+
+# Concurrency levels we expect from the sweep config. The analyzer only
+# reports on levels that exist in both variants; missing rungs are skipped
+# silently rather than failing, since a partial run can still be informative.
+_EXPECTED_CONCURRENCIES = (1, 2, 4, 8, 16, 32, 64)
+
+# Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of
+# metrics; this one is end-to-end request wall time including the shim hop,
+# IGW, middleware, and any mock-NIM round-trips.
+_LATENCY_METRIC = "Request Latency (ms)"
+
+
+@dataclass(frozen=True)
+class LatencyRow:
+    """Per-concurrency latency stats parsed from one AIPerf CSV."""
+
+    concurrency: int
+    avg: float
+    p50: float
+    p90: float
+    p99: float
+    std: float
+
+
+@dataclass(frozen=True)
+class ComparisonRow:
+    """Side-by-side comparison of one concurrency level across variants."""
+
+    concurrency: int
+    with_guardrails: LatencyRow
+    without_guardrails: LatencyRow
+
+    @property
+    def delta_p50(self) -> float:
+        return self.with_guardrails.p50 - self.without_guardrails.p50
+
+    @property
+    def delta_p90(self) -> float:
+        return self.with_guardrails.p90 - self.without_guardrails.p90
+
+    @property
+    def delta_avg(self) -> float:
+        return self.with_guardrails.avg - self.without_guardrails.avg
+
+
+def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]:
+    """Load per-concurrency latency stats for one variant.
+
+    Walks the same ``<batch>/<timestamp>/concurrency<N>/`` layout that
+    ``collect_sweep_results`` produces. Missing files are logged and skipped
+    rather than raising, so a partial run still produces a useful table.
+    """
+    if not variant_output_dir.is_dir():
+        return {}
+
+    rows: dict[int, LatencyRow] = {}
+    for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()):
+        for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()):
+            for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()):
+                concurrency = _parse_concurrency_from_label(sweep_dir.name)
+                if concurrency is None:
+                    continue
+                csv_path = sweep_dir / "profile_export_aiperf.csv"
+                row = _read_latency_row(csv_path, concurrency)
+                if row is not None:
+                    rows[concurrency] = row
+    return rows
+
+
+def compare(
+    with_rows: dict[int, LatencyRow],
+    without_rows: dict[int, LatencyRow],
+) -> list[ComparisonRow]:
+    """Build per-concurrency comparison rows, ordered by concurrency.
+
+    Only concurrencies present in *both* variants are included; an asymmetry
+    means one variant failed for that level and the comparison is undefined.
+    """
+    shared = sorted(set(with_rows) & set(without_rows))
+    return [ComparisonRow(c, with_rows[c], without_rows[c]) for c in shared]
+
+
+def format_table(rows: list[ComparisonRow]) -> str:
+    """Render the comparison as a fixed-width text table."""
+    if not rows:
+        return "No comparable sweep results found (need both variants to share concurrency levels)."
+
+    header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg")
+    fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
+    lines = [fmt.format(*header), "-" * 109]
+    for r in rows:
+        lines.append(
+            fmt.format(
+                r.concurrency,
+                f"{r.with_guardrails.p50:.0f}",
+                f"{r.without_guardrails.p50:.0f}",
+                f"{r.delta_p50:+.0f}",
+                f"{r.with_guardrails.p90:.0f}",
+                f"{r.without_guardrails.p90:.0f}",
+                f"{r.delta_p90:+.0f}",
+                f"{r.with_guardrails.avg:.0f}",
+                f"{r.without_guardrails.avg:.0f}",
+                f"{r.delta_avg:+.0f}",
+            )
+        )
+    lines.append("")
+    lines.append("All values in milliseconds. 'Δ' = with-guardrails minus without-guardrails.")
+    return "\n".join(lines)
+
+
+def analyze_run(run_dir: Path) -> str:
+    """Top-level: read both variants from one run dir and return a printable table."""
+    aiperf_dir = run_dir / "aiperf_results"
+    with_rows = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
+    without_rows = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
+
+    if not with_rows and not without_rows:
+        return f"No AIPerf results found under {aiperf_dir}"
+    if not with_rows or not without_rows:
+        # Single-variant run: dump whichever side is present without trying
+        # to compute deltas.
+        only = with_rows or without_rows
+        which = VARIANT_WITH_GUARDRAILS if with_rows else VARIANT_WITHOUT_GUARDRAILS
+        return _format_single_variant(which, only)
+
+    rows = compare(with_rows, without_rows)
+    return format_table(rows)
+
+
+def _format_single_variant(variant: str, rows: dict[int, LatencyRow]) -> str:
+    """Render one variant's table when the other variant didn't run."""
+    fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}"
+    lines = [
+        f"Only one variant present: {variant}",
+        fmt.format("conc", "avg", "p50", "p90", "std"),
+        "-" * 49,
+    ]
+    for c in sorted(rows):
+        r = rows[c]
+        lines.append(fmt.format(c, f"{r.avg:.0f}", f"{r.p50:.0f}", f"{r.p90:.0f}", f"{r.std:.0f}"))
+    lines.append("")
+    lines.append("All values in milliseconds.")
+    return "\n".join(lines)
+
+
+def _parse_concurrency_from_label(label: str) -> int | None:
+    """Extract the integer N from a sweep label like ``concurrency16``.
+
+    Returns ``None`` for labels that don't match the expected pattern so
+    unrelated subdirectories (logs, etc.) are skipped silently.
+    """
+    if not label.startswith("concurrency"):
+        return None
+    try:
+        return int(label.removeprefix("concurrency"))
+    except ValueError:
+        return None
+
+
+def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None:
+    """Pull the ``Request Latency (ms)`` line out of an AIPerf CSV.
+
+    AIPerf writes a header row followed by one ``Metric,avg,min,max,sum,p1,...``
+    row per metric, then a blank line and a second small block. We only care
+    about the first block.
+    """
+    if not csv_path.is_file():
+        log.debug("Missing CSV at %s; skipping", csv_path)
+        return None
+
+    try:
+        with csv_path.open(encoding="utf-8") as f:
+            reader = csv.reader(f)
+            header = next(reader, None)
+            if not header or header[0] != "Metric":
+                log.warning("Unexpected header in %s: %s", csv_path, header)
+                return None
+            try:
+                col = {name: header.index(name) for name in ("avg", "p50", "p90", "p99", "std")}
+            except ValueError as exc:
+                log.warning("Missing expected column in %s: %s", csv_path, exc)
+                return None
+            for row in reader:
+                if not row:
+                    break  # end of first block
+                if row[0] == _LATENCY_METRIC:
+                    return LatencyRow(
+                        concurrency=concurrency,
+                        avg=float(row[col["avg"]]),
+                        p50=float(row[col["p50"]]),
+                        p90=float(row[col["p90"]]),
+                        p99=float(row[col["p99"]]),
+                        std=float(row[col["std"]]),
+                    )
+    except (OSError, ValueError, IndexError) as exc:
+        log.warning("Failed to parse %s: %s", csv_path, exc)
+        return None
+
+    log.warning("Did not find '%s' row in %s", _LATENCY_METRIC, csv_path)
+    return None
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="nemo-guardrails-benchmark-analyze",
+        description=__doc__,
+    )
+    parser.add_argument(
+        "run_dir",
+        type=Path,
+        help=(
+            "Path to a run directory under "
+            "`plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`."
+        ),
+    )
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
+    )
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s")
+
+    run_dir: Path = args.run_dir.resolve()
+    if not run_dir.is_dir():
+        print(f"Not a directory: {run_dir}", file=sys.stderr)
+        return 2
+
+    print(analyze_run(run_dir))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
index cbe4b19960..14db3561f4 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
@@ -8,6 +8,17 @@
 WORKSPACE = "benchmark"
 GUARDRAIL_CONFIG = "content-safety-local"
 VM_NAME = "guardrails-vm"
+# Control VirtualModel with no middleware attached. Used by the benchmark
+# harness to measure NMP+IGW latency *without* the guardrails middleware so
+# the with-vs-without delta isolates middleware overhead.
+NO_GUARDRAILS_VM_NAME = "no-guardrails-vm"
+
+# Logical identifiers for the two benchmark variants. Used as subdirectory
+# names under `aiperf_results/` and `logs/`, and as the value of the
+# harness's `--variant` flag.
+VARIANT_WITH_GUARDRAILS = "with-guardrails"
+VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
+ALL_VARIANTS = (VARIANT_WITH_GUARDRAILS, VARIANT_WITHOUT_GUARDRAILS)
 
 # ModelProvider that proxies requests to the mock main model
 APP_PROVIDER = "benchmark-app-llm"
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
index b1113fa2c2..80dda5d329 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
@@ -54,6 +54,13 @@ class RunPaths:
     nmp_data_dir: Path
     # Checked-in YAML template for the AIPerf sweep config.
     config_template: Path
+    # In-repo copies of the upstream mock-LLM `.env` files. We point the
+    # mock server processes at these (via `--config-file`) so we can change
+    # mock latency or refusal behavior without touching the NeMo-Guardrails
+    # checkout, and so the exact mock config a benchmark ran against stays
+    # versioned alongside our results.
+    mock_app_env: Path
+    mock_content_safety_env: Path
     # Per-run materialized copy of `config_template` with `output_base_dir`
     # overridden to `aiperf_output_dir`. AIPerf is invoked against this file.
     runtime_config: Path
@@ -75,6 +82,23 @@ def ensure_directories(self) -> None:
         ):
             path.mkdir(parents=True, exist_ok=True)
 
+    def aiperf_output_dir_for(self, variant: str) -> Path:
+        """Per-variant AIPerf output dir under ``aiperf_results/<variant>/``.
+
+        Each variant gets its own subtree so a single run can hold side-by-side
+        sweeps (e.g. ``with-guardrails`` and ``without-guardrails``) without
+        their outputs colliding.
+        """
+        return self.aiperf_output_dir / variant
+
+    def runtime_config_for(self, variant: str) -> Path:
+        """Per-variant materialized AIPerf config under ``generated/``."""
+        return self.generated_dir / f"aiperf_config_{variant}.yaml"
+
+    def aiperf_log_for(self, variant: str) -> Path:
+        """Per-variant AIPerf stdout/stderr log under ``logs/``."""
+        return self.log_dir / f"aiperf_{variant}.log"
+
 
 def _now_run_id() -> str:
     return dt.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -120,4 +144,6 @@ def build_run_paths(
         config_template=benchmark_dir / "configs" / "nmp_igw_guardrails_sweep_concurrency.yaml",
         runtime_config=run_dir / "generated" / "nmp_igw_guardrails_sweep_concurrency.yaml",
         aiperf_venv_dir=artifacts_dir / "venvs" / "aiperf",
+        mock_app_env=benchmark_dir / "configs" / "mock_llm" / "app-llm.env",
+        mock_content_safety_env=benchmark_dir / "configs" / "mock_llm" / "content-safety-llm.env",
     )
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
index 47114e0195..e3386b303c 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
@@ -25,21 +25,27 @@
 import sys
 import time
 from contextlib import ExitStack
+from dataclasses import dataclass
 from pathlib import Path
 
 from nemo_guardrails_plugin.benchmarks.aiperf_runner import (
+    SweepRunResult,
     collect_sweep_results,
     prepare_runtime_aiperf_config,
     run_aiperf_sweep,
 )
+from nemo_guardrails_plugin.benchmarks.analyze import analyze_run
 from nemo_guardrails_plugin.benchmarks.bootstrap import ensure_aiperf_venv
 from nemo_guardrails_plugin.benchmarks.constants import (
     AIPERF_SHIM_BASE_URL,
+    ALL_VARIANTS,
     APP_PROVIDER_URL,
     CS_PROVIDER_URL,
     IGW_CHAT_PATH,
     NMP_BASE_URL,
     NMP_HEALTH_PATH,
+    VARIANT_WITH_GUARDRAILS,
+    VARIANT_WITHOUT_GUARDRAILS,
     WORKSPACE,
 )
 from nemo_guardrails_plugin.benchmarks.paths import (
@@ -66,8 +72,6 @@
     Path("benchmark/aiperf/__main__.py"),
     Path("benchmark/aiperf/run_aiperf.py"),
     Path("benchmark/mock_llm_server/run_server.py"),
-    Path("benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env"),
-    Path("benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env"),
     Path("examples/configs/content_safety_local/config.yml"),
     Path("examples/configs/content_safety_local/prompts.yml"),
 )
@@ -91,6 +95,20 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None:
         )
 
 
+def _validate_in_repo_mock_configs(paths: RunPaths) -> None:
+    """Fail fast if the in-repo mock LLM env files are missing.
+
+    These configure the upstream mock server but live in this repo (under
+    `plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) so the harness can
+    tune mock behavior without forking the upstream checkout. A missing file
+    here is almost certainly a checkout problem on the NMP side.
+    """
+    missing = [p for p in (paths.mock_app_env, paths.mock_content_safety_env) if not p.is_file()]
+    if missing:
+        bullet = "\n  - ".join(str(p) for p in missing)
+        raise FileNotFoundError(f"In-repo mock LLM config files missing:\n  - {bullet}")
+
+
 def _build_mock_nim_processes(paths: RunPaths, workers: int) -> list[SupervisedProcess]:
     """Spawn ``python -m benchmark.mock_llm_server.run_server`` for both mocks.
 
@@ -122,20 +140,20 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised
             health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS,
         )
 
+    # Both env files are sourced from this repo (see
+    # plugins/nemo-guardrails/benchmarks/configs/mock_llm/) so mock behavior is
+    # versioned alongside the benchmark and independent of the upstream checkout.
     return [
-        # Main LLM mock server
         spec(
             "mock-app-llm",
             8000,
-            paths.nemoguardrails_repo_root / "benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env",
+            paths.mock_app_env,
             health_url=f"{APP_PROVIDER_URL}/health",
         ),
-        # Content-safety LLM mock server
         spec(
             "mock-content-safety-llm",
             8001,
-            paths.nemoguardrails_repo_root
-            / "benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env",
+            paths.mock_content_safety_env,
             health_url=f"{CS_PROVIDER_URL}/health",
         ),
     ]
@@ -205,6 +223,109 @@ def _smoke_test(client: NeMoPlatform, seeded: SeededResources) -> None:
     raise RuntimeError(f"Smoke test failed after 60 attempts: {last_error}")
 
 
+@dataclass(frozen=True)
+class BenchmarkOutcome:
+    """Outcome of a single benchmark variant's AIPerf sweep, used by the summary."""
+
+    variant: str
+    aiperf_exit: int
+    output_dir: Path
+    sweep_results: list[SweepRunResult]
+
+    @property
+    def failures(self) -> int:
+        return sum(1 for r in self.sweep_results if not r.passed)
+
+    @property
+    def passed(self) -> bool:
+        return self.aiperf_exit == 0 and bool(self.sweep_results) and self.failures == 0
+
+
+def _vm_ref_for_variant(variant: str, seeded: SeededResources) -> str:
+    """Pick which seeded VirtualModel a benchmark variant should target."""
+    if variant == VARIANT_WITH_GUARDRAILS:
+        return seeded.vm_ref
+    if variant == VARIANT_WITHOUT_GUARDRAILS:
+        return seeded.no_guardrails_vm_ref
+    raise ValueError(f"Unknown variant: {variant!r}")
+
+
+def _run_benchmark(
+    *,
+    variant: str,
+    paths: RunPaths,
+    seeded: SeededResources,
+    aiperf_python: Path,
+) -> BenchmarkOutcome:
+    """Materialize a per-variant AIPerf config, run the sweep, collect results."""
+    vm_ref = _vm_ref_for_variant(variant, seeded)
+    runtime_config = paths.runtime_config_for(variant)
+    aiperf_output_dir = paths.aiperf_output_dir_for(variant)
+    aiperf_log = paths.aiperf_log_for(variant)
+
+    sweep_config = prepare_runtime_aiperf_config(
+        template_path=paths.config_template,
+        runtime_config_path=runtime_config,
+        aiperf_output_dir=aiperf_output_dir,
+        model_ref=vm_ref,
+    )
+    log.info(
+        "Benchmark %s: targeting %s; concurrency=%s, duration=%ss",
+        variant,
+        vm_ref,
+        sweep_config.get("sweeps", {}).get("concurrency"),
+        sweep_config.get("base_config", {}).get("benchmark_duration"),
+    )
+    log.info(
+        "Starting AIPerf sweep [%s] against %s -> shim -> %s%s",
+        variant,
+        AIPERF_SHIM_BASE_URL,
+        NMP_BASE_URL,
+        IGW_CHAT_PATH,
+    )
+
+    aiperf_exit = run_aiperf_sweep(
+        nemoguardrails_repo_root=paths.nemoguardrails_repo_root,
+        runtime_config=runtime_config,
+        log_path=aiperf_log,
+        python_executable=str(aiperf_python),
+        venv_bin_path=paths.aiperf_venv_dir / "bin",
+    )
+
+    sweep_results = collect_sweep_results(aiperf_output_dir)
+    return BenchmarkOutcome(
+        variant=variant,
+        aiperf_exit=aiperf_exit,
+        output_dir=aiperf_output_dir,
+        sweep_results=sweep_results,
+    )
+
+
+def _summarize_benchmark_results(outcomes: list[BenchmarkOutcome]) -> int:
+    """Log per-benchmark + overall summary; return process exit code."""
+    overall_failed = False
+    for outcome in outcomes:
+        if not outcome.sweep_results:
+            log.error(
+                "Benchmark %s: aiperf exited with code %d and produced no per-sweep results in %s",
+                outcome.variant,
+                outcome.aiperf_exit,
+                outcome.output_dir,
+            )
+        else:
+            log.info(
+                "Benchmark %s: %d run(s), %d failure(s); per-sweep outputs under %s",
+                outcome.variant,
+                len(outcome.sweep_results),
+                outcome.failures,
+                outcome.output_dir,
+            )
+        if not outcome.passed:
+            overall_failed = True
+
+    return 1 if overall_failed else 0
+
+
 def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         prog="nemo-guardrails-benchmark",
@@ -244,10 +365,30 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=None,
         help="Override the per-run directory name (default: current timestamp).",
     )
+    parser.add_argument(
+        "--variant",
+        choices=(*ALL_VARIANTS, "all"),
+        default="all",
+        help=(
+            "Which sweep to run: 'with-guardrails' targets the VirtualModel with "
+            "middleware attached; 'without-guardrails' targets a control VM with "
+            "no middleware; 'all' (default) runs them sequentially against the "
+            "same NMP so the with-vs-without delta isolates middleware overhead. "
+            "In CI, run two parallel jobs with --variant=with-guardrails and "
+            "--variant=without-guardrails against isolated NMP instances."
+        ),
+    )
     parser.add_argument("--verbose", "-v", action="store_true")
     return parser.parse_args(argv)
 
 
+def _resolve_variants(variant_arg: str) -> tuple[str, ...]:
+    """Translate the ``--variant`` CLI argument into the ordered list to run."""
+    if variant_arg == "all":
+        return ALL_VARIANTS
+    return (variant_arg,)
+
+
 def main(argv: list[str] | None = None) -> int:
     args = parse_args(argv)
     _configure_logging(args.verbose)
@@ -266,20 +407,18 @@ def main(argv: list[str] | None = None) -> int:
         run_id=args.run_id,
     )
     paths.ensure_directories()
+    _validate_in_repo_mock_configs(paths)
 
     log.info("Created directory for benchmark results at: %s", paths.run_dir)
-
-    sweep_config = prepare_runtime_aiperf_config(
-        template_path=paths.config_template,
-        runtime_config_path=paths.runtime_config,
-        aiperf_output_dir=paths.aiperf_output_dir,
-    )
     log.info(
-        "AIPerf sweep: concurrency=%s, duration=%ss",
-        sweep_config.get("sweeps", {}).get("concurrency"),
-        sweep_config.get("base_config", {}).get("benchmark_duration"),
+        "Mock LLM configs: app=%s, content-safety=%s",
+        paths.mock_app_env,
+        paths.mock_content_safety_env,
     )
 
+    variants = _resolve_variants(args.variant)
+    log.info("Will run %d variant(s): %s", len(variants), ", ".join(variants))
+
     # Ensure the dedicated aiperf venv exists *before* we start any supervised
     # processes.
     aiperf_python = ensure_aiperf_venv(paths.aiperf_venv_dir)
@@ -318,43 +457,44 @@ def main(argv: list[str] | None = None) -> int:
         log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref)
         _smoke_test(client, seeded)
 
-        log.info(
-            "Starting AIPerf sweep against %s -> shim -> %s%s",
-            AIPERF_SHIM_BASE_URL,
-            NMP_BASE_URL,
-            IGW_CHAT_PATH,
-        )
-        aiperf_exit = run_aiperf_sweep(
-            nemoguardrails_repo_root=paths.nemoguardrails_repo_root,
-            runtime_config=paths.runtime_config,
-            log_path=paths.log_dir / "aiperf.log",
-            python_executable=str(aiperf_python),
-            venv_bin_path=paths.aiperf_venv_dir / "bin",
-        )
+        # Run each requested benchmark variant sequentially against the same
+        # NMP. The bootstrap (mocks, NMP, shim) is shared so the only thing
+        # that differs between variants is which VirtualModel AIPerf targets,
+        # which is exactly what makes the with-vs-without delta interpretable
+        # as middleware overhead.
+        outcomes: list[BenchmarkOutcome] = []
+        for variant in variants:
+            outcomes.append(
+                _run_benchmark(
+                    variant=variant,
+                    paths=paths,
+                    seeded=seeded,
+                    aiperf_python=aiperf_python,
+                )
+            )
 
-    sweep_results = collect_sweep_results(paths.aiperf_output_dir)
-    failures = sum(1 for r in sweep_results if not r.passed)
+    exit_code = _summarize_benchmark_results(outcomes)
+    _maybe_print_analysis(paths.run_dir, outcomes)
+    return exit_code
 
-    if not sweep_results:
-        # AIPerf exited before producing any per-sweep dirs. Surface that
-        # explicitly so the log isn't ambiguous about why we're failing.
-        log.error(
-            "aiperf exited with code %d and produced no per-sweep results in %s",
-            aiperf_exit,
-            paths.aiperf_output_dir,
-        )
-    else:
-        log.info(
-            "Sweep summary: %d run(s), %d failure(s); per-sweep outputs under %s",
-            len(sweep_results),
-            failures,
-            paths.aiperf_output_dir,
-        )
 
-    if failures or aiperf_exit != 0 or not sweep_results:
-        return 1
+def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None:
+    """Print the with-vs-without comparison table when it's meaningful.
 
-    return 0
+    Only invoked when at least one variant produced results, since the
+    analyzer can also dump a single-variant table for partial runs. The call
+    is wrapped in a broad try/except: the analyzer is pure post-processing
+    over already-written artifacts, so a bug here must not change the
+    harness's exit code or hide a real benchmark failure.
+    """
+    if not any(o.sweep_results for o in outcomes):
+        return
+    try:
+        report = analyze_run(run_dir)
+    except Exception as exc:
+        log.warning("Analyzer failed; skipping summary table: %s", exc)
+        return
+    log.info("Benchmark analysis:\n%s", report)
 
 
 if __name__ == "__main__":
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
index 96cd152f0f..97f9b0df0f 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
@@ -28,6 +28,7 @@
     GUARDRAIL_CONFIG,
     GUARDRAILS_MIDDLEWARE_CONFIG_TYPE,
     GUARDRAILS_MIDDLEWARE_NAME,
+    NO_GUARDRAILS_VM_NAME,
     VM_NAME,
     WORKSPACE,
 )
@@ -52,11 +53,19 @@ class SeededResources:
     cs_model_entity: str
     guardrail_config_name: str
     vm_name: str
+    # Sibling VM with no middleware attached, used as the control variant by
+    # the benchmark harness. Same default_model_entity and models list as the
+    # guardrails VM so the only difference is middleware attachment.
+    no_guardrails_vm_name: str
 
     @property
     def vm_ref(self) -> str:
         return f"{self.workspace}/{self.vm_name}"
 
+    @property
+    def no_guardrails_vm_ref(self) -> str:
+        return f"{self.workspace}/{self.no_guardrails_vm_name}"
+
     @property
     def guardrail_config_ref(self) -> str:
         return f"{self.workspace}/{self.guardrail_config_name}"
@@ -171,6 +180,22 @@ def seed_benchmark(
     )
     _dump_model(generated_dir / "virtual_model.json", vm)
 
+    # Control VirtualModel: identical to the guardrails VM except no
+    # middleware. The harness uses this to measure NMP+IGW latency without
+    # the guardrails middleware so the with-vs-without delta isolates
+    # middleware overhead.
+    log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME)
+    no_guardrails_vm = client.inference.virtual_models.create(
+        workspace=WORKSPACE,
+        name=NO_GUARDRAILS_VM_NAME,
+        default_model_entity=app_entity,
+        models=vm_models,
+        request_middleware=[],
+        response_middleware=[],
+        exist_ok=True,
+    )
+    _dump_model(generated_dir / "virtual_model_no_guardrails.json", no_guardrails_vm)
+
     return SeededResources(
         workspace=WORKSPACE,
         app_provider_name=APP_PROVIDER,
@@ -179,6 +204,7 @@ def seed_benchmark(
         cs_model_entity=cs_entity,
         guardrail_config_name=GUARDRAIL_CONFIG,
         vm_name=VM_NAME,
+        no_guardrails_vm_name=NO_GUARDRAILS_VM_NAME,
     )
 
 

From f78ffa8a890f38b98c7f5c36cbd9d3270a83f73c Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Mon, 22 Jun 2026 09:54:10 -0400
Subject: [PATCH 2/8] Fix CI

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                     |  2 +-
 .../benchmarks/analyze.py                     |  5 +--
 .../tests/unit/benchmarks/test_seeding.py     | 31 ++++++++++++++-----
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 40d8689002..30117b985b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1177,7 +1177,7 @@ jobs:
         # is defensive in case that ever stops being true (e.g. an artifact
         # ever leaks in from a previous workflow attempt).
         run: |
-          RUN_DIR=$(ls -td plugins/nemo-guardrails/benchmarks/artifacts/runs/*/ | head -1)
+          RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
           echo "Analyzing run directory: $RUN_DIR"
           uv run --frozen --package nemo-guardrails-plugin --extra bench \
             python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR"
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
index 6262159b8a..54fa91ec50 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -243,10 +243,7 @@ def main(argv: list[str] | None = None) -> int:
     parser.add_argument(
         "run_dir",
         type=Path,
-        help=(
-            "Path to a run directory under "
-            "`plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`."
-        ),
+        help=("Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`."),
     )
     parser.add_argument(
         "--log-level",
diff --git a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
index f0ad57aeaa..8342315e13 100644
--- a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
+++ b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
@@ -14,6 +14,7 @@
     CS_MODEL_NAME,
     CS_PROVIDER,
     GUARDRAIL_CONFIG,
+    NO_GUARDRAILS_VM_NAME,
     VM_NAME,
     WORKSPACE,
 )
@@ -145,12 +146,18 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path
         cs_entity = seeded.cs_model_entity
         assert gc_call.kwargs["data"]["models"][0]["model"] == cs_entity
 
-        # VirtualModel uses the discovered app entity and points middleware at the
-        # guardrail config we just created.
-        vm_call = fake_client.inference.virtual_models.create.call_args
-        assert vm_call.kwargs["name"] == VM_NAME
-        assert vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
-        assert vm_call.kwargs["models"] == [{"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}]
+        # Two VirtualModels are created: the guardrails VM (with middleware) and
+        # a control VM (no middleware) used by the without-guardrails benchmark
+        # variant.
+        vm_calls = fake_client.inference.virtual_models.create.call_args_list
+        assert len(vm_calls) == 2
+
+        guardrails_vm_call = vm_calls[0]
+        assert guardrails_vm_call.kwargs["name"] == VM_NAME
+        assert guardrails_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
+        assert guardrails_vm_call.kwargs["models"] == [
+            {"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}
+        ]
         expected_middleware = [
             {
                 "name": "nemo-guardrails",
@@ -158,8 +165,14 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path
                 "config_id": f"{WORKSPACE}/{GUARDRAIL_CONFIG}",
             }
         ]
-        assert vm_call.kwargs["request_middleware"] == expected_middleware
-        assert vm_call.kwargs["response_middleware"] == expected_middleware
+        assert guardrails_vm_call.kwargs["request_middleware"] == expected_middleware
+        assert guardrails_vm_call.kwargs["response_middleware"] == expected_middleware
+
+        control_vm_call = vm_calls[1]
+        assert control_vm_call.kwargs["name"] == NO_GUARDRAILS_VM_NAME
+        assert control_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
+        assert control_vm_call.kwargs["request_middleware"] == []
+        assert control_vm_call.kwargs["response_middleware"] == []
 
     def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path: Path) -> None:
         ng_root = tmp_path / "NeMo-Guardrails"
@@ -176,6 +189,7 @@ def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path
         assert (generated_dir / "app_provider.json").is_file()
         assert (generated_dir / "content_safety_provider.json").is_file()
         assert (generated_dir / "virtual_model.json").is_file()
+        assert (generated_dir / "virtual_model_no_guardrails.json").is_file()
 
         request_payload = json.loads(
             (generated_dir / "content_safety_local_nmp_request.json").read_text(encoding="utf-8")
@@ -197,6 +211,7 @@ def test_returns_seeded_resources(self, fake_client: MagicMock, tmp_path: Path)
 
         assert seeded.workspace == WORKSPACE
         assert seeded.vm_ref == f"{WORKSPACE}/{VM_NAME}"
+        assert seeded.no_guardrails_vm_name == NO_GUARDRAILS_VM_NAME
         assert seeded.guardrail_config_ref == f"{WORKSPACE}/{GUARDRAIL_CONFIG}"
 
     def test_raises_if_served_models_never_populated(self, tmp_path: Path) -> None:

From 78b7730f75232f9e7d8eb43cea96f4cd2d3d977f Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Mon, 22 Jun 2026 11:22:24 -0400
Subject: [PATCH 3/8] always run benchmark

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                     | 35 ++++----
 .../benchmarks/analyze.py                     | 81 ++++++++++++-------
 2 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 30117b985b..4181069c1a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1078,7 +1078,9 @@ jobs:
     # single-job sequential layout. The `benchmark-guardrails-analyze` job
     # below merges both artifacts and prints the with-vs-without comparison.
     name: Guardrails plugin benchmark (${{ matrix.variant }})
-    if: github.event_name == 'workflow_dispatch'
+    # TODO(jgulabrai): gate back to `workflow_dispatch` once we're done
+    # collecting baseline samples and the harness is stable enough that we
+    # don't need it firing on every PR push.
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
@@ -1141,7 +1143,7 @@ jobs:
     # CI-collected numbers we can use to seed a baseline file later.
     name: Guardrails benchmark analysis
     needs: [benchmark-guardrails]
-    if: github.event_name == 'workflow_dispatch' && !cancelled()
+    if: ${{ !cancelled() }}
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
@@ -1150,37 +1152,34 @@ jobs:
         with:
           path: nemo-platform
       - name: Download with-guardrails artifact
+        # If a variant failed entirely it may have uploaded no artifact;
+        # the analyzer handles the single-variant case so don't fail here.
+        continue-on-error: true
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: benchmark-guardrails-results-with-guardrails
           path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
       - name: Download without-guardrails artifact
+        continue-on-error: true
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: benchmark-guardrails-results-without-guardrails
           path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
-      - name: Install uv
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
-        with:
-          working-directory: nemo-platform
-          python-version: "3.11"
-          enable-cache: true
-      - name: Bootstrap Python environment
-        working-directory: nemo-platform
-        run: make bootstrap-python
-        env:
-          PYTORCH_DEPS: cpu
       - name: Print benchmark comparison
         working-directory: nemo-platform
+        # `analyze.py` is intentionally stdlib-only and import-free w.r.t. the
+        # rest of the plugin, so we run it on the runner's stock python3
+        # rather than spinning up the full `uv` workspace (which costs ~3
+        # minutes per CI run for ~50ms of CSV parsing).
+        #
         # Both matrix jobs above use the same `--run-id`, so there should be
-        # exactly one merged run directory under `runs/`. `ls -td ... | head -1`
-        # is defensive in case that ever stops being true (e.g. an artifact
-        # ever leaks in from a previous workflow attempt).
+        # exactly one merged run directory under `runs/`. The `find | sort`
+        # pipeline is defensive in case that ever stops being true.
+        # NOTE: `find -printf` is GNU-only (works on ubuntu-latest, not macOS).
         run: |
           RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
           echo "Analyzing run directory: $RUN_DIR"
-          uv run --frozen --package nemo-guardrails-plugin --extra bench \
-            python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR"
+          python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR"
       - name: Upload merged benchmark artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
index 54fa91ec50..1a31dff306 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -27,18 +27,15 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-from nemo_guardrails_plugin.benchmarks.constants import (
-    VARIANT_WITH_GUARDRAILS,
-    VARIANT_WITHOUT_GUARDRAILS,
-)
+# Variant identifiers are duplicated from `constants.py` (rather than imported)
+# so this module has zero intra-repo imports and can run on bare `python3` in
+# CI without the full `uv`/`make bootstrap-python` setup. Keep these in sync
+# with `constants.VARIANT_WITH_GUARDRAILS` / `VARIANT_WITHOUT_GUARDRAILS`.
+VARIANT_WITH_GUARDRAILS = "with-guardrails"
+VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
 
 log = logging.getLogger(__name__)
 
-# Concurrency levels we expect from the sweep config. The analyzer only
-# reports on levels that exist in both variants; missing rungs are skipped
-# silently rather than failing, since a partial run can still be informative.
-_EXPECTED_CONCURRENCIES = (1, 2, 4, 8, 16, 32, 64)
-
 # Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of
 # metrics; this one is end-to-end request wall time including the shim hop,
 # IGW, middleware, and any mock-NIM round-trips.
@@ -84,11 +81,13 @@ def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]:
     Walks the same ``<batch>/<timestamp>/concurrency<N>/`` layout that
     ``collect_sweep_results`` produces. Missing files are logged and skipped
     rather than raising, so a partial run still produces a useful table.
+
+    Returns a mapping of ``concurrency_level -> LatencyRow``.
     """
     if not variant_output_dir.is_dir():
         return {}
 
-    rows: dict[int, LatencyRow] = {}
+    latency_by_concurrency: dict[int, LatencyRow] = {}
     for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()):
         for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()):
             for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()):
@@ -98,21 +97,39 @@ def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]:
                 csv_path = sweep_dir / "profile_export_aiperf.csv"
                 row = _read_latency_row(csv_path, concurrency)
                 if row is not None:
-                    rows[concurrency] = row
-    return rows
+                    latency_by_concurrency[concurrency] = row
+    return latency_by_concurrency
 
 
 def compare(
-    with_rows: dict[int, LatencyRow],
-    without_rows: dict[int, LatencyRow],
+    latency_by_concurrency_with_guardrails: dict[int, LatencyRow],
+    latency_by_concurrency_without_guardrails: dict[int, LatencyRow],
 ) -> list[ComparisonRow]:
     """Build per-concurrency comparison rows, ordered by concurrency.
 
     Only concurrencies present in *both* variants are included; an asymmetry
     means one variant failed for that level and the comparison is undefined.
+    Asymmetric levels are logged at WARNING so silent drops are visible.
     """
-    shared = sorted(set(with_rows) & set(without_rows))
-    return [ComparisonRow(c, with_rows[c], without_rows[c]) for c in shared]
+    concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails)
+    concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails)
+    concurrencies_in_both_variants = sorted(concurrencies_with_guardrails & concurrencies_without_guardrails)
+
+    concurrencies_in_only_one_variant = sorted(concurrencies_with_guardrails ^ concurrencies_without_guardrails)
+    if concurrencies_in_only_one_variant:
+        log.warning(
+            "Concurrency levels present in only one variant, excluded from comparison: %s",
+            concurrencies_in_only_one_variant,
+        )
+
+    return [
+        ComparisonRow(
+            concurrency,
+            latency_by_concurrency_with_guardrails[concurrency],
+            latency_by_concurrency_without_guardrails[concurrency],
+        )
+        for concurrency in concurrencies_in_both_variants
+    ]
 
 
 def format_table(rows: list[ComparisonRow]) -> str:
@@ -122,7 +139,8 @@ def format_table(rows: list[ComparisonRow]) -> str:
 
     header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg")
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
-    lines = [fmt.format(*header), "-" * 109]
+    header_line = fmt.format(*header)
+    lines = [header_line, "-" * len(header_line)]
     for r in rows:
         lines.append(
             fmt.format(
@@ -146,33 +164,34 @@ def format_table(rows: list[ComparisonRow]) -> str:
 def analyze_run(run_dir: Path) -> str:
     """Top-level: read both variants from one run dir and return a printable table."""
     aiperf_dir = run_dir / "aiperf_results"
-    with_rows = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
-    without_rows = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
+    latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
+    latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
 
-    if not with_rows and not without_rows:
+    if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails:
         return f"No AIPerf results found under {aiperf_dir}"
-    if not with_rows or not without_rows:
+    if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails:
         # Single-variant run: dump whichever side is present without trying
         # to compute deltas.
-        only = with_rows or without_rows
-        which = VARIANT_WITH_GUARDRAILS if with_rows else VARIANT_WITHOUT_GUARDRAILS
-        return _format_single_variant(which, only)
+        if latency_by_concurrency_with_guardrails:
+            return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails)
+        return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails)
 
-    rows = compare(with_rows, without_rows)
+    rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails)
     return format_table(rows)
 
 
-def _format_single_variant(variant: str, rows: dict[int, LatencyRow]) -> str:
+def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str:
     """Render one variant's table when the other variant didn't run."""
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}"
+    header_line = fmt.format("conc", "avg", "p50", "p90", "std")
     lines = [
         f"Only one variant present: {variant}",
-        fmt.format("conc", "avg", "p50", "p90", "std"),
-        "-" * 49,
+        header_line,
+        "-" * len(header_line),
     ]
-    for c in sorted(rows):
-        r = rows[c]
-        lines.append(fmt.format(c, f"{r.avg:.0f}", f"{r.p50:.0f}", f"{r.p90:.0f}", f"{r.std:.0f}"))
+    for concurrency in sorted(latency_by_concurrency):
+        row = latency_by_concurrency[concurrency]
+        lines.append(fmt.format(concurrency, f"{row.avg:.0f}", f"{row.p50:.0f}", f"{row.p90:.0f}", f"{row.std:.0f}"))
     lines.append("")
     lines.append("All values in milliseconds.")
     return "\n".join(lines)

From 2acf55bc4817e6e03bda18e1354bf68cd1d59156 Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Mon, 22 Jun 2026 12:24:26 -0400
Subject: [PATCH 4/8] Cleanup

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                     |  37 ++----
 .../benchmarks/aiperf_runner.py               |  15 +--
 .../benchmarks/analyze.py                     | 124 ++++++++++++------
 .../benchmarks/paths.py                       |  14 +-
 .../nemo_guardrails_plugin/benchmarks/run.py  |  36 ++---
 .../benchmarks/seeding.py                     |  10 +-
 6 files changed, 114 insertions(+), 122 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4181069c1a..37d5864c9f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1072,20 +1072,15 @@ jobs:
           path: web/packages/studio/playwright-report/
 
   benchmark-guardrails:
-    # Run the two benchmark variants as parallel matrix jobs, each with its
-    # own NMP instance. This isolates them from each other (no shared mocks,
-    # no cross-talk on :8080) and roughly halves wall-clock vs. the previous
-    # single-job sequential layout. The `benchmark-guardrails-analyze` job
-    # below merges both artifacts and prints the with-vs-without comparison.
+    # Parallel matrix jobs (one NMP per variant) so the two sweeps don't
+    # share mocks or contend on :8080. `benchmark-guardrails-analyze` merges
+    # the artifacts and prints the comparison.
     name: Guardrails plugin benchmark (${{ matrix.variant }})
-    # TODO(jgulabrai): gate back to `workflow_dispatch` once we're done
-    # collecting baseline samples and the harness is stable enough that we
-    # don't need it firing on every PR push.
+    # TODO(jgulabrai): gate back to `workflow_dispatch` once we have a CI baseline.
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
-      # Don't cancel the other variant if one fails; the partial artifact
-      # is still useful for diagnosing what went wrong.
+      # Keep the partial artifact if one variant fails.
       fail-fast: false
       matrix:
         variant: [with-guardrails, without-guardrails]
@@ -1129,18 +1124,14 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          # Per-variant artifact name; GHA disallows two artifacts with the
-          # same name in one workflow run.
+          # Ensure we use a unique artifact name per benchmark vaiant.
           name: benchmark-guardrails-results-${{ matrix.variant }}
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
 
   benchmark-guardrails-analyze:
-    # Joins the two parallel matrix jobs above, downloads both artifacts into
-    # one merged `runs/<id>/` tree, and prints the with-vs-without
-    # comparison table. No regression gate yet: this exists to produce
-    # CI-collected numbers we can use to seed a baseline file later.
+    # Merge both variant artifacts and print the comparison table.
     name: Guardrails benchmark analysis
     needs: [benchmark-guardrails]
     if: ${{ !cancelled() }}
@@ -1167,15 +1158,8 @@ jobs:
           path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
       - name: Print benchmark comparison
         working-directory: nemo-platform
-        # `analyze.py` is intentionally stdlib-only and import-free w.r.t. the
-        # rest of the plugin, so we run it on the runner's stock python3
-        # rather than spinning up the full `uv` workspace (which costs ~3
-        # minutes per CI run for ~50ms of CSV parsing).
-        #
-        # Both matrix jobs above use the same `--run-id`, so there should be
-        # exactly one merged run directory under `runs/`. The `find | sort`
-        # pipeline is defensive in case that ever stops being true.
-        # NOTE: `find -printf` is GNU-only (works on ubuntu-latest, not macOS).
+        # `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap
+        # step and run it with the runner's `python3` CLI directly.
         run: |
           RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
           echo "Analyzing run directory: $RUN_DIR"
@@ -1184,8 +1168,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          # Single merged artifact so collecting baseline samples is a matter
-          # of downloading one thing per workflow run rather than two.
+          # Single artifact so baseline collection is one download per run.
           name: benchmark-guardrails-results-merged
           retention-days: 30
           path: |
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
index 5cbfe70071..20963e11f3 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
@@ -64,16 +64,11 @@ def prepare_runtime_aiperf_config(
 ) -> dict[str, Any]:
     """Materialize the AIPerf config this run will use.
 
-    Reads the checked-in ``template_path`` config, overrides its
-    ``output_base_dir`` to point inside the current run's directory, optionally
-    overrides ``base_config.model`` (so the same template can target multiple
-    VirtualModels in one harness invocation), and writes the result to
-    ``runtime_config_path``. AIPerf is later invoked with
-    ``--config-file <runtime_config_path>`` so every artifact lands under a
-    separate per-run directory.
-
-    Returns the parsed config dict so callers can log fields (sweep params,
-    benchmark_duration) without re-reading the file.
+    Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf
+    artifacts nest under this run) and optionally ``base_config.model``
+    (so one template can target multiple VirtualModels), and writes the
+    result to ``runtime_config_path``. Returns the parsed config so
+    callers can log sweep params without re-reading the file.
     """
     if not template_path.is_file():
         raise FileNotFoundError(f"AIPerf template not found: {template_path}")
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
index 1a31dff306..60545a70b1 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -3,19 +3,13 @@
 
 """Post-run analyzer for the nemo-guardrails IGW benchmark.
 
-Reads the ``profile_export_aiperf.csv`` files produced by both benchmark
-variants in a single run directory and prints a side-by-side latency table:
-``with``, ``without``, and the ``Δ`` per concurrency level.
+Reads ``profile_export_aiperf.csv`` files from both variants in one run dir
+and prints a with-vs-without latency comparison. The delta isolates
+middleware overhead since the only difference between variants is whether
+middleware is attached to the targeted VirtualModel.
 
-The delta is what answers the question "how much latency does the guardrails
-middleware add", since the only thing that differs between the two variants
-is whether the middleware is attached to the targeted VirtualModel.
-
-Used two ways:
-
-* Standalone:
-  ``python -m nemo_guardrails_plugin.benchmarks.analyze <run-dir>``
-* Auto-invoked from ``run.py`` at the end of a sweep that ran both variants.
+Used both as a script (``python -m ... <run-dir>``) and auto-invoked from
+``run.py`` after a multi-variant sweep.
 """
 
 from __future__ import annotations
@@ -27,20 +21,26 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-# Variant identifiers are duplicated from `constants.py` (rather than imported)
-# so this module has zero intra-repo imports and can run on bare `python3` in
-# CI without the full `uv`/`make bootstrap-python` setup. Keep these in sync
-# with `constants.VARIANT_WITH_GUARDRAILS` / `VARIANT_WITHOUT_GUARDRAILS`.
+# Duplicated from `constants.py` so this module stays import-free and can
+# run on bare `python3` in CI without bootstrapping the uv workspace.
 VARIANT_WITH_GUARDRAILS = "with-guardrails"
 VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
 
 log = logging.getLogger(__name__)
 
-# Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of
-# metrics; this one is end-to-end request wall time including the shim hop,
-# IGW, middleware, and any mock-NIM round-trips.
 _LATENCY_METRIC = "Request Latency (ms)"
 
+# Mock-LLM time per request, subtracted to isolate platform overhead. Mirrors
+# `E2E_LATENCY_MEAN_SECONDS` in configs/mock_llm/*.env and the 2 CS calls
+# (input + output rails) of `content_safety_local`. Update in lock-step.
+_APP_MOCK_LATENCY_MS = 4000.0
+_CONTENT_SAFETY_MOCK_LATENCY_MS = 500.0
+_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST = 2
+_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS = _APP_MOCK_LATENCY_MS
+_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS = (
+    _APP_MOCK_LATENCY_MS + _CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST * _CONTENT_SAFETY_MOCK_LATENCY_MS
+)
+
 
 @dataclass(frozen=True)
 class LatencyRow:
@@ -78,11 +78,9 @@ def delta_avg(self) -> float:
 def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]:
     """Load per-concurrency latency stats for one variant.
 
-    Walks the same ``<batch>/<timestamp>/concurrency<N>/`` layout that
-    ``collect_sweep_results`` produces. Missing files are logged and skipped
-    rather than raising, so a partial run still produces a useful table.
-
-    Returns a mapping of ``concurrency_level -> LatencyRow``.
+    Walks the ``<batch>/<timestamp>/concurrency<N>/`` layout produced by
+    ``collect_sweep_results``. Missing CSVs are skipped, not raised, so
+    partial runs still produce a table.
     """
     if not variant_output_dir.is_dir():
         return {}
@@ -105,11 +103,10 @@ def compare(
     latency_by_concurrency_with_guardrails: dict[int, LatencyRow],
     latency_by_concurrency_without_guardrails: dict[int, LatencyRow],
 ) -> list[ComparisonRow]:
-    """Build per-concurrency comparison rows, ordered by concurrency.
+    """Build per-concurrency comparison rows, sorted by concurrency.
 
-    Only concurrencies present in *both* variants are included; an asymmetry
-    means one variant failed for that level and the comparison is undefined.
-    Asymmetric levels are logged at WARNING so silent drops are visible.
+    Only levels present in both variants are compared; asymmetric levels are
+    logged at WARNING and excluded.
     """
     concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails)
     concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails)
@@ -161,8 +158,60 @@ def format_table(rows: list[ComparisonRow]) -> str:
     return "\n".join(lines)
 
 
+def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
+    """Render a table with mock-LLM time subtracted from p50/p90/avg.
+
+    Isolates NMP + IGW + shim + middleware overhead from the much larger
+    mock sleeps. The Δ columns are the middleware's own cost over the bare
+    path.
+    """
+    if not rows:
+        return "No comparable sweep results found (need both variants to share concurrency levels)."
+
+    header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg")
+    fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
+    header_line = fmt.format(*header)
+    lines = [header_line, "-" * len(header_line)]
+
+    for r in rows:
+        with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_p50 = r.without_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        with_p90 = r.with_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_p90 = r.without_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        with_avg = r.with_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_avg = r.without_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        lines.append(
+            fmt.format(
+                r.concurrency,
+                f"{with_p50:+.0f}",
+                f"{without_p50:+.0f}",
+                f"{with_p50 - without_p50:+.0f}",
+                f"{with_p90:+.0f}",
+                f"{without_p90:+.0f}",
+                f"{with_p90 - without_p90:+.0f}",
+                f"{with_avg:+.0f}",
+                f"{without_avg:+.0f}",
+                f"{with_avg - without_avg:+.0f}",
+            )
+        )
+    lines.append("")
+    lines.append(
+        "All values in milliseconds, with mock-LLM time subtracted "
+        f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms = 1× app + "
+        f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; "
+        f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)."
+    )
+    lines.append("'Δ' columns are the middleware's own overhead over the bare NMP+IGW path.")
+    return "\n".join(lines)
+
+
 def analyze_run(run_dir: Path) -> str:
-    """Top-level: read both variants from one run dir and return a printable table."""
+    """Read both variants from one run dir and return a printable report.
+
+    Output is the raw comparison table followed by a platform-overhead table
+    (mock time subtracted). Falls back to a single-variant table if only one
+    variant has results.
+    """
     aiperf_dir = run_dir / "aiperf_results"
     latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
     latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
@@ -170,14 +219,12 @@ def analyze_run(run_dir: Path) -> str:
     if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails:
         return f"No AIPerf results found under {aiperf_dir}"
     if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails:
-        # Single-variant run: dump whichever side is present without trying
-        # to compute deltas.
         if latency_by_concurrency_with_guardrails:
             return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails)
         return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails)
 
     rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails)
-    return format_table(rows)
+    return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}"
 
 
 def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str:
@@ -198,11 +245,7 @@ def _format_single_variant(variant: str, latency_by_concurrency: dict[int, Laten
 
 
 def _parse_concurrency_from_label(label: str) -> int | None:
-    """Extract the integer N from a sweep label like ``concurrency16``.
-
-    Returns ``None`` for labels that don't match the expected pattern so
-    unrelated subdirectories (logs, etc.) are skipped silently.
-    """
+    """Extract N from a sweep label like ``concurrency16``; ``None`` otherwise."""
     if not label.startswith("concurrency"):
         return None
     try:
@@ -212,12 +255,7 @@ def _parse_concurrency_from_label(label: str) -> int | None:
 
 
 def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None:
-    """Pull the ``Request Latency (ms)`` line out of an AIPerf CSV.
-
-    AIPerf writes a header row followed by one ``Metric,avg,min,max,sum,p1,...``
-    row per metric, then a blank line and a second small block. We only care
-    about the first block.
-    """
+    """Pull the ``Request Latency (ms)`` row from an AIPerf CSV's first block."""
     if not csv_path.is_file():
         log.debug("Missing CSV at %s; skipping", csv_path)
         return None
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
index 80dda5d329..00af57580a 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
@@ -54,11 +54,8 @@ class RunPaths:
     nmp_data_dir: Path
     # Checked-in YAML template for the AIPerf sweep config.
     config_template: Path
-    # In-repo copies of the upstream mock-LLM `.env` files. We point the
-    # mock server processes at these (via `--config-file`) so we can change
-    # mock latency or refusal behavior without touching the NeMo-Guardrails
-    # checkout, and so the exact mock config a benchmark ran against stays
-    # versioned alongside our results.
+    # In-repo mock-LLM `.env` files. Versioned with the benchmark so mock
+    # behavior is independent of the NeMo-Guardrails checkout.
     mock_app_env: Path
     mock_content_safety_env: Path
     # Per-run materialized copy of `config_template` with `output_base_dir`
@@ -83,12 +80,7 @@ def ensure_directories(self) -> None:
             path.mkdir(parents=True, exist_ok=True)
 
     def aiperf_output_dir_for(self, variant: str) -> Path:
-        """Per-variant AIPerf output dir under ``aiperf_results/<variant>/``.
-
-        Each variant gets its own subtree so a single run can hold side-by-side
-        sweeps (e.g. ``with-guardrails`` and ``without-guardrails``) without
-        their outputs colliding.
-        """
+        """Per-variant AIPerf output dir; keeps side-by-side sweeps from colliding."""
         return self.aiperf_output_dir / variant
 
     def runtime_config_for(self, variant: str) -> Path:
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
index e3386b303c..2fc8e7421e 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
@@ -98,10 +98,8 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None:
 def _validate_in_repo_mock_configs(paths: RunPaths) -> None:
     """Fail fast if the in-repo mock LLM env files are missing.
 
-    These configure the upstream mock server but live in this repo (under
-    `plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) so the harness can
-    tune mock behavior without forking the upstream checkout. A missing file
-    here is almost certainly a checkout problem on the NMP side.
+    These live in this repo (not upstream) so we control mock behavior
+    independently of the NeMo-Guardrails checkout.
     """
     missing = [p for p in (paths.mock_app_env, paths.mock_content_safety_env) if not p.is_file()]
     if missing:
@@ -140,9 +138,7 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised
             health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS,
         )
 
-    # Both env files are sourced from this repo (see
-    # plugins/nemo-guardrails/benchmarks/configs/mock_llm/) so mock behavior is
-    # versioned alongside the benchmark and independent of the upstream checkout.
+    # Env files come from this repo, rather than the upstream library.
     return [
         spec(
             "mock-app-llm",
@@ -370,12 +366,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         choices=(*ALL_VARIANTS, "all"),
         default="all",
         help=(
-            "Which sweep to run: 'with-guardrails' targets the VirtualModel with "
-            "middleware attached; 'without-guardrails' targets a control VM with "
-            "no middleware; 'all' (default) runs them sequentially against the "
-            "same NMP so the with-vs-without delta isolates middleware overhead. "
-            "In CI, run two parallel jobs with --variant=with-guardrails and "
-            "--variant=without-guardrails against isolated NMP instances."
+            "Which sweep to run. 'all' (default) runs both variants sequentially "
+            "against the same NMP; the with-vs-without delta isolates middleware "
+            "overhead. In CI, run the two variants as parallel jobs against "
+            "separate NMP instances."
         ),
     )
     parser.add_argument("--verbose", "-v", action="store_true")
@@ -457,11 +451,8 @@ def main(argv: list[str] | None = None) -> int:
         log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref)
         _smoke_test(client, seeded)
 
-        # Run each requested benchmark variant sequentially against the same
-        # NMP. The bootstrap (mocks, NMP, shim) is shared so the only thing
-        # that differs between variants is which VirtualModel AIPerf targets,
-        # which is exactly what makes the with-vs-without delta interpretable
-        # as middleware overhead.
+        # Variants run sequentially against the same NMP; only the targeted
+        # VirtualModel differs, so the delta isolates middleware overhead.
         outcomes: list[BenchmarkOutcome] = []
         for variant in variants:
             outcomes.append(
@@ -479,13 +470,10 @@ def main(argv: list[str] | None = None) -> int:
 
 
 def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None:
-    """Print the with-vs-without comparison table when it's meaningful.
+    """Print the analyzer's comparison table when at least one variant has results.
 
-    Only invoked when at least one variant produced results, since the
-    analyzer can also dump a single-variant table for partial runs. The call
-    is wrapped in a broad try/except: the analyzer is pure post-processing
-    over already-written artifacts, so a bug here must not change the
-    harness's exit code or hide a real benchmark failure.
+    Wrapped in a broad try/except: the analyzer is post-processing only and
+    must not change the harness's exit code or hide a real benchmark failure.
     """
     if not any(o.sweep_results for o in outcomes):
         return
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
index 97f9b0df0f..cde4f31b3f 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
@@ -53,9 +53,7 @@ class SeededResources:
     cs_model_entity: str
     guardrail_config_name: str
     vm_name: str
-    # Sibling VM with no middleware attached, used as the control variant by
-    # the benchmark harness. Same default_model_entity and models list as the
-    # guardrails VM so the only difference is middleware attachment.
+    # Control VM with no middleware; otherwise identical to the guardrails VM.
     no_guardrails_vm_name: str
 
     @property
@@ -180,10 +178,8 @@ def seed_benchmark(
     )
     _dump_model(generated_dir / "virtual_model.json", vm)
 
-    # Control VirtualModel: identical to the guardrails VM except no
-    # middleware. The harness uses this to measure NMP+IGW latency without
-    # the guardrails middleware so the with-vs-without delta isolates
-    # middleware overhead.
+    # Control VM: identical to the guardrails VM but no middleware, so the
+    # with-vs-without delta isolates middleware overhead.
     log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME)
     no_guardrails_vm = client.inference.virtual_models.create(
         workspace=WORKSPACE,

From f0fef0a24a313a051f4d107e89a57d656b7a8151 Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Tue, 23 Jun 2026 09:27:22 -0400
Subject: [PATCH 5/8] Updates

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                     |   2 +-
 plugins/nemo-guardrails/benchmarks/README.md  |  80 +++++++-
 .../benchmarks/analyze.py                     | 176 +++++++++++++++++-
 3 files changed, 241 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 37d5864c9f..b87ba8ccdd 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1163,7 +1163,7 @@ jobs:
         run: |
           RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
           echo "Analyzing run directory: $RUN_DIR"
-          python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR"
+          python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict
       - name: Upload merged benchmark artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md
index bede28d85e..cb103d2529 100644
--- a/plugins/nemo-guardrails/benchmarks/README.md
+++ b/plugins/nemo-guardrails/benchmarks/README.md
@@ -15,9 +15,12 @@ benchmark modules with `PYTHONPATH` pointed at that checkout.
 plugins/nemo-guardrails/benchmarks/
   configs/
     nmp_igw_guardrails_sweep_concurrency.yaml   # AIPerf sweep template
+    mock_llm/                                   # in-repo mock LLM env files
+  results/                                      # historical baseline notes
   artifacts/                                    # per-run outputs (gitignored)
 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/
   run.py             # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run`
+  analyze.py         # post-run analysis; checks latencies against baseline values
   paths.py           # filesystem layout
   constants.py       # workspace / VM / provider names
   processes.py       # subprocess supervision (process groups + ExitStack)
@@ -181,15 +184,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/
 
 ## CI
 
-A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both
-this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and
-`make benchmark-guardrails`, and uploads the per-run artifacts directory
-(`logs/`, `generated/`, `aiperf_results/`) on success or failure.
+Two jobs in `.github/workflows/ci.yaml`:
 
-Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf`
-itself exits non-zero or any sweep returns a non-zero exit code. No latency
-thresholds are enforced — those can be layered on later by a separate
-analyzer that reads the per-sweep CSVs.
+- `benchmark-guardrails` — matrix of two parallel jobs, one per variant
+  (`with-guardrails`, `without-guardrails`), each on its own NMP instance.
+  Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`).
+- `benchmark-guardrails-analyze` — joins the two matrix jobs, downloads both
+  artifacts, prints a side-by-side comparison via
+  `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check
+  (see below). Fails the build on a latency regression beyond tolerance. The
+  analyzer is stdlib-only by design, so this job runs on the runner's stock
+  `python3` without bootstrapping the uv workspace.
+
+### Baseline and gating
+
+CI compares the run's delta_p50 (with-guardrails minus without-guardrails
+p50, in ms) against a checked-in baseline. The baseline lives as
+module-level constants in:
+
+```text
+plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+```
+
+Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50
+isolates the middleware's contribution — shared CI runner noise cancels
+across the two variants.
+
+#### Baseline constants
+
+- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on.
+  Other levels still appear in the analyzer's output tables, but pass/fail
+  is decided only by these.
+- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied
+  to every validated concurrency. A check fails when
+  `|observed - baseline| > tolerance`.
+- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency
+  tolerance overrides (in ms). Levels without an override fall back to the
+  default.
+- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50
+  (in ms) per concurrency level. Edit by hand when a real change shifts
+  the numbers.
+
+Worked example: at c=16 the override is 300 ms, so a run with observed
+delta_p50 = 1689 (diff +299 from baseline 1390) passes; observed
+delta_p50 = 1691 (diff +301) fails.
+
+Notes on the current values:
+
+- c=16 and c=32 use wider tolerances than the default because their
+  absolute delta_p50 is larger. Over time, we can tighten these values
+  if latencies in CI produce less variance.
+- Any change to mock-LLM latencies, the guardrails config, or the runner
+  class invalidates the current baseline values. The benchmark should be
+  re-run in CI several tiems to establish updated baseline values.
+
+#### Running the analyzer locally
+
+```bash
+python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \
+    plugins/nemo-guardrails/benchmarks/artifacts/runs/<run-id>
+```
+
+Local runs print both tables and the baseline-check table.
+CI passes `--strict` to make any out-of-tolerance check fail the job.
+
+#### Updating the baseline
+
+When a real change shifts the numbers (ex. a deliberate middleware change,
+a mock-LLM config change, or a runner-class change), edit the constants at
+the top of `analyze.py` by hand and reference the PR / CI run that
+justifies it in the commit.
 
 ## Cleanup
 
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
index 60545a70b1..8e7671edcb 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -26,6 +26,36 @@
 VARIANT_WITH_GUARDRAILS = "with-guardrails"
 VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
 
+# --- CI baseline gate ---------------------------------------------------------
+# For each concurrency level we list:
+#   - The expected p50 latency delta between requests with guardrails vs.
+#     without guardrails.
+#   - The allowed plus/minus tolerance in CI. Benchmark jobs whose p50
+#     latency exceeds this tolerance will fail.
+
+# Concurrency levels we check in CI.
+CONCURRENCIES_TO_VALIDATE: list[int] = [1, 2, 4, 8, 16, 32]
+
+# Tolerance (ms) used for every concurrency level unless overridden below.
+DEFAULT_DELTA_P50_TOLERANCE_MS: int = 150
+
+# Looser tolerance (ms) for higher concurrencies. With more requests in
+# flight at once, they contend for shared resources (the IGW event loop,
+# the mock-LLM workers, the CI runner's CPU), so we see more variance in
+# latency values.
+DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 300}
+
+# Estimated expected delta_p50 (ms) at each concurrency level, based on
+# a few sample runs in CI.
+DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int] = {
+    1: 1070,
+    2: 1110,
+    4: 1190,
+    8: 1230,
+    16: 1390,
+    32: 2430,
+}
+
 log = logging.getLogger(__name__)
 
 _LATENCY_METRIC = "Request Latency (ms)"
@@ -134,7 +164,18 @@ def format_table(rows: list[ComparisonRow]) -> str:
     if not rows:
         return "No comparable sweep results found (need both variants to share concurrency levels)."
 
-    header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg")
+    header = (
+        "conc",
+        "with p50",
+        "w/o p50",
+        "delta p50",
+        "with p90",
+        "w/o p90",
+        "delta p90",
+        "with avg",
+        "w/o avg",
+        "delta avg",
+    )
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
     header_line = fmt.format(*header)
     lines = [header_line, "-" * len(header_line)]
@@ -154,7 +195,7 @@ def format_table(rows: list[ComparisonRow]) -> str:
             )
         )
     lines.append("")
-    lines.append("All values in milliseconds. 'Δ' = with-guardrails minus without-guardrails.")
+    lines.append("All values in milliseconds. 'delta' columns = with-guardrails minus without-guardrails.")
     return "\n".join(lines)
 
 
@@ -162,13 +203,24 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
     """Render a table with mock-LLM time subtracted from p50/p90/avg.
 
     Isolates NMP + IGW + shim + middleware overhead from the much larger
-    mock sleeps. The Δ columns are the middleware's own cost over the bare
-    path.
+    mock sleeps. The delta columns are the middleware's own cost over the
+    bare path.
     """
     if not rows:
         return "No comparable sweep results found (need both variants to share concurrency levels)."
 
-    header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg")
+    header = (
+        "conc",
+        "with p50",
+        "w/o p50",
+        "delta p50",
+        "with p90",
+        "w/o p90",
+        "delta p90",
+        "with avg",
+        "w/o avg",
+        "delta avg",
+    )
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
     header_line = fmt.format(*header)
     lines = [header_line, "-" * len(header_line)]
@@ -201,7 +253,7 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
         f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; "
         f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)."
     )
-    lines.append("'Δ' columns are the middleware's own overhead over the bare NMP+IGW path.")
+    lines.append("'delta' columns are the middleware's own overhead over the bare NMP+IGW path.")
     return "\n".join(lines)
 
 
@@ -227,6 +279,100 @@ def analyze_run(run_dir: Path) -> str:
     return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}"
 
 
+def _load_comparison_rows(run_dir: Path) -> list[ComparisonRow]:
+    """Reload comparison rows from a run dir; returns ``[]`` if either variant is absent."""
+    aiperf_dir = run_dir / "aiperf_results"
+    with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
+    without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
+    if not with_guardrails or not without_guardrails:
+        return []
+    return compare(with_guardrails, without_guardrails)
+
+
+@dataclass(frozen=True)
+class LatencyReport:
+    """Latency results for a single concurrency level, rendered as one row of the report.
+
+    Each instance represents a single concurrency level from the benchmark
+    run: what we measured (observed_ms), what we expected from the
+    baseline (baseline_ms), and how much they're allowed to differ
+    (tolerance_ms).
+    The check passes when |observed_ms - baseline_ms| <= tolerance_ms.
+    """
+
+    concurrency: int
+    metric: str
+    baseline_ms: float
+    observed_ms: float
+    tolerance_ms: float
+
+    @property
+    def diff_ms(self) -> float:
+        return self.observed_ms - self.baseline_ms
+
+    @property
+    def passed(self) -> bool:
+        return abs(self.diff_ms) <= self.tolerance_ms
+
+
+def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]:
+    """Compare the delta_p50 for each concurrency level against the baseline latencies.
+
+    Returns ``(report_text, failed_count)``. Concurrencies missing from
+    either the run or the baseline are skipped with a note.
+    """
+    rows_by_concurrency = {r.concurrency: r for r in rows}
+
+    latency_reports: list[LatencyReport] = []
+    skipped_concurrencies: list[int] = []
+
+    for concurrency in sorted(CONCURRENCIES_TO_VALIDATE):
+        if concurrency not in rows_by_concurrency or concurrency not in DELTA_P50_BASELINE_BY_CONCURRENCY:
+            skipped_concurrencies.append(concurrency)
+            continue
+        latency_reports.append(
+            LatencyReport(
+                concurrency=concurrency,
+                metric="delta_p50",
+                baseline_ms=float(DELTA_P50_BASELINE_BY_CONCURRENCY[concurrency]),
+                observed_ms=rows_by_concurrency[concurrency].delta_p50,
+                tolerance_ms=float(DELTA_P50_TOLERANCE_OVERRIDES_MS.get(concurrency, DEFAULT_DELTA_P50_TOLERANCE_MS)),
+            )
+        )
+
+    fmt = "{:>9}  {:>4}  {:>10}  {:>10}  {:>9}  {:>11}  {:>6}"
+    header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status")
+    lines = [
+        "Check vs baseline (see DELTA_P50_BASELINE_BY_CONCURRENCY in analyze.py):",
+        header_line,
+        "-" * len(header_line),
+    ]
+    failed_count = 0
+    for report in latency_reports:
+        status = "PASS" if report.passed else "FAIL"
+        if not report.passed:
+            failed_count += 1
+        lines.append(
+            fmt.format(
+                report.metric,
+                report.concurrency,
+                f"{report.baseline_ms:.0f}",
+                f"{report.observed_ms:.0f}",
+                f"{report.diff_ms:+.0f}",
+                f"±{report.tolerance_ms:.0f}ms",
+                status,
+            )
+        )
+    if skipped_concurrencies:
+        lines.append("")
+        lines.append(f"Skipped (missing from results or baseline): {skipped_concurrencies}")
+    if failed_count:
+        lines.append("")
+        lines.append(f"FAIL: {failed_count} of {len(latency_reports)} check(s) exceeded tolerance.")
+
+    return "\n".join(lines), failed_count
+
+
 def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str:
     """Render one variant's table when the other variant didn't run."""
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}"
@@ -300,7 +446,12 @@ def main(argv: list[str] | None = None) -> int:
     parser.add_argument(
         "run_dir",
         type=Path,
-        help=("Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`."),
+        help="Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`.",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Exit non-zero when any baseline check exceeds tolerance. CI sets this; local runs default off so you can iterate without the gate failing.",
     )
     parser.add_argument(
         "--log-level",
@@ -317,7 +468,16 @@ def main(argv: list[str] | None = None) -> int:
         return 2
 
     print(analyze_run(run_dir))
-    return 0
+
+    rows = _load_comparison_rows(run_dir)
+    if not rows:
+        print("Skipping baseline check: no comparable rows from this run.", file=sys.stderr)
+        return 0 if not args.strict else 2
+
+    report, failed_count = check_against_baseline(rows)
+    print()
+    print(report)
+    return 1 if (args.strict and failed_count) else 0
 
 
 if __name__ == "__main__":

From 98f5d457c84218eb12e3e41d3c492983ab976ac6 Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Tue, 23 Jun 2026 10:09:36 -0400
Subject: [PATCH 6/8] Clean up CI config and baselines

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/actions/changes/action.yaml           |   6 +
 .github/workflows/ci.yaml                     |  36 ++--
 plugins/nemo-guardrails/benchmarks/README.md  |   5 +-
 .../results/local-baseline-2026-06-16.md      | 154 ------------------
 .../benchmarks/analyze.py                     |  20 +--
 5 files changed, 39 insertions(+), 182 deletions(-)
 delete mode 100644 plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md

diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml
index f6a7a27d2e..948d73015d 100644
--- a/.github/actions/changes/action.yaml
+++ b/.github/actions/changes/action.yaml
@@ -40,6 +40,9 @@ outputs:
   k8s-smoke:
     description: "'true' if Kubernetes smoke test support files changed"
     value: ${{ steps.filter.outputs.k8s-smoke }}
+  guardrails-benchmark:
+    description: "'true' if the nemo-guardrails plugin or guardrails service changed"
+    value: ${{ steps.filter.outputs.guardrails-benchmark }}
   cpu-smoke:
     description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed"
     value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }}
@@ -97,3 +100,6 @@ runs:
             - 'e2e/k8s/values/**'
             - 'e2e/test_jobs.py'
             - '.github/actions/free-disk-space/action.yaml'
+          guardrails-benchmark:
+            - 'plugins/nemo-guardrails/**'
+            - 'services/guardrails/**'
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index b87ba8ccdd..8f1ac2e718 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -44,6 +44,7 @@ jobs:
       docker: ${{ steps.changes.outputs.docker }}
       helm: ${{ steps.changes.outputs.helm }}
       cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }}
+      guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }}
     steps:
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
       - uses: ./.github/actions/changes
@@ -1071,12 +1072,17 @@ jobs:
           retention-days: 7
           path: web/packages/studio/playwright-report/
 
-  benchmark-guardrails:
+  guardrails-benchmark:
     # Parallel matrix jobs (one NMP per variant) so the two sweeps don't
-    # share mocks or contend on :8080. `benchmark-guardrails-analyze` merges
+    # share mocks or contend on :8080. `guardrails-benchmark-analyze` merges
     # the artifacts and prints the comparison.
-    name: Guardrails plugin benchmark (${{ matrix.variant }})
-    # TODO(jgulabrai): gate back to `workflow_dispatch` once we have a CI baseline.
+    name: nemo-guardrails plugin benchmark (${{ matrix.variant }})
+    needs: [changes]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
@@ -1125,16 +1131,20 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           # Ensure we use a unique artifact name per benchmark vaiant.
-          name: benchmark-guardrails-results-${{ matrix.variant }}
+          name: guardrails-benchmark-results-${{ matrix.variant }}
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
 
-  benchmark-guardrails-analyze:
+  guardrails-benchmark-analyze:
     # Merge both variant artifacts and print the comparison table.
-    name: Guardrails benchmark analysis
-    needs: [benchmark-guardrails]
-    if: ${{ !cancelled() }}
+    name: nemo-guardrails plugin benchmark analysis
+    needs: [changes, guardrails-benchmark]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
@@ -1148,13 +1158,13 @@ jobs:
         continue-on-error: true
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
-          name: benchmark-guardrails-results-with-guardrails
+          name: guardrails-benchmark-results-with-guardrails
           path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
       - name: Download without-guardrails artifact
         continue-on-error: true
         uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
-          name: benchmark-guardrails-results-without-guardrails
+          name: guardrails-benchmark-results-without-guardrails
           path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
       - name: Print benchmark comparison
         working-directory: nemo-platform
@@ -1169,7 +1179,7 @@ jobs:
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           # Single artifact so baseline collection is one download per run.
-          name: benchmark-guardrails-results-merged
+          name: guardrails-benchmark-results-merged
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
@@ -1320,8 +1330,6 @@ jobs:
       - web-sdk-gen
       - web-studio-deps
       - web-studio-e2e
-      - benchmark-guardrails
-      - benchmark-guardrails-analyze
       - opa-policy-test
     if: always()
     runs-on: ubuntu-latest
diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md
index cb103d2529..f9084971c8 100644
--- a/plugins/nemo-guardrails/benchmarks/README.md
+++ b/plugins/nemo-guardrails/benchmarks/README.md
@@ -16,7 +16,6 @@ plugins/nemo-guardrails/benchmarks/
   configs/
     nmp_igw_guardrails_sweep_concurrency.yaml   # AIPerf sweep template
     mock_llm/                                   # in-repo mock LLM env files
-  results/                                      # historical baseline notes
   artifacts/                                    # per-run outputs (gitignored)
 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/
   run.py             # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run`
@@ -186,10 +185,10 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/
 
 Two jobs in `.github/workflows/ci.yaml`:
 
-- `benchmark-guardrails` — matrix of two parallel jobs, one per variant
+- `guardrails-benchmark` — matrix of two parallel jobs, one per variant
   (`with-guardrails`, `without-guardrails`), each on its own NMP instance.
   Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`).
-- `benchmark-guardrails-analyze` — joins the two matrix jobs, downloads both
+- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both
   artifacts, prints a side-by-side comparison via
   `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check
   (see below). Fails the build on a latency regression beyond tolerance. The
diff --git a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md
deleted file mode 100644
index 509c9e8c62..0000000000
--- a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md
+++ /dev/null
@@ -1,154 +0,0 @@
-# Local baseline results — 2026-06-16
-
-Three back-to-back runs of `make benchmark-guardrails` on a local MacBook Pro
-(Apple Silicon), no other heavy workloads running. Goal: characterize the
-run-to-run variance of the new with-guardrails / without-guardrails harness so
-we can decide what's gateable in CI.
-
-## Hardware / setup
-
-- Host: MacBook Pro, Apple Silicon, on AC power
-- NMP, mocks, shim: all on localhost
-- Mock LLM config: in-repo defaults (`plugins/nemo-guardrails/benchmarks/configs/mock_llm/`)
-  - app LLM: 4.0s e2e latency, std 0
-  - content-safety LLM: 0.5s e2e latency, std 0
-- AIPerf sweep: concurrency `[1, 2, 4, 8, 16, 32, 64]`, `benchmark_duration: 60s`,
-  `warmup_request_count: 10`, non-streaming chat completions
-- Mock workers: 4 (default)
-- Three runs in the same afternoon, NMP data dir reused across runs
-
-## Run inventory
-
-| Run | Run dir | Notes |
-|---|---|---|
-| 1 | `20260616_123851` | first run after the with/without harness change |
-| 2 | `20260616_145058` | identical config |
-| 3 | `20260616_152834` | identical config |
-
-All three runs completed with 7/7 sweeps passing per variant, exit code 0.
-
-## Δp50 (with-guardrails − without-guardrails), milliseconds
-
-This is the headline metric: how much wall-clock time the guardrails middleware
-adds on top of the bare NMP+IGW path, including the two content-safety LLM
-round-trips that the rails cause but don't do themselves.
-
-| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64    |
-|---------|-----:|-----:|-----:|-----:|-----:|-----:|--------:|
-| Run 1   | 1029 | 1071 | 1068 | 1104 | 1145 | 1260 |    778  |
-| Run 2   | 1027 | 1062 | 1096 | 1105 | 1226 | 1256 |  -2896  |
-| Run 3   | 1030 | 1062 | 1079 | 1070 | 1118 | 1201 |  -2077  |
-| **mean**| **1029** | **1065** | **1081** | **1093** | **1163** | **1239** | **−1398** |
-| range   |    3 |    9 |   28 |   35 |  108 |   59 |   3674  |
-| range % | 0.3% | 0.8% | 2.6% | 3.2% | 9.3% | 4.8% |   n/a   |
-
-## with-guardrails p50 (absolute), milliseconds
-
-Useful as a sanity check that nothing catastrophic shifted in the absolute
-numbers — even if Δp50 stays steady, both variants could slow down together.
-
-| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
-|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
-| Run 1   | 5049 | 5101 | 5114 | 5152 | 5201 | 5318 | 6164 |
-| Run 2   | 5048 | 5093 | 5125 | 5137 | 5255 | 5279 | 5614 |
-| Run 3   | 5050 | 5094 | 5123 | 5146 | 5163 | 5250 | 5486 |
-| **mean**| **5049** | **5096** | **5121** | **5145** | **5206** | **5282** | **5755** |
-| range   |    2 |    8 |   11 |   15 |   92 |   68 |  678 |
-| range % | 0.0% | 0.2% | 0.2% | 0.3% | 1.8% | 1.3% | 11.8%|
-
-## without-guardrails p50 (absolute), milliseconds
-
-For completeness. This is the variant that's wildly unstable at c=64.
-
-| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
-|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
-| Run 1   | 4020 | 4030 | 4045 | 4048 | 4056 | 4058 | 5386 |
-| Run 2   | 4020 | 4031 | 4029 | 4032 | 4029 | 4023 | 8510 |
-| Run 3   | 4020 | 4032 | 4044 | 4076 | 4045 | 4049 | 7563 |
-| **mean**| **4020** | **4031** | **4039** | **4052** | **4043** | **4043** | **7153** |
-| range   |    0 |    2 |   16 |   44 |   27 |   35 | 3124 |
-
-The app mock sleeps for exactly 4.0s. The ~20–80 ms above 4000 across c=1–c=32
-is pure NMP+IGW+shim overhead. At c=64 the mock saturates (4 workers × 1 req/4s
-= 4 RPS ceiling, vs. 64 requested in-flight) and requests queue.
-
-## p90 — informational only
-
-p90 is much noisier than p50 across runs. Not gateable with three samples.
-
-### Δp90, milliseconds
-
-| Run   | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64  |
-|-------|-----:|-----:|-----:|-----:|-----:|-----:|------:|
-| Run 1 | 1039 | 1099 | 1162 | 1025 |  911 |  604 | 3009  |
-| Run 2 | 1028 | 1115 | 1160 | 1262 |  783 |  641 | 1015  |
-| Run 3 | 1023 | 1076 | 1189 | 1085 | 1209 |   18 | 1998  |
-
-## Observations
-
-### What's stable enough to gate on
-
-**c=1, 2, 4, 8.** The Δp50 ranges are 3–35 ms, well under any tolerance we'd
-realistically write. The absolute with-guardrails p50 is even tighter (2–15 ms
-across three runs). This is the regime where the harness is genuinely measuring
-what we want: NMP+middleware overhead on top of fixed-latency mocks.
-
-### What's borderline
-
-**c=16.** Δp50 range is 9.3%. Gateable with a generous tolerance (~10%+) but
-adds limited signal beyond c=8.
-
-### What's not gateable
-
-**c=32.** ~5% Δp50 range. Still bounded, but the run-to-run distance is
-several times larger than at c=1–c=8 and the absolute numbers wobble too.
-
-**c=64.** Unusable. Δp50 swings from +778 to −2896 across three runs.
-Root cause is the app mock's 4-worker saturation at this load level: the
-without-guardrails path fires app requests as fast as it can and the mock queues
-unpredictably. The with-guardrails path's CS-mock work paces requests enough to
-hide most of this. This is a test-rig artifact, not an NMP behavior.
-
-### Side observation: middleware overhead is small
-
-Of the ~1029 ms Δp50 at c=1:
-- ~1000 ms is the two content-safety mock round-trips (0.5s each, mandatory).
-- ~29 ms is the middleware's *own* work (rails orchestration, request/response
-  shaping, etc.) plus bare NMP+IGW overhead delta vs. without-guardrails.
-
-The without-guardrails baseline of ~4020 ms at c=1 against a 4000 ms mock means
-**bare NMP+IGW+shim overhead is ~20 ms** at idle.
-
-## Recommendation for the CI gate
-
-Based on the variance data above:
-
-| Concurrency | Gate Δp50? | Gate absolute with-guardrails p50? | Notes |
-|---|---|---|---|
-| 1  | yes | yes | tightest signal |
-| 2  | yes | yes | |
-| 4  | yes | yes | |
-| 8  | yes | yes | |
-| 16 | informational | informational | record but don't fail |
-| 32 | informational | informational | record but don't fail |
-| 64 | exclude | exclude | mock saturation, not gateable |
-
-Proposed tolerance bands (`max(absolute_ms, relative_%)`):
-- Δp50: `max(±100 ms, ±5%)`
-- with-guardrails p50: `max(±150 ms, ±3%)`
-
-Both bands are ~3× the observed local run-to-run range, leaving headroom for
-CI hardware noise being noisier than a quiet laptop.
-
-## Open questions / followups
-
-- **Local baselines won't transfer to CI hardware.** These numbers should seed
-  the baseline file but be replaced once we have N runs from the actual CI
-  runner class.
-- **Three samples is a small N.** Worth one more local run (Run 4) before we
-  treat the means above as canonical, but the c=1–c=8 numbers are unlikely
-  to budge meaningfully.
-- **c=64 instability is downstream of NMP.** Hypothesis: app mock's 4 workers
-  saturate at concurrency 64 (4 RPS ceiling on 4.0s sleep). Easy to test by
-  running with `--mock-workers 16`. Not blocking the gate work since c=64 is
-  excluded anyway.
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
index 8e7671edcb..7413a074cc 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -43,7 +43,7 @@
 # flight at once, they contend for shared resources (the IGW event loop,
 # the mock-LLM workers, the CI runner's CPU), so we see more variance in
 # latency values.
-DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 300}
+DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 450}
 
 # Estimated expected delta_p50 (ms) at each concurrency level, based on
 # a few sample runs in CI.
@@ -53,7 +53,7 @@
     4: 1190,
     8: 1230,
     16: 1390,
-    32: 2430,
+    32: 2110,
 }
 
 log = logging.getLogger(__name__)
@@ -178,7 +178,7 @@ def format_table(rows: list[ComparisonRow]) -> str:
     )
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
     header_line = fmt.format(*header)
-    lines = [header_line, "-" * len(header_line)]
+    lines = ["Measured Latencies (ms), with and without guardrails:", header_line, "-" * len(header_line)]
     for r in rows:
         lines.append(
             fmt.format(
@@ -195,7 +195,7 @@ def format_table(rows: list[ComparisonRow]) -> str:
             )
         )
     lines.append("")
-    lines.append("All values in milliseconds. 'delta' columns = with-guardrails minus without-guardrails.")
+    lines.append("delta = with-guardrails minus without-guardrails.")
     return "\n".join(lines)
 
 
@@ -223,7 +223,7 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
     )
     fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
     header_line = fmt.format(*header)
-    lines = [header_line, "-" * len(header_line)]
+    lines = ["Platform Overhead (ms), with and without guardrails:", header_line, "-" * len(header_line)]
 
     for r in rows:
         with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
@@ -248,12 +248,10 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
         )
     lines.append("")
     lines.append(
-        "All values in milliseconds, with mock-LLM time subtracted "
-        f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms = 1× app + "
-        f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; "
-        f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)."
+        f"Minus mock-LLM time "
+        f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms; "
+        f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms)."
     )
-    lines.append("'delta' columns are the middleware's own overhead over the bare NMP+IGW path.")
     return "\n".join(lines)
 
 
@@ -343,7 +341,7 @@ def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]:
     fmt = "{:>9}  {:>4}  {:>10}  {:>10}  {:>9}  {:>11}  {:>6}"
     header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status")
     lines = [
-        "Check vs baseline (see DELTA_P50_BASELINE_BY_CONCURRENCY in analyze.py):",
+        "Guardrails Overhead vs. Baseline (ms):",
         header_line,
         "-" * len(header_line),
     ]

From 8a55a3ac14f695c38fc7964113ef33b9fbdcbaa2 Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Tue, 23 Jun 2026 10:15:45 -0400
Subject: [PATCH 7/8] Minor ReadME cleanup

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .../benchmarks/configs/mock_llm/README.md        | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
index 1cd5cc3dd4..87c1563b59 100644
--- a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
@@ -1,21 +1,15 @@
 # Mock LLM configurations
 
-These `.env` files configure the upstream `benchmark.mock_llm_server.run_server`
-(from the `NeMo-Guardrails` checkout) for the IGW guardrails benchmark.
+These `.env` files configure the behavior of the mock LLMs, used by the upstream
+`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`.
 
-We keep our own copies (instead of pointing at the upstream checkout's
-`benchmark/mock_llm_server/configs/`) so:
+The library stores these files, but we keep our own copies so:
 
-- We can change mock latency without touching the upstream repo. The original
-  motivation was tuning `E2E_LATENCY_*` to isolate NMP+middleware overhead
-  from mandatory NIM sleep (see the benchmark README for the full rationale).
+- We can change mock latency without touching the upstream repo.
 - The exact mock behavior we benchmarked against is versioned alongside the
   results, so historical numbers stay reproducible even if upstream changes
   its defaults.
 
-Initial contents are a verbatim copy of the upstream files:
-
+Mapping to upstream files:
 - `app-llm.env`            ← upstream `meta-llama-3.3-70b-instruct.env`
 - `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
-
-Update either file to change mock behavior for the next benchmark run.

From dcaf12f3bfb11326f494832c86463f4fe1e55a3b Mon Sep 17 00:00:00 2001
From: Jash Gulabrai <jgulabrai@nvidia.com>
Date: Tue, 23 Jun 2026 10:34:30 -0400
Subject: [PATCH 8/8] Address CodeRabbit

Signed-off-by: Jash Gulabrai <jgulabrai@nvidia.com>
---
 .github/workflows/ci.yaml                    | 3 +++
 plugins/nemo-guardrails/benchmarks/README.md | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 8f1ac2e718..9daeb9c1c2 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -1095,11 +1095,13 @@ jobs:
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           path: nemo-platform
+          persist-credentials: false
       - name: Checkout NeMo-Guardrails
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           repository: NVIDIA/NeMo-Guardrails
           path: NeMo-Guardrails
+          persist-credentials: false
       - name: Install uv
         uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
         with:
@@ -1152,6 +1154,7 @@ jobs:
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           path: nemo-platform
+          persist-credentials: false
       - name: Download with-guardrails artifact
         # If a variant failed entirely it may have uploaded no artifact;
         # the analyzer handles the single-variant case so don't fail here.
diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md
index f9084971c8..f0894902ae 100644
--- a/plugins/nemo-guardrails/benchmarks/README.md
+++ b/plugins/nemo-guardrails/benchmarks/README.md
@@ -224,9 +224,9 @@ across the two variants.
   (in ms) per concurrency level. Edit by hand when a real change shifts
   the numbers.
 
-Worked example: at c=16 the override is 300 ms, so a run with observed
-delta_p50 = 1689 (diff +299 from baseline 1390) passes; observed
-delta_p50 = 1691 (diff +301) fails.
+Worked example: at c=16 the override is 200 ms, so a run with observed
+delta_p50 = 1689 (diff +199 from baseline 1390) passes; observed
+delta_p50 = 1691 (diff +201) fails.
 
 Notes on the current values: