diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml
index f6a7a27d2e..948d73015d 100644
--- a/.github/actions/changes/action.yaml
+++ b/.github/actions/changes/action.yaml
@@ -40,6 +40,9 @@ outputs:
   k8s-smoke:
     description: "'true' if Kubernetes smoke test support files changed"
     value: ${{ steps.filter.outputs.k8s-smoke }}
+  guardrails-benchmark:
+    description: "'true' if the nemo-guardrails plugin or guardrails service changed"
+    value: ${{ steps.filter.outputs.guardrails-benchmark }}
   cpu-smoke:
     description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed"
     value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }}
@@ -97,3 +100,6 @@ runs:
             - 'e2e/k8s/values/**'
             - 'e2e/test_jobs.py'
             - '.github/actions/free-disk-space/action.yaml'
+          guardrails-benchmark:
+            - 'plugins/nemo-guardrails/**'
+            - 'services/guardrails/**'
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f8643c9cb5..9daeb9c1c2 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -44,6 +44,7 @@ jobs:
       docker: ${{ steps.changes.outputs.docker }}
       helm: ${{ steps.changes.outputs.helm }}
       cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }}
+      guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }}
     steps:
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
       - uses: ./.github/actions/changes
@@ -1071,21 +1072,36 @@ jobs:
           retention-days: 7
           path: web/packages/studio/playwright-report/
 
-  benchmark-guardrails:
-    name: Guardrails plugin benchmark
-    if: github.event_name == 'workflow_dispatch'
+  guardrails-benchmark:
+    # Parallel matrix jobs (one NMP per variant) so the two sweeps don't
+    # share mocks or contend on :8080. `guardrails-benchmark-analyze` merges
+    # the artifacts and prints the comparison.
+    name: nemo-guardrails plugin benchmark (${{ matrix.variant }})
+    needs: [changes]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    strategy:
+      # Keep the partial artifact if one variant fails.
+      fail-fast: false
+      matrix:
+        variant: [with-guardrails, without-guardrails]
     steps:
       - name: Checkout nemo-platform
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           path: nemo-platform
+          persist-credentials: false
       - name: Checkout NeMo-Guardrails
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           repository: NVIDIA/NeMo-Guardrails
           path: NeMo-Guardrails
+          persist-credentials: false
       - name: Install uv
         uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
         with:
@@ -1102,7 +1118,13 @@ jobs:
           PYTORCH_DEPS: cpu
       - name: Run benchmark sweep
         working-directory: nemo-platform
-        run: make benchmark-guardrails
+        # Pin both variants to the same `--run-id` so when the analyze job
+        # downloads both artifacts into one `runs/` parent, they merge into
+        # a single run directory the analyzer can read normally.
+        run: |
+          make benchmark-guardrails BENCHMARK_ARGS="\
+            --variant ${{ matrix.variant }} \
+            --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}"
         env:
           NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails
           _TYPER_FORCE_DISABLE_TERMINAL: "1"
@@ -1110,7 +1132,57 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: benchmark-guardrails-results
+          # Ensure we use a unique artifact name per benchmark variant.
+          name: guardrails-benchmark-results-${{ matrix.variant }}
+          retention-days: 30
+          path: |
+            nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+
+  guardrails-benchmark-analyze:
+    # Merge both variant artifacts and print the comparison table.
+    name: nemo-guardrails plugin benchmark analysis
+    needs: [changes, guardrails-benchmark]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout nemo-platform
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          path: nemo-platform
+          persist-credentials: false
+      - name: Download with-guardrails artifact
+        # If a variant failed entirely it may have uploaded no artifact;
+        # the analyzer handles the single-variant case so don't fail here.
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: guardrails-benchmark-results-with-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Download without-guardrails artifact
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: guardrails-benchmark-results-without-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Print benchmark comparison
+        working-directory: nemo-platform
+        # `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap
+        # step and run it with the runner's `python3` CLI directly.
+        run: |
+          RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
+          echo "Analyzing run directory: $RUN_DIR"
+          python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict
+      - name: Upload merged benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          # Single artifact so baseline collection is one download per run.
+          name: guardrails-benchmark-results-merged
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
@@ -1261,7 +1333,6 @@ jobs:
       - web-sdk-gen
       - web-studio-deps
       - web-studio-e2e
-      - benchmark-guardrails
       - opa-policy-test
     if: always()
     runs-on: ubuntu-latest
diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md
index bede28d85e..f0894902ae 100644
--- a/plugins/nemo-guardrails/benchmarks/README.md
+++ b/plugins/nemo-guardrails/benchmarks/README.md
@@ -15,9 +15,11 @@ benchmark modules with `PYTHONPATH` pointed at that checkout.
 plugins/nemo-guardrails/benchmarks/
   configs/
     nmp_igw_guardrails_sweep_concurrency.yaml   # AIPerf sweep template
+    mock_llm/                                   # in-repo mock LLM env files
   artifacts/                                    # per-run outputs (gitignored)
 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/
   run.py             # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run`
+  analyze.py         # post-run analysis; checks latencies against baseline values
   paths.py           # filesystem layout
   constants.py       # workspace / VM / provider names
   processes.py       # subprocess supervision (process groups + ExitStack)
@@ -181,15 +183,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/
 
 ## CI
 
-A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both
-this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and
-`make benchmark-guardrails`, and uploads the per-run artifacts directory
-(`logs/`, `generated/`, `aiperf_results/`) on success or failure.
+Two jobs in `.github/workflows/ci.yaml`:
 
-Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf`
-itself exits non-zero or any sweep returns a non-zero exit code. No latency
-thresholds are enforced — those can be layered on later by a separate
-analyzer that reads the per-sweep CSVs.
+- `guardrails-benchmark` — matrix of two parallel jobs, one per variant
+  (`with-guardrails`, `without-guardrails`), each on its own NMP instance.
+  Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`).
+- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both
+  artifacts, prints a side-by-side comparison via
+  `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check
+  (see below). Fails the build when delta_p50 deviates from the baseline by
+  more than the tolerance, in either direction. The analyzer is stdlib-only by
+  design, so this job runs on the runner's stock `python3` without bootstrapping the uv workspace.
+
+### Baseline and gating
+
+CI compares the run's delta_p50 (with-guardrails minus without-guardrails
+p50, in ms) against a checked-in baseline. The baseline lives as
+module-level constants in:
+
+```text
+plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+```
+
+Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50
+isolates the middleware's contribution — shared CI runner noise cancels
+across the two variants.
+
+#### Baseline constants
+
+- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on.
+  Other levels still appear in the analyzer's output tables, but pass/fail
+  is decided only by these.
+- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied
+  to every validated concurrency. A check fails when
+  `|observed - baseline| > tolerance`.
+- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency
+  tolerance overrides (in ms). Levels without an override fall back to the
+  default.
+- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50
+  (in ms) per concurrency level. Edit by hand when a real change shifts
+  the numbers.
+
+Worked example: at c=16 the override is 200 ms, so a run with observed
+delta_p50 = 1589 (diff +199 from baseline 1390) passes; observed
+delta_p50 = 1591 (diff +201) fails.
+
+Notes on the current values:
+
+- c=16 and c=32 use wider tolerances than the default because their
+  absolute delta_p50 is larger. Over time, we can tighten these values
+  if latencies in CI produce less variance.
+- Any change to mock-LLM latencies, the guardrails config, or the runner
+  class invalidates the current baseline values. The benchmark should be
+  re-run in CI several times to establish updated baseline values.
+
+#### Running the analyzer locally
+
+```bash
+python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \
+    plugins/nemo-guardrails/benchmarks/artifacts/runs/<run-id>
+```
+
+Local runs print both tables and the baseline-check table.
+CI passes `--strict` to make any out-of-tolerance check fail the job.
+
+#### Updating the baseline
+
+When a real change shifts the numbers (e.g., a deliberate middleware change,
+a mock-LLM config change, or a runner-class change), edit the constants at
+the top of `analyze.py` by hand and reference the PR / CI run that
+justifies it in the commit.
 
 ## Cleanup
 
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
new file mode 100644
index 0000000000..87c1563b59
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
@@ -0,0 +1,15 @@
+# Mock LLM configurations
+
+These `.env` files configure the behavior of the mock LLMs, used by the upstream
+`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`.
+
+The library stores these files, but we keep our own copies so:
+
+- We can change mock latency without touching the upstream repo.
+- The exact mock behavior we benchmarked against is versioned alongside the
+  results, so historical numbers stay reproducible even if upstream changes
+  its defaults.
+
+Mapping to upstream files:
+- `app-llm.env`            ← upstream `meta-llama-3.3-70b-instruct.env`
+- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
new file mode 100644
index 0000000000..0cec095b38
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
@@ -0,0 +1,19 @@
+MODEL="meta/llama-3.3-70b-instruct"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
+SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=4.0
+E2E_LATENCY_MAX_SECONDS=4.0
+E2E_LATENCY_MEAN_SECONDS=4.0
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.3
+TTFT_MAX_SECONDS=0.3
+TTFT_MEAN_SECONDS=0.3
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env
new file mode 100644
index 0000000000..9e2467b1db
--- /dev/null
+++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env
@@ -0,0 +1,19 @@
+MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
+SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=0.5
+E2E_LATENCY_MAX_SECONDS=0.5
+E2E_LATENCY_MEAN_SECONDS=0.5
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.2
+TTFT_MAX_SECONDS=0.2
+TTFT_MEAN_SECONDS=0.2
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
index f7ff6729c1..20963e11f3 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py
@@ -60,17 +60,15 @@ def prepare_runtime_aiperf_config(
     template_path: Path,
     runtime_config_path: Path,
     aiperf_output_dir: Path,
+    model_ref: str | None = None,
 ) -> dict[str, Any]:
     """Materialize the AIPerf config this run will use.
 
-    Reads the checked-in ``template_path`` config, overrides its
-    ``output_base_dir`` to point inside the current run's directory, and writes
-    the result to ``runtime_config_path``. AIPerf is later invoked with
-    ``--config-file <runtime_config_path>`` so every artifact lands under a
-    separate per-run directory.
-
-    Returns the parsed config dict so callers can log fields (sweep params,
-    benchmark_duration) without re-reading the file.
+    Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf
+    artifacts nest under this run) and optionally ``base_config.model``
+    (so one template can target multiple VirtualModels), and writes the
+    result to ``runtime_config_path``. Returns the parsed config so
+    callers can log sweep params without re-reading the file.
     """
     if not template_path.is_file():
         raise FileNotFoundError(f"AIPerf template not found: {template_path}")
@@ -82,6 +80,11 @@ def prepare_runtime_aiperf_config(
     # Point AIPerf's output_base_dir at this run's directory so its results
     # nest under our per-run artifacts tree.
     config["output_base_dir"] = str(aiperf_output_dir)
+    if model_ref is not None:
+        base_config = config.get("base_config")
+        if not isinstance(base_config, dict):
+            raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}")
+        base_config["model"] = model_ref
     runtime_config_path.parent.mkdir(parents=True, exist_ok=True)
     runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")
 
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
new file mode 100644
index 0000000000..7413a074cc
--- /dev/null
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
@@ -0,0 +1,482 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Post-run analyzer for the nemo-guardrails IGW benchmark.
+
+Reads ``profile_export_aiperf.csv`` files from both variants in one run dir
+and prints a with-vs-without latency comparison. The delta isolates
+middleware overhead since the only difference between variants is whether
+middleware is attached to the targeted VirtualModel.
+
+Used both as a script (``python -m ... <run-dir>``) and auto-invoked from
+``run.py`` after a multi-variant sweep.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import logging
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+# Duplicated from `constants.py` so this module stays import-free and can
+# run on bare `python3` in CI without bootstrapping the uv workspace.
+VARIANT_WITH_GUARDRAILS = "with-guardrails"
+VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
+
+# --- CI baseline gate ---------------------------------------------------------
+# For each concurrency level we list:
+#   - The expected p50 latency delta between requests with guardrails vs.
+#     without guardrails.
+#   - The allowed plus/minus tolerance in CI. A check fails when the observed
+#     delta_p50 differs from the expected value by more than this tolerance.
+
+# Concurrency levels we check in CI.
+CONCURRENCIES_TO_VALIDATE: list[int] = [1, 2, 4, 8, 16, 32]
+
+# Tolerance (ms) used for every concurrency level unless overridden below.
+DEFAULT_DELTA_P50_TOLERANCE_MS: int = 150
+
+# Looser tolerance (ms) for higher concurrencies. With more requests in
+# flight at once, they contend for shared resources (the IGW event loop,
+# the mock-LLM workers, the CI runner's CPU), so we see more variance in
+# latency values.
+DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 450}
+
+# Estimated expected delta_p50 (ms) at each concurrency level, based on
+# a few sample runs in CI.
+DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int] = {
+    1: 1070,
+    2: 1110,
+    4: 1190,
+    8: 1230,
+    16: 1390,
+    32: 2110,
+}
+
+log = logging.getLogger(__name__)
+
+_LATENCY_METRIC = "Request Latency (ms)"
+
+# Mock-LLM time per request, subtracted to isolate platform overhead. Mirrors
+# `E2E_LATENCY_MEAN_SECONDS` in configs/mock_llm/*.env and the 2 CS calls
+# (input + output rails) of `content_safety_local`. Update in lock-step.
+_APP_MOCK_LATENCY_MS = 4000.0
+_CONTENT_SAFETY_MOCK_LATENCY_MS = 500.0
+_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST = 2
+_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS = _APP_MOCK_LATENCY_MS
+_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS = (
+    _APP_MOCK_LATENCY_MS + _CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST * _CONTENT_SAFETY_MOCK_LATENCY_MS
+)
+
+
+@dataclass(frozen=True)
+class LatencyRow:
+    """Per-concurrency latency stats parsed from one AIPerf CSV."""
+
+    concurrency: int
+    avg: float
+    p50: float
+    p90: float
+    p99: float
+    std: float
+
+
+@dataclass(frozen=True)
+class ComparisonRow:
+    """Side-by-side comparison of one concurrency level across variants."""
+
+    concurrency: int
+    with_guardrails: LatencyRow
+    without_guardrails: LatencyRow
+
+    @property
+    def delta_p50(self) -> float:
+        return self.with_guardrails.p50 - self.without_guardrails.p50
+
+    @property
+    def delta_p90(self) -> float:
+        return self.with_guardrails.p90 - self.without_guardrails.p90
+
+    @property
+    def delta_avg(self) -> float:
+        return self.with_guardrails.avg - self.without_guardrails.avg
+
+
+def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]:
+    """Load per-concurrency latency stats for one variant.
+
+    Walks the ``<batch>/<timestamp>/concurrency<N>/`` layout produced by
+    ``collect_sweep_results``. Missing CSVs are skipped, not raised, so
+    partial runs still produce a table.
+    """
+    if not variant_output_dir.is_dir():
+        return {}
+
+    latency_by_concurrency: dict[int, LatencyRow] = {}
+    for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()):
+        for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()):
+            for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()):
+                concurrency = _parse_concurrency_from_label(sweep_dir.name)
+                if concurrency is None:
+                    continue
+                csv_path = sweep_dir / "profile_export_aiperf.csv"
+                row = _read_latency_row(csv_path, concurrency)
+                if row is not None:
+                    latency_by_concurrency[concurrency] = row
+    return latency_by_concurrency
+
+
+def compare(
+    latency_by_concurrency_with_guardrails: dict[int, LatencyRow],
+    latency_by_concurrency_without_guardrails: dict[int, LatencyRow],
+) -> list[ComparisonRow]:
+    """Build per-concurrency comparison rows, sorted by concurrency.
+
+    Only levels present in both variants are compared; asymmetric levels are
+    logged at WARNING and excluded.
+    """
+    concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails)
+    concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails)
+    concurrencies_in_both_variants = sorted(concurrencies_with_guardrails & concurrencies_without_guardrails)
+
+    concurrencies_in_only_one_variant = sorted(concurrencies_with_guardrails ^ concurrencies_without_guardrails)
+    if concurrencies_in_only_one_variant:
+        log.warning(
+            "Concurrency levels present in only one variant, excluded from comparison: %s",
+            concurrencies_in_only_one_variant,
+        )
+
+    return [
+        ComparisonRow(
+            concurrency,
+            latency_by_concurrency_with_guardrails[concurrency],
+            latency_by_concurrency_without_guardrails[concurrency],
+        )
+        for concurrency in concurrencies_in_both_variants
+    ]
+
+
+def format_table(rows: list[ComparisonRow]) -> str:
+    """Render the comparison as a fixed-width text table."""
+    if not rows:
+        return "No comparable sweep results found (need both variants to share concurrency levels)."
+
+    header = (
+        "conc",
+        "with p50",
+        "w/o p50",
+        "delta p50",
+        "with p90",
+        "w/o p90",
+        "delta p90",
+        "with avg",
+        "w/o avg",
+        "delta avg",
+    )
+    fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
+    header_line = fmt.format(*header)
+    lines = ["Measured Latencies (ms), with and without guardrails:", header_line, "-" * len(header_line)]
+    for r in rows:
+        lines.append(
+            fmt.format(
+                r.concurrency,
+                f"{r.with_guardrails.p50:.0f}",
+                f"{r.without_guardrails.p50:.0f}",
+                f"{r.delta_p50:+.0f}",
+                f"{r.with_guardrails.p90:.0f}",
+                f"{r.without_guardrails.p90:.0f}",
+                f"{r.delta_p90:+.0f}",
+                f"{r.with_guardrails.avg:.0f}",
+                f"{r.without_guardrails.avg:.0f}",
+                f"{r.delta_avg:+.0f}",
+            )
+        )
+    lines.append("")
+    lines.append("delta = with-guardrails minus without-guardrails.")
+    return "\n".join(lines)
+
+
+def format_platform_overhead_table(rows: list[ComparisonRow]) -> str:
+    """Render a table with mock-LLM time subtracted from p50/p90/avg.
+
+    Isolates NMP + IGW + shim + middleware overhead from the much larger
+    mock sleeps. The delta columns are the middleware's own cost over the
+    bare path.
+    """
+    if not rows:
+        return "No comparable sweep results found (need both variants to share concurrency levels)."
+
+    header = (
+        "conc",
+        "with p50",
+        "w/o p50",
+        "delta p50",
+        "with p90",
+        "w/o p90",
+        "delta p90",
+        "with avg",
+        "w/o avg",
+        "delta avg",
+    )
+    fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}  {:>9}"
+    header_line = fmt.format(*header)
+    lines = ["Platform Overhead (ms), with and without guardrails:", header_line, "-" * len(header_line)]
+
+    for r in rows:
+        with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_p50 = r.without_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        with_p90 = r.with_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_p90 = r.without_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        with_avg = r.with_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS
+        without_avg = r.without_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS
+        lines.append(
+            fmt.format(
+                r.concurrency,
+                f"{with_p50:+.0f}",
+                f"{without_p50:+.0f}",
+                f"{with_p50 - without_p50:+.0f}",
+                f"{with_p90:+.0f}",
+                f"{without_p90:+.0f}",
+                f"{with_p90 - without_p90:+.0f}",
+                f"{with_avg:+.0f}",
+                f"{without_avg:+.0f}",
+                f"{with_avg - without_avg:+.0f}",
+            )
+        )
+    lines.append("")
+    lines.append(
+        f"Minus mock-LLM time "
+        f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms; "
+        f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms)."
+    )
+    return "\n".join(lines)
+
+
+def analyze_run(run_dir: Path) -> str:
+    """Read both variants from one run dir and return a printable report.
+
+    Output is the raw comparison table followed by a platform-overhead table
+    (mock time subtracted). Falls back to a single-variant table if only one
+    variant has results.
+    """
+    aiperf_dir = run_dir / "aiperf_results"
+    latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
+    latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
+
+    if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails:
+        return f"No AIPerf results found under {aiperf_dir}"
+    if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails:
+        if latency_by_concurrency_with_guardrails:
+            return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails)
+        return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails)
+
+    rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails)
+    return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}"
+
+
+def _load_comparison_rows(run_dir: Path) -> list[ComparisonRow]:
+    """Reload comparison rows from a run dir; returns ``[]`` if either variant is absent."""
+    aiperf_dir = run_dir / "aiperf_results"
+    with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS)
+    without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS)
+    if not with_guardrails or not without_guardrails:
+        return []
+    return compare(with_guardrails, without_guardrails)
+
+
+@dataclass(frozen=True)
+class LatencyReport:
+    """Latency results for a single concurrency level, rendered as one row of the report.
+
+    Each instance represents a single concurrency level from the benchmark
+    run: what we measured (observed_ms), what we expected from the
+    baseline (baseline_ms), and how much they're allowed to differ
+    (tolerance_ms).
+    The check passes when |observed_ms - baseline_ms| <= tolerance_ms.
+    """
+
+    concurrency: int
+    metric: str
+    baseline_ms: float
+    observed_ms: float
+    tolerance_ms: float
+
+    @property
+    def diff_ms(self) -> float:
+        return self.observed_ms - self.baseline_ms
+
+    @property
+    def passed(self) -> bool:
+        return abs(self.diff_ms) <= self.tolerance_ms
+
+
+def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]:
+    """Compare ``delta_p50`` at each concurrency in ``CONCURRENCIES_TO_VALIDATE`` against its baseline.
+
+    Returns ``(report_text, failed_count)``. Concurrencies missing from
+    either the run or the baseline are skipped with a note, not counted as failed.
+    """
+    rows_by_concurrency = {r.concurrency: r for r in rows}
+
+    latency_reports: list[LatencyReport] = []
+    skipped_concurrencies: list[int] = []
+
+    for concurrency in sorted(CONCURRENCIES_TO_VALIDATE):
+        if concurrency not in rows_by_concurrency or concurrency not in DELTA_P50_BASELINE_BY_CONCURRENCY:
+            skipped_concurrencies.append(concurrency)  # listed in the report footer
+            continue
+        latency_reports.append(
+            LatencyReport(
+                concurrency=concurrency,
+                metric="delta_p50",
+                baseline_ms=float(DELTA_P50_BASELINE_BY_CONCURRENCY[concurrency]),
+                observed_ms=rows_by_concurrency[concurrency].delta_p50,
+                tolerance_ms=float(DELTA_P50_TOLERANCE_OVERRIDES_MS.get(concurrency, DEFAULT_DELTA_P50_TOLERANCE_MS)),
+            )
+        )
+
+    fmt = "{:>9}  {:>4}  {:>10}  {:>10}  {:>9}  {:>11}  {:>6}"
+    header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status")
+    lines = [
+        "Guardrails Overhead vs. Baseline (ms):",
+        header_line,
+        "-" * len(header_line),
+    ]
+    failed_count = 0
+    for report in latency_reports:
+        status = "PASS" if report.passed else "FAIL"
+        if not report.passed:
+            failed_count += 1
+        lines.append(
+            fmt.format(
+                report.metric,
+                report.concurrency,
+                f"{report.baseline_ms:.0f}",
+                f"{report.observed_ms:.0f}",
+                f"{report.diff_ms:+.0f}",
+                f"±{report.tolerance_ms:.0f}ms",
+                status,
+            )
+        )
+    if skipped_concurrencies:
+        lines.append("")
+        lines.append(f"Skipped (missing from results or baseline): {skipped_concurrencies}")
+    if failed_count:
+        lines.append("")
+        lines.append(f"FAIL: {failed_count} of {len(latency_reports)} check(s) exceeded tolerance.")
+
+    return "\n".join(lines), failed_count
+
+
+def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str:
+    """Build the fallback latency table shown when only ``variant`` produced results."""
+    row_fmt = "{:>4}  {:>9}  {:>9}  {:>9}  {:>9}"
+    heading = row_fmt.format("conc", "avg", "p50", "p90", "std")
+    # Title, column heading, then a rule exactly as wide as the heading.
+    table = [f"Only one variant present: {variant}", heading, "-" * len(heading)]
+    # One row per concurrency in ascending order; every value is rendered
+    # with zero decimal places.
+    table.extend(
+        row_fmt.format(conc, f"{stats.avg:.0f}", f"{stats.p50:.0f}", f"{stats.p90:.0f}", f"{stats.std:.0f}")
+        for conc, stats in sorted(latency_by_concurrency.items(), key=lambda item: item[0])
+    )
+    # Blank spacer line before the units footnote.
+    table += ["", "All values in milliseconds."]
+    return "\n".join(table)
+
+
+def _parse_concurrency_from_label(label: str) -> int | None:
+    """Extract N from a sweep label like ``concurrency16``; ``None`` otherwise."""
+    suffix = label.removeprefix("concurrency")
+    # Accept plain ASCII digits only: bare ``int()`` would also parse signs,
+    # whitespace and underscores (``concurrency-4`` -> -4, ``concurrency1_6`` -> 16).
+    if not label.startswith("concurrency") or not (suffix.isascii() and suffix.isdigit()):
+        return None
+    return int(suffix)
+
+
+def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None:
+    """Pull the ``Request Latency (ms)`` row from an AIPerf CSV's first block; ``None`` if absent or unparseable."""
+    if not csv_path.is_file():
+        log.debug("Missing CSV at %s; skipping", csv_path)
+        return None
+
+    try:
+        with csv_path.open(encoding="utf-8") as f:
+            reader = csv.reader(f)
+            header = next(reader, None)  # None for an empty file
+            if not header or header[0] != "Metric":
+                log.warning("Unexpected header in %s: %s", csv_path, header)
+                return None
+            try:
+                col = {name: header.index(name) for name in ("avg", "p50", "p90", "p99", "std")}
+            except ValueError as exc:
+                log.warning("Missing expected column in %s: %s", csv_path, exc)
+                return None
+            for row in reader:
+                if not row:
+                    break  # end of first block
+                if row[0] == _LATENCY_METRIC:
+                    return LatencyRow(
+                        concurrency=concurrency,
+                        avg=float(row[col["avg"]]),
+                        p50=float(row[col["p50"]]),
+                        p90=float(row[col["p90"]]),
+                        p99=float(row[col["p99"]]),
+                        std=float(row[col["std"]]),
+                    )
+    except (OSError, ValueError, IndexError) as exc:  # unreadable file, non-numeric cell, short row
+        log.warning("Failed to parse %s: %s", csv_path, exc)
+        return None
+
+    log.warning("Did not find '%s' row in %s", _LATENCY_METRIC, csv_path)
+    return None
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="nemo-guardrails-benchmark-analyze",
+        description=__doc__,
+    )
+    parser.add_argument(
+        "run_dir",
+        type=Path,
+        help="Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/`.",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Exit non-zero when any baseline check exceeds tolerance. CI sets this; local runs default off so you can iterate without the gate failing.",
+    )
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
+    )
+    args = parser.parse_args(argv)
+
+    logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s")
+
+    run_dir: Path = args.run_dir.resolve()
+    if not run_dir.is_dir():
+        print(f"Not a directory: {run_dir}", file=sys.stderr)
+        return 2  # bad input: there is nothing to analyze
+
+    print(analyze_run(run_dir))
+
+    rows = _load_comparison_rows(run_dir)
+    if not rows:
+        print("Skipping baseline check: no comparable rows from this run.", file=sys.stderr)
+        return 0 if not args.strict else 2  # --strict treats an empty comparison as an error
+
+    report, failed_count = check_against_baseline(rows)
+    print()
+    print(report)
+    return 1 if (args.strict and failed_count) else 0  # failures only gate the exit code under --strict
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
index cbe4b19960..14db3561f4 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py
@@ -8,6 +8,17 @@
 WORKSPACE = "benchmark"
 GUARDRAIL_CONFIG = "content-safety-local"
 VM_NAME = "guardrails-vm"
+# Control VirtualModel with no middleware attached. Used by the benchmark
+# harness to measure NMP+IGW latency *without* the guardrails middleware so
+# the with-vs-without delta isolates middleware overhead.
+NO_GUARDRAILS_VM_NAME = "no-guardrails-vm"
+
+# Logical identifiers for the two benchmark variants. Used as subdirectory
+# names under `aiperf_results/`, in the per-variant AIPerf config and log
+# file names, and as the value of the harness's `--variant` flag.
+VARIANT_WITH_GUARDRAILS = "with-guardrails"
+VARIANT_WITHOUT_GUARDRAILS = "without-guardrails"
+ALL_VARIANTS = (VARIANT_WITH_GUARDRAILS, VARIANT_WITHOUT_GUARDRAILS)
 
 # ModelProvider that proxies requests to the mock main model
 APP_PROVIDER = "benchmark-app-llm"
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
index b1113fa2c2..00af57580a 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py
@@ -54,6 +54,10 @@ class RunPaths:
     nmp_data_dir: Path
     # Checked-in YAML template for the AIPerf sweep config.
     config_template: Path
+    # In-repo mock-LLM `.env` files. Versioned with the benchmark so mock
+    # behavior is independent of the NeMo-Guardrails checkout.
+    mock_app_env: Path
+    mock_content_safety_env: Path
     # Per-run materialized copy of `config_template` with `output_base_dir`
     # overridden to `aiperf_output_dir`. AIPerf is invoked against this file.
     runtime_config: Path
@@ -75,6 +79,18 @@ def ensure_directories(self) -> None:
         ):
             path.mkdir(parents=True, exist_ok=True)
 
+    def aiperf_output_dir_for(self, variant: str) -> Path:
+        """Per-variant AIPerf output dir; keeps side-by-side sweeps from colliding."""
+        return self.aiperf_output_dir / variant
+
+    def runtime_config_for(self, variant: str) -> Path:
+        """Per-variant materialized AIPerf config under ``generated/``."""
+        return self.generated_dir / f"aiperf_config_{variant}.yaml"
+
+    def aiperf_log_for(self, variant: str) -> Path:
+        """Per-variant AIPerf stdout/stderr log under ``logs/``."""
+        return self.log_dir / f"aiperf_{variant}.log"
+
 
 def _now_run_id() -> str:
     return dt.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -120,4 +136,6 @@ def build_run_paths(
         config_template=benchmark_dir / "configs" / "nmp_igw_guardrails_sweep_concurrency.yaml",
         runtime_config=run_dir / "generated" / "nmp_igw_guardrails_sweep_concurrency.yaml",
         aiperf_venv_dir=artifacts_dir / "venvs" / "aiperf",
+        mock_app_env=benchmark_dir / "configs" / "mock_llm" / "app-llm.env",
+        mock_content_safety_env=benchmark_dir / "configs" / "mock_llm" / "content-safety-llm.env",
     )
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
index 47114e0195..2fc8e7421e 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py
@@ -25,21 +25,27 @@
 import sys
 import time
 from contextlib import ExitStack
+from dataclasses import dataclass
 from pathlib import Path
 
 from nemo_guardrails_plugin.benchmarks.aiperf_runner import (
+    SweepRunResult,
     collect_sweep_results,
     prepare_runtime_aiperf_config,
     run_aiperf_sweep,
 )
+from nemo_guardrails_plugin.benchmarks.analyze import analyze_run
 from nemo_guardrails_plugin.benchmarks.bootstrap import ensure_aiperf_venv
 from nemo_guardrails_plugin.benchmarks.constants import (
     AIPERF_SHIM_BASE_URL,
+    ALL_VARIANTS,
     APP_PROVIDER_URL,
     CS_PROVIDER_URL,
     IGW_CHAT_PATH,
     NMP_BASE_URL,
     NMP_HEALTH_PATH,
+    VARIANT_WITH_GUARDRAILS,
+    VARIANT_WITHOUT_GUARDRAILS,
     WORKSPACE,
 )
 from nemo_guardrails_plugin.benchmarks.paths import (
@@ -66,8 +72,6 @@
     Path("benchmark/aiperf/__main__.py"),
     Path("benchmark/aiperf/run_aiperf.py"),
     Path("benchmark/mock_llm_server/run_server.py"),
-    Path("benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env"),
-    Path("benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env"),
     Path("examples/configs/content_safety_local/config.yml"),
     Path("examples/configs/content_safety_local/prompts.yml"),
 )
@@ -91,6 +95,18 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None:
         )
 
 
+def _validate_in_repo_mock_configs(paths: RunPaths) -> None:
+    """Raise ``FileNotFoundError`` naming every in-repo mock LLM env file that is absent.
+
+    The env files are versioned in this repo rather than upstream, which keeps
+    mock behavior independent of the NeMo-Guardrails checkout.
+    """
+    required = (paths.mock_app_env, paths.mock_content_safety_env)
+    bullet = "\n  - ".join(str(env_file) for env_file in required if not env_file.is_file())
+    if bullet:
+        raise FileNotFoundError(f"In-repo mock LLM config files missing:\n  - {bullet}")
+
+
 def _build_mock_nim_processes(paths: RunPaths, workers: int) -> list[SupervisedProcess]:
     """Spawn ``python -m benchmark.mock_llm_server.run_server`` for both mocks.
 
@@ -122,20 +138,18 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised
             health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS,
         )
 
+    # Env files come from this repo, rather than the upstream library.
     return [
-        # Main LLM mock server
         spec(
             "mock-app-llm",
             8000,
-            paths.nemoguardrails_repo_root / "benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env",
+            paths.mock_app_env,
             health_url=f"{APP_PROVIDER_URL}/health",
         ),
-        # Content-safety LLM mock server
         spec(
             "mock-content-safety-llm",
             8001,
-            paths.nemoguardrails_repo_root
-            / "benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env",
+            paths.mock_content_safety_env,
             health_url=f"{CS_PROVIDER_URL}/health",
         ),
     ]
@@ -205,6 +219,109 @@ def _smoke_test(client: NeMoPlatform, seeded: SeededResources) -> None:
     raise RuntimeError(f"Smoke test failed after 60 attempts: {last_error}")
 
 
+@dataclass(frozen=True)
+class BenchmarkOutcome:
+    """What one variant's AIPerf sweep produced; consumed when summarizing the run."""
+
+    variant: str
+    aiperf_exit: int
+    output_dir: Path
+    sweep_results: list[SweepRunResult]
+
+    @property
+    def failures(self) -> int:
+        return len([result for result in self.sweep_results if not result.passed])
+
+    @property
+    def passed(self) -> bool:
+        return not (self.aiperf_exit != 0 or not self.sweep_results or self.failures != 0)
+
+
+def _vm_ref_for_variant(variant: str, seeded: SeededResources) -> str:
+    """Return the seeded VirtualModel reference that ``variant`` benchmarks against."""
+    if variant == VARIANT_WITHOUT_GUARDRAILS:
+        return seeded.no_guardrails_vm_ref
+    if variant != VARIANT_WITH_GUARDRAILS:
+        raise ValueError(f"Unknown variant: {variant!r}")
+    return seeded.vm_ref
+
+
+def _run_benchmark(
+    *,
+    variant: str,
+    paths: RunPaths,
+    seeded: SeededResources,
+    aiperf_python: Path,
+) -> BenchmarkOutcome:
+    """Materialize ``variant``'s AIPerf config, run its sweep, and return the collected outcome."""
+    vm_ref = _vm_ref_for_variant(variant, seeded)
+    runtime_config = paths.runtime_config_for(variant)
+    aiperf_output_dir = paths.aiperf_output_dir_for(variant)
+    aiperf_log = paths.aiperf_log_for(variant)
+
+    sweep_config = prepare_runtime_aiperf_config(
+        template_path=paths.config_template,
+        runtime_config_path=runtime_config,
+        aiperf_output_dir=aiperf_output_dir,
+        model_ref=vm_ref,
+    )
+    log.info(
+        "Benchmark %s: targeting %s; concurrency=%s, duration=%ss",
+        variant,
+        vm_ref,
+        sweep_config.get("sweeps", {}).get("concurrency"),
+        sweep_config.get("base_config", {}).get("benchmark_duration"),
+    )
+    log.info(
+        "Starting AIPerf sweep [%s] against %s -> shim -> %s%s",
+        variant,
+        AIPERF_SHIM_BASE_URL,
+        NMP_BASE_URL,
+        IGW_CHAT_PATH,
+    )
+
+    aiperf_exit = run_aiperf_sweep(
+        nemoguardrails_repo_root=paths.nemoguardrails_repo_root,
+        runtime_config=runtime_config,
+        log_path=aiperf_log,
+        python_executable=str(aiperf_python),
+        venv_bin_path=paths.aiperf_venv_dir / "bin",
+    )
+
+    sweep_results = collect_sweep_results(aiperf_output_dir)  # collected whatever aiperf_exit is
+    return BenchmarkOutcome(
+        variant=variant,
+        aiperf_exit=aiperf_exit,
+        output_dir=aiperf_output_dir,
+        sweep_results=sweep_results,
+    )
+
+
+def _summarize_benchmark_results(outcomes: list[BenchmarkOutcome]) -> int:
+    """Log one summary line per variant; return 1 if any variant did not pass, else 0."""
+    for outcome in outcomes:
+        if outcome.sweep_results:
+            log.info(
+                "Benchmark %s: %d run(s), %d failure(s); per-sweep outputs under %s",
+                outcome.variant,
+                len(outcome.sweep_results),
+                outcome.failures,
+                outcome.output_dir,
+            )
+        else:
+            # AIPerf produced no per-sweep results at all; report its exit
+            # code so the reason for the failure is visible in the log.
+            log.error(
+                "Benchmark %s: aiperf exited with code %d and produced no per-sweep results in %s",
+                outcome.variant,
+                outcome.aiperf_exit,
+                outcome.output_dir,
+            )
+
+    # The run fails as a whole as soon as a single variant is not a pass.
+    return 0 if all(outcome.passed for outcome in outcomes) else 1
+
+
 def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         prog="nemo-guardrails-benchmark",
@@ -244,10 +361,28 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=None,
         help="Override the per-run directory name (default: current timestamp).",
     )
+    parser.add_argument(
+        "--variant",
+        choices=(*ALL_VARIANTS, "all"),
+        default="all",
+        help=(
+            "Which sweep to run. 'all' (default) runs both variants sequentially "
+            "against the same NMP; the with-vs-without delta isolates middleware "
+            "overhead. In CI, run the two variants as parallel jobs against "
+            "separate NMP instances."
+        ),
+    )
     parser.add_argument("--verbose", "-v", action="store_true")
     return parser.parse_args(argv)
 
 
+def _resolve_variants(variant_arg: str) -> tuple[str, ...]:
+    """Expand the ``--variant`` CLI value into the ordered tuple of variants to run."""
+    # "all" means every known variant in declaration order; any other value
+    # is run on its own as a one-element tuple.
+    return ALL_VARIANTS if variant_arg == "all" else (variant_arg,)
+
+
 def main(argv: list[str] | None = None) -> int:
     args = parse_args(argv)
     _configure_logging(args.verbose)
@@ -266,20 +401,18 @@ def main(argv: list[str] | None = None) -> int:
         run_id=args.run_id,
     )
     paths.ensure_directories()
+    _validate_in_repo_mock_configs(paths)
 
     log.info("Created directory for benchmark results at: %s", paths.run_dir)
-
-    sweep_config = prepare_runtime_aiperf_config(
-        template_path=paths.config_template,
-        runtime_config_path=paths.runtime_config,
-        aiperf_output_dir=paths.aiperf_output_dir,
-    )
     log.info(
-        "AIPerf sweep: concurrency=%s, duration=%ss",
-        sweep_config.get("sweeps", {}).get("concurrency"),
-        sweep_config.get("base_config", {}).get("benchmark_duration"),
+        "Mock LLM configs: app=%s, content-safety=%s",
+        paths.mock_app_env,
+        paths.mock_content_safety_env,
     )
 
+    variants = _resolve_variants(args.variant)
+    log.info("Will run %d variant(s): %s", len(variants), ", ".join(variants))
+
     # Ensure the dedicated aiperf venv exists *before* we start any supervised
     # processes.
     aiperf_python = ensure_aiperf_venv(paths.aiperf_venv_dir)
@@ -318,43 +451,38 @@ def main(argv: list[str] | None = None) -> int:
         log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref)
         _smoke_test(client, seeded)
 
-        log.info(
-            "Starting AIPerf sweep against %s -> shim -> %s%s",
-            AIPERF_SHIM_BASE_URL,
-            NMP_BASE_URL,
-            IGW_CHAT_PATH,
-        )
-        aiperf_exit = run_aiperf_sweep(
-            nemoguardrails_repo_root=paths.nemoguardrails_repo_root,
-            runtime_config=paths.runtime_config,
-            log_path=paths.log_dir / "aiperf.log",
-            python_executable=str(aiperf_python),
-            venv_bin_path=paths.aiperf_venv_dir / "bin",
-        )
+        # Variants run sequentially against the same NMP; only the targeted
+        # VirtualModel differs, so the delta isolates middleware overhead.
+        outcomes: list[BenchmarkOutcome] = []
+        for variant in variants:
+            outcomes.append(
+                _run_benchmark(
+                    variant=variant,
+                    paths=paths,
+                    seeded=seeded,
+                    aiperf_python=aiperf_python,
+                )
+            )
 
-    sweep_results = collect_sweep_results(paths.aiperf_output_dir)
-    failures = sum(1 for r in sweep_results if not r.passed)
+    exit_code = _summarize_benchmark_results(outcomes)
+    _maybe_print_analysis(paths.run_dir, outcomes)
+    return exit_code
 
-    if not sweep_results:
-        # AIPerf exited before producing any per-sweep dirs. Surface that
-        # explicitly so the log isn't ambiguous about why we're failing.
-        log.error(
-            "aiperf exited with code %d and produced no per-sweep results in %s",
-            aiperf_exit,
-            paths.aiperf_output_dir,
-        )
-    else:
-        log.info(
-            "Sweep summary: %d run(s), %d failure(s); per-sweep outputs under %s",
-            len(sweep_results),
-            failures,
-            paths.aiperf_output_dir,
-        )
 
-    if failures or aiperf_exit != 0 or not sweep_results:
-        return 1
+def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None:
+    """Log the analyzer's comparison table when at least one variant has results.
 
-    return 0
+    Wrapped in a broad try/except: the analyzer is post-processing only and
+    must not change the harness's exit code or hide a real benchmark failure.
+    """
+    if not any(o.sweep_results for o in outcomes):
+        return
+    try:
+        report = analyze_run(run_dir)
+    except Exception as exc:
+        log.warning("Analyzer failed; skipping summary table: %s", exc)
+        return
+    log.info("Benchmark analysis:\n%s", report)
 
 
 if __name__ == "__main__":
diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
index 96cd152f0f..cde4f31b3f 100644
--- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
+++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py
@@ -28,6 +28,7 @@
     GUARDRAIL_CONFIG,
     GUARDRAILS_MIDDLEWARE_CONFIG_TYPE,
     GUARDRAILS_MIDDLEWARE_NAME,
+    NO_GUARDRAILS_VM_NAME,
     VM_NAME,
     WORKSPACE,
 )
@@ -52,11 +53,17 @@ class SeededResources:
     cs_model_entity: str
     guardrail_config_name: str
     vm_name: str
+    # Control VM with no middleware; otherwise identical to the guardrails VM.
+    no_guardrails_vm_name: str
 
     @property
     def vm_ref(self) -> str:
         return f"{self.workspace}/{self.vm_name}"
 
+    @property
+    def no_guardrails_vm_ref(self) -> str:
+        return f"{self.workspace}/{self.no_guardrails_vm_name}"
+
     @property
     def guardrail_config_ref(self) -> str:
         return f"{self.workspace}/{self.guardrail_config_name}"
@@ -171,6 +178,20 @@ def seed_benchmark(
     )
     _dump_model(generated_dir / "virtual_model.json", vm)
 
+    # Control VM: identical to the guardrails VM but no middleware, so the
+    # with-vs-without delta isolates middleware overhead.
+    log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME)
+    no_guardrails_vm = client.inference.virtual_models.create(
+        workspace=WORKSPACE,
+        name=NO_GUARDRAILS_VM_NAME,
+        default_model_entity=app_entity,
+        models=vm_models,
+        request_middleware=[],
+        response_middleware=[],
+        exist_ok=True,
+    )
+    _dump_model(generated_dir / "virtual_model_no_guardrails.json", no_guardrails_vm)
+
     return SeededResources(
         workspace=WORKSPACE,
         app_provider_name=APP_PROVIDER,
@@ -179,6 +200,7 @@ def seed_benchmark(
         cs_model_entity=cs_entity,
         guardrail_config_name=GUARDRAIL_CONFIG,
         vm_name=VM_NAME,
+        no_guardrails_vm_name=NO_GUARDRAILS_VM_NAME,
     )
 
 
diff --git a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
index f0ad57aeaa..8342315e13 100644
--- a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
+++ b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py
@@ -14,6 +14,7 @@
     CS_MODEL_NAME,
     CS_PROVIDER,
     GUARDRAIL_CONFIG,
+    NO_GUARDRAILS_VM_NAME,
     VM_NAME,
     WORKSPACE,
 )
@@ -145,12 +146,18 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path
         cs_entity = seeded.cs_model_entity
         assert gc_call.kwargs["data"]["models"][0]["model"] == cs_entity
 
-        # VirtualModel uses the discovered app entity and points middleware at the
-        # guardrail config we just created.
-        vm_call = fake_client.inference.virtual_models.create.call_args
-        assert vm_call.kwargs["name"] == VM_NAME
-        assert vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
-        assert vm_call.kwargs["models"] == [{"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}]
+        # Two VirtualModels are created: the guardrails VM (with middleware) and
+        # a control VM (no middleware) used by the without-guardrails benchmark
+        # variant.
+        vm_calls = fake_client.inference.virtual_models.create.call_args_list
+        assert len(vm_calls) == 2
+
+        guardrails_vm_call = vm_calls[0]
+        assert guardrails_vm_call.kwargs["name"] == VM_NAME
+        assert guardrails_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
+        assert guardrails_vm_call.kwargs["models"] == [
+            {"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}
+        ]
         expected_middleware = [
             {
                 "name": "nemo-guardrails",
@@ -158,8 +165,14 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path
                 "config_id": f"{WORKSPACE}/{GUARDRAIL_CONFIG}",
             }
         ]
-        assert vm_call.kwargs["request_middleware"] == expected_middleware
-        assert vm_call.kwargs["response_middleware"] == expected_middleware
+        assert guardrails_vm_call.kwargs["request_middleware"] == expected_middleware
+        assert guardrails_vm_call.kwargs["response_middleware"] == expected_middleware
+
+        control_vm_call = vm_calls[1]
+        assert control_vm_call.kwargs["name"] == NO_GUARDRAILS_VM_NAME
+        assert control_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity
+        assert control_vm_call.kwargs["request_middleware"] == []
+        assert control_vm_call.kwargs["response_middleware"] == []
 
     def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path: Path) -> None:
         ng_root = tmp_path / "NeMo-Guardrails"
@@ -176,6 +189,7 @@ def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path
         assert (generated_dir / "app_provider.json").is_file()
         assert (generated_dir / "content_safety_provider.json").is_file()
         assert (generated_dir / "virtual_model.json").is_file()
+        assert (generated_dir / "virtual_model_no_guardrails.json").is_file()
 
         request_payload = json.loads(
             (generated_dir / "content_safety_local_nmp_request.json").read_text(encoding="utf-8")
@@ -197,6 +211,7 @@ def test_returns_seeded_resources(self, fake_client: MagicMock, tmp_path: Path)
 
         assert seeded.workspace == WORKSPACE
         assert seeded.vm_ref == f"{WORKSPACE}/{VM_NAME}"
+        assert seeded.no_guardrails_vm_name == NO_GUARDRAILS_VM_NAME
         assert seeded.guardrail_config_ref == f"{WORKSPACE}/{GUARDRAIL_CONFIG}"
 
     def test_raises_if_served_models_never_populated(self, tmp_path: Path) -> None: