From 9addf6179104bd3e0600d72c9ffdb0e60aeee3a9 Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Tue, 16 Jun 2026 17:08:41 -0400 Subject: [PATCH 1/8] feat(guardrails): Benchmark analysis in-progress Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 84 +++++- .../benchmarks/configs/mock_llm/README.md | 21 ++ .../benchmarks/configs/mock_llm/app-llm.env | 19 ++ .../configs/mock_llm/content-safety-llm.env | 19 ++ .../results/local-baseline-2026-06-16.md | 154 ++++++++++ .../benchmarks/aiperf_runner.py | 12 +- .../benchmarks/analyze.py | 270 ++++++++++++++++++ .../benchmarks/constants.py | 11 + .../benchmarks/paths.py | 26 ++ .../nemo_guardrails_plugin/benchmarks/run.py | 238 +++++++++++---- .../benchmarks/seeding.py | 26 ++ 11 files changed, 826 insertions(+), 54 deletions(-) create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env create mode 100644 plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env create mode 100644 plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md create mode 100644 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f8643c9cb5..40d8689002 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1072,10 +1072,21 @@ jobs: path: web/packages/studio/playwright-report/ benchmark-guardrails: - name: Guardrails plugin benchmark + # Run the two benchmark variants as parallel matrix jobs, each with its + # own NMP instance. This isolates them from each other (no shared mocks, + # no cross-talk on :8080) and roughly halves wall-clock vs. the previous + # single-job sequential layout. The `benchmark-guardrails-analyze` job + # below merges both artifacts and prints the with-vs-without comparison. + name: Guardrails plugin benchmark (${{ matrix.variant }}) if: github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest timeout-minutes: 30 + strategy: + # Don't cancel the other variant if one fails; the partial artifact + # is still useful for diagnosing what went wrong. + fail-fast: false + matrix: + variant: [with-guardrails, without-guardrails] steps: - name: Checkout nemo-platform uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -1102,7 +1113,13 @@ jobs: PYTORCH_DEPS: cpu - name: Run benchmark sweep working-directory: nemo-platform - run: make benchmark-guardrails + # Pin both variants to the same `--run-id` so when the analyze job + # downloads both artifacts into one `runs/` parent, they merge into + # a single run directory the analyzer can read normally. + run: | + make benchmark-guardrails BENCHMARK_ARGS="\ + --variant ${{ matrix.variant }} \ + --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}" env: NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails _TYPER_FORCE_DISABLE_TERMINAL: "1" @@ -1110,7 +1127,67 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: benchmark-guardrails-results + # Per-variant artifact name; GHA disallows two artifacts with the + # same name in one workflow run. + name: benchmark-guardrails-results-${{ matrix.variant }} + retention-days: 30 + path: | + nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + + benchmark-guardrails-analyze: + # Joins the two parallel matrix jobs above, downloads both artifacts into + # one merged `runs//` tree, and prints the with-vs-without + # comparison table. No regression gate yet: this exists to produce + # CI-collected numbers we can use to seed a baseline file later. + name: Guardrails benchmark analysis + needs: [benchmark-guardrails] + if: github.event_name == 'workflow_dispatch' && !cancelled() + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout nemo-platform + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + path: nemo-platform + - name: Download with-guardrails artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: benchmark-guardrails-results-with-guardrails + path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + - name: Download without-guardrails artifact + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: benchmark-guardrails-results-without-guardrails + path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + - name: Install uv + uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + with: + working-directory: nemo-platform + python-version: "3.11" + enable-cache: true + - name: Bootstrap Python environment + working-directory: nemo-platform + run: make bootstrap-python + env: + PYTORCH_DEPS: cpu + - name: Print benchmark comparison + working-directory: nemo-platform + # Both matrix jobs above use the same `--run-id`, so there should be + # exactly one merged run directory under `runs/`. `ls -td ... | head -1` + # is defensive in case that ever stops being true (e.g. an artifact + # ever leaks in from a previous workflow attempt). + run: | + RUN_DIR=$(ls -td plugins/nemo-guardrails/benchmarks/artifacts/runs/*/ | head -1) + echo "Analyzing run directory: $RUN_DIR" + uv run --frozen --package nemo-guardrails-plugin --extra bench \ + python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR" + - name: Upload merged benchmark artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + # Single merged artifact so collecting baseline samples is a matter + # of downloading one thing per workflow run rather than two. + name: benchmark-guardrails-results-merged retention-days: 30 path: | nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ @@ -1262,6 +1339,7 @@ jobs: - web-studio-deps - web-studio-e2e - benchmark-guardrails + - benchmark-guardrails-analyze - opa-policy-test if: always() runs-on: ubuntu-latest diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md new file mode 100644 index 0000000000..1cd5cc3dd4 --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md @@ -0,0 +1,21 @@ +# Mock LLM configurations + +These `.env` files configure the upstream `benchmark.mock_llm_server.run_server` +(from the `NeMo-Guardrails` checkout) for the IGW guardrails benchmark. + +We keep our own copies (instead of pointing at the upstream checkout's +`benchmark/mock_llm_server/configs/`) so: + +- We can change mock latency without touching the upstream repo. The original + motivation was tuning `E2E_LATENCY_*` to isolate NMP+middleware overhead + from mandatory NIM sleep (see the benchmark README for the full rationale). +- The exact mock behavior we benchmarked against is versioned alongside the + results, so historical numbers stay reproducible even if upstream changes + its defaults. + +Initial contents are a verbatim copy of the upstream files: + +- `app-llm.env` ← upstream `meta-llama-3.3-70b-instruct.env` +- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env` + +Update either file to change mock behavior for the next benchmark run. diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env new file mode 100644 index 0000000000..0cec095b38 --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env @@ -0,0 +1,19 @@ +MODEL="meta/llama-3.3-70b-instruct" +UNSAFE_PROBABILITY=0.0 +UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?" +SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=4.0 +E2E_LATENCY_MAX_SECONDS=4.0 +E2E_LATENCY_MEAN_SECONDS=4.0 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.3 +TTFT_MAX_SECONDS=0.3 +TTFT_MEAN_SECONDS=0.3 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env new file mode 100644 index 0000000000..9e2467b1db --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env @@ -0,0 +1,19 @@ +MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety" +UNSAFE_PROBABILITY=0.0 +UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}" +SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}" +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=0.5 +E2E_LATENCY_MAX_SECONDS=0.5 +E2E_LATENCY_MEAN_SECONDS=0.5 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.2 +TTFT_MAX_SECONDS=0.2 +TTFT_MEAN_SECONDS=0.2 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md new file mode 100644 index 0000000000..509c9e8c62 --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md @@ -0,0 +1,154 @@ +# Local baseline results — 2026-06-16 + +Three back-to-back runs of `make benchmark-guardrails` on a local MacBook Pro +(Apple Silicon), no other heavy workloads running. Goal: characterize the +run-to-run variance of the new with-guardrails / without-guardrails harness so +we can decide what's gateable in CI. + +## Hardware / setup + +- Host: MacBook Pro, Apple Silicon, on AC power +- NMP, mocks, shim: all on localhost +- Mock LLM config: in-repo defaults (`plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) + - app LLM: 4.0s e2e latency, std 0 + - content-safety LLM: 0.5s e2e latency, std 0 +- AIPerf sweep: concurrency `[1, 2, 4, 8, 16, 32, 64]`, `benchmark_duration: 60s`, + `warmup_request_count: 10`, non-streaming chat completions +- Mock workers: 4 (default) +- Three runs in the same afternoon, NMP data dir reused across runs + +## Run inventory + +| Run | Run dir | Notes | +|---|---|---| +| 1 | `20260616_123851` | first run after the with/without harness change | +| 2 | `20260616_145058` | identical config | +| 3 | `20260616_152834` | identical config | + +All three runs completed with 7/7 sweeps passing per variant, exit code 0. + +## Δp50 (with-guardrails − without-guardrails), milliseconds + +This is the headline metric: how much wall-clock time the guardrails middleware +adds on top of the bare NMP+IGW path, including the two content-safety LLM +round-trips that the rails cause but don't do themselves. + +| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | +|---------|-----:|-----:|-----:|-----:|-----:|-----:|--------:| +| Run 1 | 1029 | 1071 | 1068 | 1104 | 1145 | 1260 | 778 | +| Run 2 | 1027 | 1062 | 1096 | 1105 | 1226 | 1256 | -2896 | +| Run 3 | 1030 | 1062 | 1079 | 1070 | 1118 | 1201 | -2077 | +| **mean**| **1029** | **1065** | **1081** | **1093** | **1163** | **1239** | **−1398** | +| range | 3 | 9 | 28 | 35 | 108 | 59 | 3674 | +| range % | 0.3% | 0.8% | 2.6% | 3.2% | 9.3% | 4.8% | n/a | + +## with-guardrails p50 (absolute), milliseconds + +Useful as a sanity check that nothing catastrophic shifted in the absolute +numbers — even if Δp50 stays steady, both variants could slow down together. + +| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | +|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| Run 1 | 5049 | 5101 | 5114 | 5152 | 5201 | 5318 | 6164 | +| Run 2 | 5048 | 5093 | 5125 | 5137 | 5255 | 5279 | 5614 | +| Run 3 | 5050 | 5094 | 5123 | 5146 | 5163 | 5250 | 5486 | +| **mean**| **5049** | **5096** | **5121** | **5145** | **5206** | **5282** | **5755** | +| range | 2 | 8 | 11 | 15 | 92 | 68 | 678 | +| range % | 0.0% | 0.2% | 0.2% | 0.3% | 1.8% | 1.3% | 11.8%| + +## without-guardrails p50 (absolute), milliseconds + +For completeness. This is the variant that's wildly unstable at c=64. + +| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | +|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| Run 1 | 4020 | 4030 | 4045 | 4048 | 4056 | 4058 | 5386 | +| Run 2 | 4020 | 4031 | 4029 | 4032 | 4029 | 4023 | 8510 | +| Run 3 | 4020 | 4032 | 4044 | 4076 | 4045 | 4049 | 7563 | +| **mean**| **4020** | **4031** | **4039** | **4052** | **4043** | **4043** | **7153** | +| range | 0 | 2 | 16 | 44 | 27 | 35 | 3124 | + +The app mock sleeps for exactly 4.0s. The ~20–80 ms above 4000 across c=1–c=32 +is pure NMP+IGW+shim overhead. At c=64 the mock saturates (4 workers × 1 req/4s += 4 RPS ceiling, vs. 64 requested in-flight) and requests queue. + +## p90 — informational only + +p90 is much noisier than p50 across runs. Not gateable with three samples. + +### Δp90, milliseconds + +| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | +|-------|-----:|-----:|-----:|-----:|-----:|-----:|------:| +| Run 1 | 1039 | 1099 | 1162 | 1025 | 911 | 604 | 3009 | +| Run 2 | 1028 | 1115 | 1160 | 1262 | 783 | 641 | 1015 | +| Run 3 | 1023 | 1076 | 1189 | 1085 | 1209 | 18 | 1998 | + +## Observations + +### What's stable enough to gate on + +**c=1, 2, 4, 8.** The Δp50 ranges are 3–35 ms, well under any tolerance we'd +realistically write. The absolute with-guardrails p50 is even tighter (2–15 ms +across three runs). This is the regime where the harness is genuinely measuring +what we want: NMP+middleware overhead on top of fixed-latency mocks. + +### What's borderline + +**c=16.** Δp50 range is 9.3%. Gateable with a generous tolerance (~10%+) but +adds limited signal beyond c=8. + +### What's not gateable + +**c=32.** ~5% Δp50 range. Still bounded, but the run-to-run distance is +several times larger than at c=1–c=8 and the absolute numbers wobble too. + +**c=64.** Unusable. Δp50 swings from +778 to −2896 across three runs. +Root cause is the app mock's 4-worker saturation at this load level: the +without-guardrails path fires app requests as fast as it can and the mock queues +unpredictably. The with-guardrails path's CS-mock work paces requests enough to +hide most of this. This is a test-rig artifact, not an NMP behavior. + +### Side observation: middleware overhead is small + +Of the ~1029 ms Δp50 at c=1: +- ~1000 ms is the two content-safety mock round-trips (0.5s each, mandatory). +- ~29 ms is the middleware's *own* work (rails orchestration, request/response + shaping, etc.) plus bare NMP+IGW overhead delta vs. without-guardrails. + +The without-guardrails baseline of ~4020 ms at c=1 against a 4000 ms mock means +**bare NMP+IGW+shim overhead is ~20 ms** at idle. + +## Recommendation for the CI gate + +Based on the variance data above: + +| Concurrency | Gate Δp50? | Gate absolute with-guardrails p50? | Notes | +|---|---|---|---| +| 1 | yes | yes | tightest signal | +| 2 | yes | yes | | +| 4 | yes | yes | | +| 8 | yes | yes | | +| 16 | informational | informational | record but don't fail | +| 32 | informational | informational | record but don't fail | +| 64 | exclude | exclude | mock saturation, not gateable | + +Proposed tolerance bands (`max(absolute_ms, relative_%)`): +- Δp50: `max(±100 ms, ±5%)` +- with-guardrails p50: `max(±150 ms, ±3%)` + +Both bands are ~3× the observed local run-to-run range, leaving headroom for +CI hardware noise being noisier than a quiet laptop. + +## Open questions / followups + +- **Local baselines won't transfer to CI hardware.** These numbers should seed + the baseline file but be replaced once we have N runs from the actual CI + runner class. +- **Three samples is a small N.** Worth one more local run (Run 4) before we + treat the means above as canonical, but the c=1–c=8 numbers are unlikely + to budge meaningfully. +- **c=64 instability is downstream of NMP.** Hypothesis: app mock's 4 workers + saturate at concurrency 64 (4 RPS ceiling on 4.0s sleep). Easy to test by + running with `--mock-workers 16`. Not blocking the gate work since c=64 is + excluded anyway. diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py index f7ff6729c1..5cbfe70071 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py @@ -60,12 +60,15 @@ def prepare_runtime_aiperf_config( template_path: Path, runtime_config_path: Path, aiperf_output_dir: Path, + model_ref: str | None = None, ) -> dict[str, Any]: """Materialize the AIPerf config this run will use. Reads the checked-in ``template_path`` config, overrides its - ``output_base_dir`` to point inside the current run's directory, and writes - the result to ``runtime_config_path``. AIPerf is later invoked with + ``output_base_dir`` to point inside the current run's directory, optionally + overrides ``base_config.model`` (so the same template can target multiple + VirtualModels in one harness invocation), and writes the result to + ``runtime_config_path``. AIPerf is later invoked with ``--config-file `` so every artifact lands under a separate per-run directory. @@ -82,6 +85,11 @@ def prepare_runtime_aiperf_config( # Point AIPerf's output_base_dir at this run's directory so its results # nest under our per-run artifacts tree. config["output_base_dir"] = str(aiperf_output_dir) + if model_ref is not None: + base_config = config.get("base_config") + if not isinstance(base_config, dict): + raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}") + base_config["model"] = model_ref runtime_config_path.parent.mkdir(parents=True, exist_ok=True) runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8") diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py new file mode 100644 index 0000000000..6262159b8a --- /dev/null +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -0,0 +1,270 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Post-run analyzer for the nemo-guardrails IGW benchmark. + +Reads the ``profile_export_aiperf.csv`` files produced by both benchmark +variants in a single run directory and prints a side-by-side latency table: +``with``, ``without``, and the ``Δ`` per concurrency level. + +The delta is what answers the question "how much latency does the guardrails +middleware add", since the only thing that differs between the two variants +is whether the middleware is attached to the targeted VirtualModel. + +Used two ways: + +* Standalone: + ``python -m nemo_guardrails_plugin.benchmarks.analyze `` +* Auto-invoked from ``run.py`` at the end of a sweep that ran both variants. +""" + +from __future__ import annotations + +import argparse +import csv +import logging +import sys +from dataclasses import dataclass +from pathlib import Path + +from nemo_guardrails_plugin.benchmarks.constants import ( + VARIANT_WITH_GUARDRAILS, + VARIANT_WITHOUT_GUARDRAILS, +) + +log = logging.getLogger(__name__) + +# Concurrency levels we expect from the sweep config. The analyzer only +# reports on levels that exist in both variants; missing rungs are skipped +# silently rather than failing, since a partial run can still be informative. +_EXPECTED_CONCURRENCIES = (1, 2, 4, 8, 16, 32, 64) + +# Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of +# metrics; this one is end-to-end request wall time including the shim hop, +# IGW, middleware, and any mock-NIM round-trips. +_LATENCY_METRIC = "Request Latency (ms)" + + +@dataclass(frozen=True) +class LatencyRow: + """Per-concurrency latency stats parsed from one AIPerf CSV.""" + + concurrency: int + avg: float + p50: float + p90: float + p99: float + std: float + + +@dataclass(frozen=True) +class ComparisonRow: + """Side-by-side comparison of one concurrency level across variants.""" + + concurrency: int + with_guardrails: LatencyRow + without_guardrails: LatencyRow + + @property + def delta_p50(self) -> float: + return self.with_guardrails.p50 - self.without_guardrails.p50 + + @property + def delta_p90(self) -> float: + return self.with_guardrails.p90 - self.without_guardrails.p90 + + @property + def delta_avg(self) -> float: + return self.with_guardrails.avg - self.without_guardrails.avg + + +def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]: + """Load per-concurrency latency stats for one variant. + + Walks the same ``//concurrency/`` layout that + ``collect_sweep_results`` produces. Missing files are logged and skipped + rather than raising, so a partial run still produces a useful table. + """ + if not variant_output_dir.is_dir(): + return {} + + rows: dict[int, LatencyRow] = {} + for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()): + for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()): + for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()): + concurrency = _parse_concurrency_from_label(sweep_dir.name) + if concurrency is None: + continue + csv_path = sweep_dir / "profile_export_aiperf.csv" + row = _read_latency_row(csv_path, concurrency) + if row is not None: + rows[concurrency] = row + return rows + + +def compare( + with_rows: dict[int, LatencyRow], + without_rows: dict[int, LatencyRow], +) -> list[ComparisonRow]: + """Build per-concurrency comparison rows, ordered by concurrency. + + Only concurrencies present in *both* variants are included; an asymmetry + means one variant failed for that level and the comparison is undefined. + """ + shared = sorted(set(with_rows) & set(without_rows)) + return [ComparisonRow(c, with_rows[c], without_rows[c]) for c in shared] + + +def format_table(rows: list[ComparisonRow]) -> str: + """Render the comparison as a fixed-width text table.""" + if not rows: + return "No comparable sweep results found (need both variants to share concurrency levels)." + + header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg") + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" + lines = [fmt.format(*header), "-" * 109] + for r in rows: + lines.append( + fmt.format( + r.concurrency, + f"{r.with_guardrails.p50:.0f}", + f"{r.without_guardrails.p50:.0f}", + f"{r.delta_p50:+.0f}", + f"{r.with_guardrails.p90:.0f}", + f"{r.without_guardrails.p90:.0f}", + f"{r.delta_p90:+.0f}", + f"{r.with_guardrails.avg:.0f}", + f"{r.without_guardrails.avg:.0f}", + f"{r.delta_avg:+.0f}", + ) + ) + lines.append("") + lines.append("All values in milliseconds. 'Δ' = with-guardrails minus without-guardrails.") + return "\n".join(lines) + + +def analyze_run(run_dir: Path) -> str: + """Top-level: read both variants from one run dir and return a printable table.""" + aiperf_dir = run_dir / "aiperf_results" + with_rows = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) + without_rows = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) + + if not with_rows and not without_rows: + return f"No AIPerf results found under {aiperf_dir}" + if not with_rows or not without_rows: + # Single-variant run: dump whichever side is present without trying + # to compute deltas. + only = with_rows or without_rows + which = VARIANT_WITH_GUARDRAILS if with_rows else VARIANT_WITHOUT_GUARDRAILS + return _format_single_variant(which, only) + + rows = compare(with_rows, without_rows) + return format_table(rows) + + +def _format_single_variant(variant: str, rows: dict[int, LatencyRow]) -> str: + """Render one variant's table when the other variant didn't run.""" + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9}" + lines = [ + f"Only one variant present: {variant}", + fmt.format("conc", "avg", "p50", "p90", "std"), + "-" * 49, + ] + for c in sorted(rows): + r = rows[c] + lines.append(fmt.format(c, f"{r.avg:.0f}", f"{r.p50:.0f}", f"{r.p90:.0f}", f"{r.std:.0f}")) + lines.append("") + lines.append("All values in milliseconds.") + return "\n".join(lines) + + +def _parse_concurrency_from_label(label: str) -> int | None: + """Extract the integer N from a sweep label like ``concurrency16``. + + Returns ``None`` for labels that don't match the expected pattern so + unrelated subdirectories (logs, etc.) are skipped silently. + """ + if not label.startswith("concurrency"): + return None + try: + return int(label.removeprefix("concurrency")) + except ValueError: + return None + + +def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None: + """Pull the ``Request Latency (ms)`` line out of an AIPerf CSV. + + AIPerf writes a header row followed by one ``Metric,avg,min,max,sum,p1,...`` + row per metric, then a blank line and a second small block. We only care + about the first block. + """ + if not csv_path.is_file(): + log.debug("Missing CSV at %s; skipping", csv_path) + return None + + try: + with csv_path.open(encoding="utf-8") as f: + reader = csv.reader(f) + header = next(reader, None) + if not header or header[0] != "Metric": + log.warning("Unexpected header in %s: %s", csv_path, header) + return None + try: + col = {name: header.index(name) for name in ("avg", "p50", "p90", "p99", "std")} + except ValueError as exc: + log.warning("Missing expected column in %s: %s", csv_path, exc) + return None + for row in reader: + if not row: + break # end of first block + if row[0] == _LATENCY_METRIC: + return LatencyRow( + concurrency=concurrency, + avg=float(row[col["avg"]]), + p50=float(row[col["p50"]]), + p90=float(row[col["p90"]]), + p99=float(row[col["p99"]]), + std=float(row[col["std"]]), + ) + except (OSError, ValueError, IndexError) as exc: + log.warning("Failed to parse %s: %s", csv_path, exc) + return None + + log.warning("Did not find '%s' row in %s", _LATENCY_METRIC, csv_path) + return None + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="nemo-guardrails-benchmark-analyze", + description=__doc__, + ) + parser.add_argument( + "run_dir", + type=Path, + help=( + "Path to a run directory under " + "`plugins/nemo-guardrails/benchmarks/artifacts/runs//`." + ), + ) + parser.add_argument( + "--log-level", + default="INFO", + choices=("DEBUG", "INFO", "WARNING", "ERROR"), + ) + args = parser.parse_args(argv) + + logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s") + + run_dir: Path = args.run_dir.resolve() + if not run_dir.is_dir(): + print(f"Not a directory: {run_dir}", file=sys.stderr) + return 2 + + print(analyze_run(run_dir)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py index cbe4b19960..14db3561f4 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py @@ -8,6 +8,17 @@ WORKSPACE = "benchmark" GUARDRAIL_CONFIG = "content-safety-local" VM_NAME = "guardrails-vm" +# Control VirtualModel with no middleware attached. Used by the benchmark +# harness to measure NMP+IGW latency *without* the guardrails middleware so +# the with-vs-without delta isolates middleware overhead. +NO_GUARDRAILS_VM_NAME = "no-guardrails-vm" + +# Logical identifiers for the two benchmark variants. Used as subdirectory +# names under `aiperf_results/` and `logs/`, and as the value of the +# harness's `--variant` flag. +VARIANT_WITH_GUARDRAILS = "with-guardrails" +VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" +ALL_VARIANTS = (VARIANT_WITH_GUARDRAILS, VARIANT_WITHOUT_GUARDRAILS) # ModelProvider that proxies requests to the mock main model APP_PROVIDER = "benchmark-app-llm" diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py index b1113fa2c2..80dda5d329 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py @@ -54,6 +54,13 @@ class RunPaths: nmp_data_dir: Path # Checked-in YAML template for the AIPerf sweep config. config_template: Path + # In-repo copies of the upstream mock-LLM `.env` files. We point the + # mock server processes at these (via `--config-file`) so we can change + # mock latency or refusal behavior without touching the NeMo-Guardrails + # checkout, and so the exact mock config a benchmark ran against stays + # versioned alongside our results. + mock_app_env: Path + mock_content_safety_env: Path # Per-run materialized copy of `config_template` with `output_base_dir` # overridden to `aiperf_output_dir`. AIPerf is invoked against this file. runtime_config: Path @@ -75,6 +82,23 @@ def ensure_directories(self) -> None: ): path.mkdir(parents=True, exist_ok=True) + def aiperf_output_dir_for(self, variant: str) -> Path: + """Per-variant AIPerf output dir under ``aiperf_results//``. + + Each variant gets its own subtree so a single run can hold side-by-side + sweeps (e.g. ``with-guardrails`` and ``without-guardrails``) without + their outputs colliding. + """ + return self.aiperf_output_dir / variant + + def runtime_config_for(self, variant: str) -> Path: + """Per-variant materialized AIPerf config under ``generated/``.""" + return self.generated_dir / f"aiperf_config_{variant}.yaml" + + def aiperf_log_for(self, variant: str) -> Path: + """Per-variant AIPerf stdout/stderr log under ``logs/``.""" + return self.log_dir / f"aiperf_{variant}.log" + def _now_run_id() -> str: return dt.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -120,4 +144,6 @@ def build_run_paths( config_template=benchmark_dir / "configs" / "nmp_igw_guardrails_sweep_concurrency.yaml", runtime_config=run_dir / "generated" / "nmp_igw_guardrails_sweep_concurrency.yaml", aiperf_venv_dir=artifacts_dir / "venvs" / "aiperf", + mock_app_env=benchmark_dir / "configs" / "mock_llm" / "app-llm.env", + mock_content_safety_env=benchmark_dir / "configs" / "mock_llm" / "content-safety-llm.env", ) diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py index 47114e0195..e3386b303c 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py @@ -25,21 +25,27 @@ import sys import time from contextlib import ExitStack +from dataclasses import dataclass from pathlib import Path from nemo_guardrails_plugin.benchmarks.aiperf_runner import ( + SweepRunResult, collect_sweep_results, prepare_runtime_aiperf_config, run_aiperf_sweep, ) +from nemo_guardrails_plugin.benchmarks.analyze import analyze_run from nemo_guardrails_plugin.benchmarks.bootstrap import ensure_aiperf_venv from nemo_guardrails_plugin.benchmarks.constants import ( AIPERF_SHIM_BASE_URL, + ALL_VARIANTS, APP_PROVIDER_URL, CS_PROVIDER_URL, IGW_CHAT_PATH, NMP_BASE_URL, NMP_HEALTH_PATH, + VARIANT_WITH_GUARDRAILS, + VARIANT_WITHOUT_GUARDRAILS, WORKSPACE, ) from nemo_guardrails_plugin.benchmarks.paths import ( @@ -66,8 +72,6 @@ Path("benchmark/aiperf/__main__.py"), Path("benchmark/aiperf/run_aiperf.py"), Path("benchmark/mock_llm_server/run_server.py"), - Path("benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env"), - Path("benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env"), Path("examples/configs/content_safety_local/config.yml"), Path("examples/configs/content_safety_local/prompts.yml"), ) @@ -91,6 +95,20 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None: ) +def _validate_in_repo_mock_configs(paths: RunPaths) -> None: + """Fail fast if the in-repo mock LLM env files are missing. + + These configure the upstream mock server but live in this repo (under + `plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) so the harness can + tune mock behavior without forking the upstream checkout. A missing file + here is almost certainly a checkout problem on the NMP side. + """ + missing = [p for p in (paths.mock_app_env, paths.mock_content_safety_env) if not p.is_file()] + if missing: + bullet = "\n - ".join(str(p) for p in missing) + raise FileNotFoundError(f"In-repo mock LLM config files missing:\n - {bullet}") + + def _build_mock_nim_processes(paths: RunPaths, workers: int) -> list[SupervisedProcess]: """Spawn ``python -m benchmark.mock_llm_server.run_server`` for both mocks. @@ -122,20 +140,20 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS, ) + # Both env files are sourced from this repo (see + # plugins/nemo-guardrails/benchmarks/configs/mock_llm/) so mock behavior is + # versioned alongside the benchmark and independent of the upstream checkout. return [ - # Main LLM mock server spec( "mock-app-llm", 8000, - paths.nemoguardrails_repo_root / "benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env", + paths.mock_app_env, health_url=f"{APP_PROVIDER_URL}/health", ), - # Content-safety LLM mock server spec( "mock-content-safety-llm", 8001, - paths.nemoguardrails_repo_root - / "benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env", + paths.mock_content_safety_env, health_url=f"{CS_PROVIDER_URL}/health", ), ] @@ -205,6 +223,109 @@ def _smoke_test(client: NeMoPlatform, seeded: SeededResources) -> None: raise RuntimeError(f"Smoke test failed after 60 attempts: {last_error}") +@dataclass(frozen=True) +class BenchmarkOutcome: + """Outcome of a single benchmark variant's AIPerf sweep, used by the summary.""" + + variant: str + aiperf_exit: int + output_dir: Path + sweep_results: list[SweepRunResult] + + @property + def failures(self) -> int: + return sum(1 for r in self.sweep_results if not r.passed) + + @property + def passed(self) -> bool: + return self.aiperf_exit == 0 and bool(self.sweep_results) and self.failures == 0 + + +def _vm_ref_for_variant(variant: str, seeded: SeededResources) -> str: + """Pick which seeded VirtualModel a benchmark variant should target.""" + if variant == VARIANT_WITH_GUARDRAILS: + return seeded.vm_ref + if variant == VARIANT_WITHOUT_GUARDRAILS: + return seeded.no_guardrails_vm_ref + raise ValueError(f"Unknown variant: {variant!r}") + + +def _run_benchmark( + *, + variant: str, + paths: RunPaths, + seeded: SeededResources, + aiperf_python: Path, +) -> BenchmarkOutcome: + """Materialize a per-variant AIPerf config, run the sweep, collect results.""" + vm_ref = _vm_ref_for_variant(variant, seeded) + runtime_config = paths.runtime_config_for(variant) + aiperf_output_dir = paths.aiperf_output_dir_for(variant) + aiperf_log = paths.aiperf_log_for(variant) + + sweep_config = prepare_runtime_aiperf_config( + template_path=paths.config_template, + runtime_config_path=runtime_config, + aiperf_output_dir=aiperf_output_dir, + model_ref=vm_ref, + ) + log.info( + "Benchmark %s: targeting %s; concurrency=%s, duration=%ss", + variant, + vm_ref, + sweep_config.get("sweeps", {}).get("concurrency"), + sweep_config.get("base_config", {}).get("benchmark_duration"), + ) + log.info( + "Starting AIPerf sweep [%s] against %s -> shim -> %s%s", + variant, + AIPERF_SHIM_BASE_URL, + NMP_BASE_URL, + IGW_CHAT_PATH, + ) + + aiperf_exit = run_aiperf_sweep( + nemoguardrails_repo_root=paths.nemoguardrails_repo_root, + runtime_config=runtime_config, + log_path=aiperf_log, + python_executable=str(aiperf_python), + venv_bin_path=paths.aiperf_venv_dir / "bin", + ) + + sweep_results = collect_sweep_results(aiperf_output_dir) + return BenchmarkOutcome( + variant=variant, + aiperf_exit=aiperf_exit, + output_dir=aiperf_output_dir, + sweep_results=sweep_results, + ) + + +def _summarize_benchmark_results(outcomes: list[BenchmarkOutcome]) -> int: + """Log per-benchmark + overall summary; return process exit code.""" + overall_failed = False + for outcome in outcomes: + if not outcome.sweep_results: + log.error( + "Benchmark %s: aiperf exited with code %d and produced no per-sweep results in %s", + outcome.variant, + outcome.aiperf_exit, + outcome.output_dir, + ) + else: + log.info( + "Benchmark %s: %d run(s), %d failure(s); per-sweep outputs under %s", + outcome.variant, + len(outcome.sweep_results), + outcome.failures, + outcome.output_dir, + ) + if not outcome.passed: + overall_failed = True + + return 1 if overall_failed else 0 + + def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser( prog="nemo-guardrails-benchmark", @@ -244,10 +365,30 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=None, help="Override the per-run directory name (default: current timestamp).", ) + parser.add_argument( + "--variant", + choices=(*ALL_VARIANTS, "all"), + default="all", + help=( + "Which sweep to run: 'with-guardrails' targets the VirtualModel with " + "middleware attached; 'without-guardrails' targets a control VM with " + "no middleware; 'all' (default) runs them sequentially against the " + "same NMP so the with-vs-without delta isolates middleware overhead. " + "In CI, run two parallel jobs with --variant=with-guardrails and " + "--variant=without-guardrails against isolated NMP instances." + ), + ) parser.add_argument("--verbose", "-v", action="store_true") return parser.parse_args(argv) +def _resolve_variants(variant_arg: str) -> tuple[str, ...]: + """Translate the ``--variant`` CLI argument into the ordered list to run.""" + if variant_arg == "all": + return ALL_VARIANTS + return (variant_arg,) + + def main(argv: list[str] | None = None) -> int: args = parse_args(argv) _configure_logging(args.verbose) @@ -266,20 +407,18 @@ def main(argv: list[str] | None = None) -> int: run_id=args.run_id, ) paths.ensure_directories() + _validate_in_repo_mock_configs(paths) log.info("Created directory for benchmark results at: %s", paths.run_dir) - - sweep_config = prepare_runtime_aiperf_config( - template_path=paths.config_template, - runtime_config_path=paths.runtime_config, - aiperf_output_dir=paths.aiperf_output_dir, - ) log.info( - "AIPerf sweep: concurrency=%s, duration=%ss", - sweep_config.get("sweeps", {}).get("concurrency"), - sweep_config.get("base_config", {}).get("benchmark_duration"), + "Mock LLM configs: app=%s, content-safety=%s", + paths.mock_app_env, + paths.mock_content_safety_env, ) + variants = _resolve_variants(args.variant) + log.info("Will run %d variant(s): %s", len(variants), ", ".join(variants)) + # Ensure the dedicated aiperf venv exists *before* we start any supervised # processes. aiperf_python = ensure_aiperf_venv(paths.aiperf_venv_dir) @@ -318,43 +457,44 @@ def main(argv: list[str] | None = None) -> int: log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref) _smoke_test(client, seeded) - log.info( - "Starting AIPerf sweep against %s -> shim -> %s%s", - AIPERF_SHIM_BASE_URL, - NMP_BASE_URL, - IGW_CHAT_PATH, - ) - aiperf_exit = run_aiperf_sweep( - nemoguardrails_repo_root=paths.nemoguardrails_repo_root, - runtime_config=paths.runtime_config, - log_path=paths.log_dir / "aiperf.log", - python_executable=str(aiperf_python), - venv_bin_path=paths.aiperf_venv_dir / "bin", - ) + # Run each requested benchmark variant sequentially against the same + # NMP. The bootstrap (mocks, NMP, shim) is shared so the only thing + # that differs between variants is which VirtualModel AIPerf targets, + # which is exactly what makes the with-vs-without delta interpretable + # as middleware overhead. + outcomes: list[BenchmarkOutcome] = [] + for variant in variants: + outcomes.append( + _run_benchmark( + variant=variant, + paths=paths, + seeded=seeded, + aiperf_python=aiperf_python, + ) + ) - sweep_results = collect_sweep_results(paths.aiperf_output_dir) - failures = sum(1 for r in sweep_results if not r.passed) + exit_code = _summarize_benchmark_results(outcomes) + _maybe_print_analysis(paths.run_dir, outcomes) + return exit_code - if not sweep_results: - # AIPerf exited before producing any per-sweep dirs. Surface that - # explicitly so the log isn't ambiguous about why we're failing. - log.error( - "aiperf exited with code %d and produced no per-sweep results in %s", - aiperf_exit, - paths.aiperf_output_dir, - ) - else: - log.info( - "Sweep summary: %d run(s), %d failure(s); per-sweep outputs under %s", - len(sweep_results), - failures, - paths.aiperf_output_dir, - ) - if failures or aiperf_exit != 0 or not sweep_results: - return 1 +def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None: + """Print the with-vs-without comparison table when it's meaningful. - return 0 + Only invoked when at least one variant produced results, since the + analyzer can also dump a single-variant table for partial runs. The call + is wrapped in a broad try/except: the analyzer is pure post-processing + over already-written artifacts, so a bug here must not change the + harness's exit code or hide a real benchmark failure. + """ + if not any(o.sweep_results for o in outcomes): + return + try: + report = analyze_run(run_dir) + except Exception as exc: + log.warning("Analyzer failed; skipping summary table: %s", exc) + return + log.info("Benchmark analysis:\n%s", report) if __name__ == "__main__": diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py index 96cd152f0f..97f9b0df0f 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py @@ -28,6 +28,7 @@ GUARDRAIL_CONFIG, GUARDRAILS_MIDDLEWARE_CONFIG_TYPE, GUARDRAILS_MIDDLEWARE_NAME, + NO_GUARDRAILS_VM_NAME, VM_NAME, WORKSPACE, ) @@ -52,11 +53,19 @@ class SeededResources: cs_model_entity: str guardrail_config_name: str vm_name: str + # Sibling VM with no middleware attached, used as the control variant by + # the benchmark harness. Same default_model_entity and models list as the + # guardrails VM so the only difference is middleware attachment. + no_guardrails_vm_name: str @property def vm_ref(self) -> str: return f"{self.workspace}/{self.vm_name}" + @property + def no_guardrails_vm_ref(self) -> str: + return f"{self.workspace}/{self.no_guardrails_vm_name}" + @property def guardrail_config_ref(self) -> str: return f"{self.workspace}/{self.guardrail_config_name}" @@ -171,6 +180,22 @@ def seed_benchmark( ) _dump_model(generated_dir / "virtual_model.json", vm) + # Control VirtualModel: identical to the guardrails VM except no + # middleware. The harness uses this to measure NMP+IGW latency without + # the guardrails middleware so the with-vs-without delta isolates + # middleware overhead. + log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME) + no_guardrails_vm = client.inference.virtual_models.create( + workspace=WORKSPACE, + name=NO_GUARDRAILS_VM_NAME, + default_model_entity=app_entity, + models=vm_models, + request_middleware=[], + response_middleware=[], + exist_ok=True, + ) + _dump_model(generated_dir / "virtual_model_no_guardrails.json", no_guardrails_vm) + return SeededResources( workspace=WORKSPACE, app_provider_name=APP_PROVIDER, @@ -179,6 +204,7 @@ def seed_benchmark( cs_model_entity=cs_entity, guardrail_config_name=GUARDRAIL_CONFIG, vm_name=VM_NAME, + no_guardrails_vm_name=NO_GUARDRAILS_VM_NAME, ) From f78ffa8a890f38b98c7f5c36cbd9d3270a83f73c Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Mon, 22 Jun 2026 09:54:10 -0400 Subject: [PATCH 2/8] Fix CI Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 2 +- .../benchmarks/analyze.py | 5 +-- .../tests/unit/benchmarks/test_seeding.py | 31 ++++++++++++++----- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 40d8689002..30117b985b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1177,7 +1177,7 @@ jobs: # is defensive in case that ever stops being true (e.g. an artifact # ever leaks in from a previous workflow attempt). run: | - RUN_DIR=$(ls -td plugins/nemo-guardrails/benchmarks/artifacts/runs/*/ | head -1) + RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) echo "Analyzing run directory: $RUN_DIR" uv run --frozen --package nemo-guardrails-plugin --extra bench \ python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR" diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py index 6262159b8a..54fa91ec50 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -243,10 +243,7 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument( "run_dir", type=Path, - help=( - "Path to a run directory under " - "`plugins/nemo-guardrails/benchmarks/artifacts/runs//`." - ), + help=("Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs//`."), ) parser.add_argument( "--log-level", diff --git a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py index f0ad57aeaa..8342315e13 100644 --- a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py +++ b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py @@ -14,6 +14,7 @@ CS_MODEL_NAME, CS_PROVIDER, GUARDRAIL_CONFIG, + NO_GUARDRAILS_VM_NAME, VM_NAME, WORKSPACE, ) @@ -145,12 +146,18 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path cs_entity = seeded.cs_model_entity assert gc_call.kwargs["data"]["models"][0]["model"] == cs_entity - # VirtualModel uses the discovered app entity and points middleware at the - # guardrail config we just created. - vm_call = fake_client.inference.virtual_models.create.call_args - assert vm_call.kwargs["name"] == VM_NAME - assert vm_call.kwargs["default_model_entity"] == seeded.app_model_entity - assert vm_call.kwargs["models"] == [{"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}] + # Two VirtualModels are created: the guardrails VM (with middleware) and + # a control VM (no middleware) used by the without-guardrails benchmark + # variant. + vm_calls = fake_client.inference.virtual_models.create.call_args_list + assert len(vm_calls) == 2 + + guardrails_vm_call = vm_calls[0] + assert guardrails_vm_call.kwargs["name"] == VM_NAME + assert guardrails_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity + assert guardrails_vm_call.kwargs["models"] == [ + {"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"} + ] expected_middleware = [ { "name": "nemo-guardrails", @@ -158,8 +165,14 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path "config_id": f"{WORKSPACE}/{GUARDRAIL_CONFIG}", } ] - assert vm_call.kwargs["request_middleware"] == expected_middleware - assert vm_call.kwargs["response_middleware"] == expected_middleware + assert guardrails_vm_call.kwargs["request_middleware"] == expected_middleware + assert guardrails_vm_call.kwargs["response_middleware"] == expected_middleware + + control_vm_call = vm_calls[1] + assert control_vm_call.kwargs["name"] == NO_GUARDRAILS_VM_NAME + assert control_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity + assert control_vm_call.kwargs["request_middleware"] == [] + assert control_vm_call.kwargs["response_middleware"] == [] def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path: Path) -> None: ng_root = tmp_path / "NeMo-Guardrails" @@ -176,6 +189,7 @@ def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path assert (generated_dir / "app_provider.json").is_file() assert (generated_dir / "content_safety_provider.json").is_file() assert (generated_dir / "virtual_model.json").is_file() + assert (generated_dir / "virtual_model_no_guardrails.json").is_file() request_payload = json.loads( (generated_dir / "content_safety_local_nmp_request.json").read_text(encoding="utf-8") @@ -197,6 +211,7 @@ def test_returns_seeded_resources(self, fake_client: MagicMock, tmp_path: Path) assert seeded.workspace == WORKSPACE assert seeded.vm_ref == f"{WORKSPACE}/{VM_NAME}" + assert seeded.no_guardrails_vm_name == NO_GUARDRAILS_VM_NAME assert seeded.guardrail_config_ref == f"{WORKSPACE}/{GUARDRAIL_CONFIG}" def test_raises_if_served_models_never_populated(self, tmp_path: Path) -> None: From 78b7730f75232f9e7d8eb43cea96f4cd2d3d977f Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Mon, 22 Jun 2026 11:22:24 -0400 Subject: [PATCH 3/8] always run benchmark Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 35 ++++---- .../benchmarks/analyze.py | 81 ++++++++++++------- 2 files changed, 67 insertions(+), 49 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 30117b985b..4181069c1a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1078,7 +1078,9 @@ jobs: # single-job sequential layout. The `benchmark-guardrails-analyze` job # below merges both artifacts and prints the with-vs-without comparison. name: Guardrails plugin benchmark (${{ matrix.variant }}) - if: github.event_name == 'workflow_dispatch' + # TODO(jgulabrai): gate back to `workflow_dispatch` once we're done + # collecting baseline samples and the harness is stable enough that we + # don't need it firing on every PR push. runs-on: ubuntu-latest timeout-minutes: 30 strategy: @@ -1141,7 +1143,7 @@ jobs: # CI-collected numbers we can use to seed a baseline file later. name: Guardrails benchmark analysis needs: [benchmark-guardrails] - if: github.event_name == 'workflow_dispatch' && !cancelled() + if: ${{ !cancelled() }} runs-on: ubuntu-latest timeout-minutes: 10 steps: @@ -1150,37 +1152,34 @@ jobs: with: path: nemo-platform - name: Download with-guardrails artifact + # If a variant failed entirely it may have uploaded no artifact; + # the analyzer handles the single-variant case so don't fail here. + continue-on-error: true uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: benchmark-guardrails-results-with-guardrails path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - name: Download without-guardrails artifact + continue-on-error: true uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: name: benchmark-guardrails-results-without-guardrails path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - - name: Install uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 - with: - working-directory: nemo-platform - python-version: "3.11" - enable-cache: true - - name: Bootstrap Python environment - working-directory: nemo-platform - run: make bootstrap-python - env: - PYTORCH_DEPS: cpu - name: Print benchmark comparison working-directory: nemo-platform + # `analyze.py` is intentionally stdlib-only and import-free w.r.t. the + # rest of the plugin, so we run it on the runner's stock python3 + # rather than spinning up the full `uv` workspace (which costs ~3 + # minutes per CI run for ~50ms of CSV parsing). + # # Both matrix jobs above use the same `--run-id`, so there should be - # exactly one merged run directory under `runs/`. `ls -td ... | head -1` - # is defensive in case that ever stops being true (e.g. an artifact - # ever leaks in from a previous workflow attempt). + # exactly one merged run directory under `runs/`. The `find | sort` + # pipeline is defensive in case that ever stops being true. + # NOTE: `find -printf` is GNU-only (works on ubuntu-latest, not macOS). run: | RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) echo "Analyzing run directory: $RUN_DIR" - uv run --frozen --package nemo-guardrails-plugin --extra bench \ - python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR" + python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" - name: Upload merged benchmark artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py index 54fa91ec50..1a31dff306 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -27,18 +27,15 @@ from dataclasses import dataclass from pathlib import Path -from nemo_guardrails_plugin.benchmarks.constants import ( - VARIANT_WITH_GUARDRAILS, - VARIANT_WITHOUT_GUARDRAILS, -) +# Variant identifiers are duplicated from `constants.py` (rather than imported) +# so this module has zero intra-repo imports and can run on bare `python3` in +# CI without the full `uv`/`make bootstrap-python` setup. Keep these in sync +# with `constants.VARIANT_WITH_GUARDRAILS` / `VARIANT_WITHOUT_GUARDRAILS`. +VARIANT_WITH_GUARDRAILS = "with-guardrails" +VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" log = logging.getLogger(__name__) -# Concurrency levels we expect from the sweep config. The analyzer only -# reports on levels that exist in both variants; missing rungs are skipped -# silently rather than failing, since a partial run can still be informative. -_EXPECTED_CONCURRENCIES = (1, 2, 4, 8, 16, 32, 64) - # Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of # metrics; this one is end-to-end request wall time including the shim hop, # IGW, middleware, and any mock-NIM round-trips. @@ -84,11 +81,13 @@ def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]: Walks the same ``//concurrency/`` layout that ``collect_sweep_results`` produces. Missing files are logged and skipped rather than raising, so a partial run still produces a useful table. + + Returns a mapping of ``concurrency_level -> LatencyRow``. """ if not variant_output_dir.is_dir(): return {} - rows: dict[int, LatencyRow] = {} + latency_by_concurrency: dict[int, LatencyRow] = {} for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()): for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()): for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()): @@ -98,21 +97,39 @@ def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]: csv_path = sweep_dir / "profile_export_aiperf.csv" row = _read_latency_row(csv_path, concurrency) if row is not None: - rows[concurrency] = row - return rows + latency_by_concurrency[concurrency] = row + return latency_by_concurrency def compare( - with_rows: dict[int, LatencyRow], - without_rows: dict[int, LatencyRow], + latency_by_concurrency_with_guardrails: dict[int, LatencyRow], + latency_by_concurrency_without_guardrails: dict[int, LatencyRow], ) -> list[ComparisonRow]: """Build per-concurrency comparison rows, ordered by concurrency. Only concurrencies present in *both* variants are included; an asymmetry means one variant failed for that level and the comparison is undefined. + Asymmetric levels are logged at WARNING so silent drops are visible. """ - shared = sorted(set(with_rows) & set(without_rows)) - return [ComparisonRow(c, with_rows[c], without_rows[c]) for c in shared] + concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails) + concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails) + concurrencies_in_both_variants = sorted(concurrencies_with_guardrails & concurrencies_without_guardrails) + + concurrencies_in_only_one_variant = sorted(concurrencies_with_guardrails ^ concurrencies_without_guardrails) + if concurrencies_in_only_one_variant: + log.warning( + "Concurrency levels present in only one variant, excluded from comparison: %s", + concurrencies_in_only_one_variant, + ) + + return [ + ComparisonRow( + concurrency, + latency_by_concurrency_with_guardrails[concurrency], + latency_by_concurrency_without_guardrails[concurrency], + ) + for concurrency in concurrencies_in_both_variants + ] def format_table(rows: list[ComparisonRow]) -> str: @@ -122,7 +139,8 @@ def format_table(rows: list[ComparisonRow]) -> str: header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg") fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" - lines = [fmt.format(*header), "-" * 109] + header_line = fmt.format(*header) + lines = [header_line, "-" * len(header_line)] for r in rows: lines.append( fmt.format( @@ -146,33 +164,34 @@ def format_table(rows: list[ComparisonRow]) -> str: def analyze_run(run_dir: Path) -> str: """Top-level: read both variants from one run dir and return a printable table.""" aiperf_dir = run_dir / "aiperf_results" - with_rows = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) - without_rows = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) + latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) + latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) - if not with_rows and not without_rows: + if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails: return f"No AIPerf results found under {aiperf_dir}" - if not with_rows or not without_rows: + if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails: # Single-variant run: dump whichever side is present without trying # to compute deltas. - only = with_rows or without_rows - which = VARIANT_WITH_GUARDRAILS if with_rows else VARIANT_WITHOUT_GUARDRAILS - return _format_single_variant(which, only) + if latency_by_concurrency_with_guardrails: + return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails) + return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails) - rows = compare(with_rows, without_rows) + rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails) return format_table(rows) -def _format_single_variant(variant: str, rows: dict[int, LatencyRow]) -> str: +def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str: """Render one variant's table when the other variant didn't run.""" fmt = "{:>4} {:>9} {:>9} {:>9} {:>9}" + header_line = fmt.format("conc", "avg", "p50", "p90", "std") lines = [ f"Only one variant present: {variant}", - fmt.format("conc", "avg", "p50", "p90", "std"), - "-" * 49, + header_line, + "-" * len(header_line), ] - for c in sorted(rows): - r = rows[c] - lines.append(fmt.format(c, f"{r.avg:.0f}", f"{r.p50:.0f}", f"{r.p90:.0f}", f"{r.std:.0f}")) + for concurrency in sorted(latency_by_concurrency): + row = latency_by_concurrency[concurrency] + lines.append(fmt.format(concurrency, f"{row.avg:.0f}", f"{row.p50:.0f}", f"{row.p90:.0f}", f"{row.std:.0f}")) lines.append("") lines.append("All values in milliseconds.") return "\n".join(lines) From 2acf55bc4817e6e03bda18e1354bf68cd1d59156 Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Mon, 22 Jun 2026 12:24:26 -0400 Subject: [PATCH 4/8] Cleanup Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 37 ++---- .../benchmarks/aiperf_runner.py | 15 +-- .../benchmarks/analyze.py | 124 ++++++++++++------ .../benchmarks/paths.py | 14 +- .../nemo_guardrails_plugin/benchmarks/run.py | 36 ++--- .../benchmarks/seeding.py | 10 +- 6 files changed, 114 insertions(+), 122 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4181069c1a..37d5864c9f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1072,20 +1072,15 @@ jobs: path: web/packages/studio/playwright-report/ benchmark-guardrails: - # Run the two benchmark variants as parallel matrix jobs, each with its - # own NMP instance. This isolates them from each other (no shared mocks, - # no cross-talk on :8080) and roughly halves wall-clock vs. the previous - # single-job sequential layout. The `benchmark-guardrails-analyze` job - # below merges both artifacts and prints the with-vs-without comparison. + # Parallel matrix jobs (one NMP per variant) so the two sweeps don't + # share mocks or contend on :8080. `benchmark-guardrails-analyze` merges + # the artifacts and prints the comparison. name: Guardrails plugin benchmark (${{ matrix.variant }}) - # TODO(jgulabrai): gate back to `workflow_dispatch` once we're done - # collecting baseline samples and the harness is stable enough that we - # don't need it firing on every PR push. + # TODO(jgulabrai): gate back to `workflow_dispatch` once we have a CI baseline. runs-on: ubuntu-latest timeout-minutes: 30 strategy: - # Don't cancel the other variant if one fails; the partial artifact - # is still useful for diagnosing what went wrong. + # Keep the partial artifact if one variant fails. fail-fast: false matrix: variant: [with-guardrails, without-guardrails] @@ -1129,18 +1124,14 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - # Per-variant artifact name; GHA disallows two artifacts with the - # same name in one workflow run. + # Ensure we use a unique artifact name per benchmark vaiant. name: benchmark-guardrails-results-${{ matrix.variant }} retention-days: 30 path: | nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ benchmark-guardrails-analyze: - # Joins the two parallel matrix jobs above, downloads both artifacts into - # one merged `runs//` tree, and prints the with-vs-without - # comparison table. No regression gate yet: this exists to produce - # CI-collected numbers we can use to seed a baseline file later. + # Merge both variant artifacts and print the comparison table. name: Guardrails benchmark analysis needs: [benchmark-guardrails] if: ${{ !cancelled() }} @@ -1167,15 +1158,8 @@ jobs: path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - name: Print benchmark comparison working-directory: nemo-platform - # `analyze.py` is intentionally stdlib-only and import-free w.r.t. the - # rest of the plugin, so we run it on the runner's stock python3 - # rather than spinning up the full `uv` workspace (which costs ~3 - # minutes per CI run for ~50ms of CSV parsing). - # - # Both matrix jobs above use the same `--run-id`, so there should be - # exactly one merged run directory under `runs/`. The `find | sort` - # pipeline is defensive in case that ever stops being true. - # NOTE: `find -printf` is GNU-only (works on ubuntu-latest, not macOS). + # `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap + # step and run it with the runner's `python3` CLI directly. run: | RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) echo "Analyzing run directory: $RUN_DIR" @@ -1184,8 +1168,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - # Single merged artifact so collecting baseline samples is a matter - # of downloading one thing per workflow run rather than two. + # Single artifact so baseline collection is one download per run. name: benchmark-guardrails-results-merged retention-days: 30 path: | diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py index 5cbfe70071..20963e11f3 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py @@ -64,16 +64,11 @@ def prepare_runtime_aiperf_config( ) -> dict[str, Any]: """Materialize the AIPerf config this run will use. - Reads the checked-in ``template_path`` config, overrides its - ``output_base_dir`` to point inside the current run's directory, optionally - overrides ``base_config.model`` (so the same template can target multiple - VirtualModels in one harness invocation), and writes the result to - ``runtime_config_path``. AIPerf is later invoked with - ``--config-file `` so every artifact lands under a - separate per-run directory. - - Returns the parsed config dict so callers can log fields (sweep params, - benchmark_duration) without re-reading the file. + Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf + artifacts nest under this run) and optionally ``base_config.model`` + (so one template can target multiple VirtualModels), and writes the + result to ``runtime_config_path``. Returns the parsed config so + callers can log sweep params without re-reading the file. """ if not template_path.is_file(): raise FileNotFoundError(f"AIPerf template not found: {template_path}") diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py index 1a31dff306..60545a70b1 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -3,19 +3,13 @@ """Post-run analyzer for the nemo-guardrails IGW benchmark. -Reads the ``profile_export_aiperf.csv`` files produced by both benchmark -variants in a single run directory and prints a side-by-side latency table: -``with``, ``without``, and the ``Δ`` per concurrency level. +Reads ``profile_export_aiperf.csv`` files from both variants in one run dir +and prints a with-vs-without latency comparison. The delta isolates +middleware overhead since the only difference between variants is whether +middleware is attached to the targeted VirtualModel. -The delta is what answers the question "how much latency does the guardrails -middleware add", since the only thing that differs between the two variants -is whether the middleware is attached to the targeted VirtualModel. - -Used two ways: - -* Standalone: - ``python -m nemo_guardrails_plugin.benchmarks.analyze `` -* Auto-invoked from ``run.py`` at the end of a sweep that ran both variants. +Used both as a script (``python -m ... ``) and auto-invoked from +``run.py`` after a multi-variant sweep. """ from __future__ import annotations @@ -27,20 +21,26 @@ from dataclasses import dataclass from pathlib import Path -# Variant identifiers are duplicated from `constants.py` (rather than imported) -# so this module has zero intra-repo imports and can run on bare `python3` in -# CI without the full `uv`/`make bootstrap-python` setup. Keep these in sync -# with `constants.VARIANT_WITH_GUARDRAILS` / `VARIANT_WITHOUT_GUARDRAILS`. +# Duplicated from `constants.py` so this module stays import-free and can +# run on bare `python3` in CI without bootstrapping the uv workspace. VARIANT_WITH_GUARDRAILS = "with-guardrails" VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" log = logging.getLogger(__name__) -# Metric we read from `profile_export_aiperf.csv`. AIPerf reports a number of -# metrics; this one is end-to-end request wall time including the shim hop, -# IGW, middleware, and any mock-NIM round-trips. _LATENCY_METRIC = "Request Latency (ms)" +# Mock-LLM time per request, subtracted to isolate platform overhead. Mirrors +# `E2E_LATENCY_MEAN_SECONDS` in configs/mock_llm/*.env and the 2 CS calls +# (input + output rails) of `content_safety_local`. Update in lock-step. +_APP_MOCK_LATENCY_MS = 4000.0 +_CONTENT_SAFETY_MOCK_LATENCY_MS = 500.0 +_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST = 2 +_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS = _APP_MOCK_LATENCY_MS +_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS = ( + _APP_MOCK_LATENCY_MS + _CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST * _CONTENT_SAFETY_MOCK_LATENCY_MS +) + @dataclass(frozen=True) class LatencyRow: @@ -78,11 +78,9 @@ def delta_avg(self) -> float: def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]: """Load per-concurrency latency stats for one variant. - Walks the same ``//concurrency/`` layout that - ``collect_sweep_results`` produces. Missing files are logged and skipped - rather than raising, so a partial run still produces a useful table. - - Returns a mapping of ``concurrency_level -> LatencyRow``. + Walks the ``//concurrency/`` layout produced by + ``collect_sweep_results``. Missing CSVs are skipped, not raised, so + partial runs still produce a table. """ if not variant_output_dir.is_dir(): return {} @@ -105,11 +103,10 @@ def compare( latency_by_concurrency_with_guardrails: dict[int, LatencyRow], latency_by_concurrency_without_guardrails: dict[int, LatencyRow], ) -> list[ComparisonRow]: - """Build per-concurrency comparison rows, ordered by concurrency. + """Build per-concurrency comparison rows, sorted by concurrency. - Only concurrencies present in *both* variants are included; an asymmetry - means one variant failed for that level and the comparison is undefined. - Asymmetric levels are logged at WARNING so silent drops are visible. + Only levels present in both variants are compared; asymmetric levels are + logged at WARNING and excluded. """ concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails) concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails) @@ -161,8 +158,60 @@ def format_table(rows: list[ComparisonRow]) -> str: return "\n".join(lines) +def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: + """Render a table with mock-LLM time subtracted from p50/p90/avg. + + Isolates NMP + IGW + shim + middleware overhead from the much larger + mock sleeps. The Δ columns are the middleware's own cost over the bare + path. + """ + if not rows: + return "No comparable sweep results found (need both variants to share concurrency levels)." + + header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg") + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" + header_line = fmt.format(*header) + lines = [header_line, "-" * len(header_line)] + + for r in rows: + with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_p50 = r.without_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + with_p90 = r.with_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_p90 = r.without_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + with_avg = r.with_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_avg = r.without_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + lines.append( + fmt.format( + r.concurrency, + f"{with_p50:+.0f}", + f"{without_p50:+.0f}", + f"{with_p50 - without_p50:+.0f}", + f"{with_p90:+.0f}", + f"{without_p90:+.0f}", + f"{with_p90 - without_p90:+.0f}", + f"{with_avg:+.0f}", + f"{without_avg:+.0f}", + f"{with_avg - without_avg:+.0f}", + ) + ) + lines.append("") + lines.append( + "All values in milliseconds, with mock-LLM time subtracted " + f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms = 1× app + " + f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; " + f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)." + ) + lines.append("'Δ' columns are the middleware's own overhead over the bare NMP+IGW path.") + return "\n".join(lines) + + def analyze_run(run_dir: Path) -> str: - """Top-level: read both variants from one run dir and return a printable table.""" + """Read both variants from one run dir and return a printable report. + + Output is the raw comparison table followed by a platform-overhead table + (mock time subtracted). Falls back to a single-variant table if only one + variant has results. + """ aiperf_dir = run_dir / "aiperf_results" latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) @@ -170,14 +219,12 @@ def analyze_run(run_dir: Path) -> str: if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails: return f"No AIPerf results found under {aiperf_dir}" if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails: - # Single-variant run: dump whichever side is present without trying - # to compute deltas. if latency_by_concurrency_with_guardrails: return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails) return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails) rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails) - return format_table(rows) + return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}" def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str: @@ -198,11 +245,7 @@ def _format_single_variant(variant: str, latency_by_concurrency: dict[int, Laten def _parse_concurrency_from_label(label: str) -> int | None: - """Extract the integer N from a sweep label like ``concurrency16``. - - Returns ``None`` for labels that don't match the expected pattern so - unrelated subdirectories (logs, etc.) are skipped silently. - """ + """Extract N from a sweep label like ``concurrency16``; ``None`` otherwise.""" if not label.startswith("concurrency"): return None try: @@ -212,12 +255,7 @@ def _parse_concurrency_from_label(label: str) -> int | None: def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None: - """Pull the ``Request Latency (ms)`` line out of an AIPerf CSV. - - AIPerf writes a header row followed by one ``Metric,avg,min,max,sum,p1,...`` - row per metric, then a blank line and a second small block. We only care - about the first block. - """ + """Pull the ``Request Latency (ms)`` row from an AIPerf CSV's first block.""" if not csv_path.is_file(): log.debug("Missing CSV at %s; skipping", csv_path) return None diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py index 80dda5d329..00af57580a 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py @@ -54,11 +54,8 @@ class RunPaths: nmp_data_dir: Path # Checked-in YAML template for the AIPerf sweep config. config_template: Path - # In-repo copies of the upstream mock-LLM `.env` files. We point the - # mock server processes at these (via `--config-file`) so we can change - # mock latency or refusal behavior without touching the NeMo-Guardrails - # checkout, and so the exact mock config a benchmark ran against stays - # versioned alongside our results. + # In-repo mock-LLM `.env` files. Versioned with the benchmark so mock + # behavior is independent of the NeMo-Guardrails checkout. mock_app_env: Path mock_content_safety_env: Path # Per-run materialized copy of `config_template` with `output_base_dir` @@ -83,12 +80,7 @@ def ensure_directories(self) -> None: path.mkdir(parents=True, exist_ok=True) def aiperf_output_dir_for(self, variant: str) -> Path: - """Per-variant AIPerf output dir under ``aiperf_results//``. - - Each variant gets its own subtree so a single run can hold side-by-side - sweeps (e.g. ``with-guardrails`` and ``without-guardrails``) without - their outputs colliding. - """ + """Per-variant AIPerf output dir; keeps side-by-side sweeps from colliding.""" return self.aiperf_output_dir / variant def runtime_config_for(self, variant: str) -> Path: diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py index e3386b303c..2fc8e7421e 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py @@ -98,10 +98,8 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None: def _validate_in_repo_mock_configs(paths: RunPaths) -> None: """Fail fast if the in-repo mock LLM env files are missing. - These configure the upstream mock server but live in this repo (under - `plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) so the harness can - tune mock behavior without forking the upstream checkout. A missing file - here is almost certainly a checkout problem on the NMP side. + These live in this repo (not upstream) so we control mock behavior + independently of the NeMo-Guardrails checkout. """ missing = [p for p in (paths.mock_app_env, paths.mock_content_safety_env) if not p.is_file()] if missing: @@ -140,9 +138,7 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS, ) - # Both env files are sourced from this repo (see - # plugins/nemo-guardrails/benchmarks/configs/mock_llm/) so mock behavior is - # versioned alongside the benchmark and independent of the upstream checkout. + # Env files come from this repo, rather than the upstream library. return [ spec( "mock-app-llm", @@ -370,12 +366,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: choices=(*ALL_VARIANTS, "all"), default="all", help=( - "Which sweep to run: 'with-guardrails' targets the VirtualModel with " - "middleware attached; 'without-guardrails' targets a control VM with " - "no middleware; 'all' (default) runs them sequentially against the " - "same NMP so the with-vs-without delta isolates middleware overhead. " - "In CI, run two parallel jobs with --variant=with-guardrails and " - "--variant=without-guardrails against isolated NMP instances." + "Which sweep to run. 'all' (default) runs both variants sequentially " + "against the same NMP; the with-vs-without delta isolates middleware " + "overhead. In CI, run the two variants as parallel jobs against " + "separate NMP instances." ), ) parser.add_argument("--verbose", "-v", action="store_true") @@ -457,11 +451,8 @@ def main(argv: list[str] | None = None) -> int: log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref) _smoke_test(client, seeded) - # Run each requested benchmark variant sequentially against the same - # NMP. The bootstrap (mocks, NMP, shim) is shared so the only thing - # that differs between variants is which VirtualModel AIPerf targets, - # which is exactly what makes the with-vs-without delta interpretable - # as middleware overhead. + # Variants run sequentially against the same NMP; only the targeted + # VirtualModel differs, so the delta isolates middleware overhead. outcomes: list[BenchmarkOutcome] = [] for variant in variants: outcomes.append( @@ -479,13 +470,10 @@ def main(argv: list[str] | None = None) -> int: def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None: - """Print the with-vs-without comparison table when it's meaningful. + """Print the analyzer's comparison table when at least one variant has results. - Only invoked when at least one variant produced results, since the - analyzer can also dump a single-variant table for partial runs. The call - is wrapped in a broad try/except: the analyzer is pure post-processing - over already-written artifacts, so a bug here must not change the - harness's exit code or hide a real benchmark failure. + Wrapped in a broad try/except: the analyzer is post-processing only and + must not change the harness's exit code or hide a real benchmark failure. """ if not any(o.sweep_results for o in outcomes): return diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py index 97f9b0df0f..cde4f31b3f 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py @@ -53,9 +53,7 @@ class SeededResources: cs_model_entity: str guardrail_config_name: str vm_name: str - # Sibling VM with no middleware attached, used as the control variant by - # the benchmark harness. Same default_model_entity and models list as the - # guardrails VM so the only difference is middleware attachment. + # Control VM with no middleware; otherwise identical to the guardrails VM. no_guardrails_vm_name: str @property @@ -180,10 +178,8 @@ def seed_benchmark( ) _dump_model(generated_dir / "virtual_model.json", vm) - # Control VirtualModel: identical to the guardrails VM except no - # middleware. The harness uses this to measure NMP+IGW latency without - # the guardrails middleware so the with-vs-without delta isolates - # middleware overhead. + # Control VM: identical to the guardrails VM but no middleware, so the + # with-vs-without delta isolates middleware overhead. log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME) no_guardrails_vm = client.inference.virtual_models.create( workspace=WORKSPACE, From f0fef0a24a313a051f4d107e89a57d656b7a8151 Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Tue, 23 Jun 2026 09:27:22 -0400 Subject: [PATCH 5/8] Updates Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 2 +- plugins/nemo-guardrails/benchmarks/README.md | 80 +++++++- .../benchmarks/analyze.py | 176 +++++++++++++++++- 3 files changed, 241 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 37d5864c9f..b87ba8ccdd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1163,7 +1163,7 @@ jobs: run: | RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) echo "Analyzing run directory: $RUN_DIR" - python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" + python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict - name: Upload merged benchmark artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md index bede28d85e..cb103d2529 100644 --- a/plugins/nemo-guardrails/benchmarks/README.md +++ b/plugins/nemo-guardrails/benchmarks/README.md @@ -15,9 +15,12 @@ benchmark modules with `PYTHONPATH` pointed at that checkout. plugins/nemo-guardrails/benchmarks/ configs/ nmp_igw_guardrails_sweep_concurrency.yaml # AIPerf sweep template + mock_llm/ # in-repo mock LLM env files + results/ # historical baseline notes artifacts/ # per-run outputs (gitignored) plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/ run.py # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run` + analyze.py # post-run analysis; checks latencies against baseline values paths.py # filesystem layout constants.py # workspace / VM / provider names processes.py # subprocess supervision (process groups + ExitStack) @@ -181,15 +184,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs// ## CI -A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both -this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and -`make benchmark-guardrails`, and uploads the per-run artifacts directory -(`logs/`, `generated/`, `aiperf_results/`) on success or failure. +Two jobs in `.github/workflows/ci.yaml`: -Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf` -itself exits non-zero or any sweep returns a non-zero exit code. No latency -thresholds are enforced — those can be layered on later by a separate -analyzer that reads the per-sweep CSVs. +- `benchmark-guardrails` — matrix of two parallel jobs, one per variant + (`with-guardrails`, `without-guardrails`), each on its own NMP instance. + Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`). +- `benchmark-guardrails-analyze` — joins the two matrix jobs, downloads both + artifacts, prints a side-by-side comparison via + `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check + (see below). Fails the build on a latency regression beyond tolerance. The + analyzer is stdlib-only by design, so this job runs on the runner's stock + `python3` without bootstrapping the uv workspace. + +### Baseline and gating + +CI compares the run's delta_p50 (with-guardrails minus without-guardrails +p50, in ms) against a checked-in baseline. The baseline lives as +module-level constants in: + +```text +plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +``` + +Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50 +isolates the middleware's contribution — shared CI runner noise cancels +across the two variants. + +#### Baseline constants + +- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on. + Other levels still appear in the analyzer's output tables, but pass/fail + is decided only by these. +- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied + to every validated concurrency. A check fails when + `|observed - baseline| > tolerance`. +- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency + tolerance overrides (in ms). Levels without an override fall back to the + default. +- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50 + (in ms) per concurrency level. Edit by hand when a real change shifts + the numbers. + +Worked example: at c=16 the override is 300 ms, so a run with observed +delta_p50 = 1689 (diff +299 from baseline 1390) passes; observed +delta_p50 = 1691 (diff +301) fails. + +Notes on the current values: + +- c=16 and c=32 use wider tolerances than the default because their + absolute delta_p50 is larger. Over time, we can tighten these values + if latencies in CI produce less variance. +- Any change to mock-LLM latencies, the guardrails config, or the runner + class invalidates the current baseline values. The benchmark should be + re-run in CI several tiems to establish updated baseline values. + +#### Running the analyzer locally + +```bash +python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \ + plugins/nemo-guardrails/benchmarks/artifacts/runs/ +``` + +Local runs print both tables and the baseline-check table. +CI passes `--strict` to make any out-of-tolerance check fail the job. + +#### Updating the baseline + +When a real change shifts the numbers (ex. a deliberate middleware change, +a mock-LLM config change, or a runner-class change), edit the constants at +the top of `analyze.py` by hand and reference the PR / CI run that +justifies it in the commit. ## Cleanup diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py index 60545a70b1..8e7671edcb 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -26,6 +26,36 @@ VARIANT_WITH_GUARDRAILS = "with-guardrails" VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" +# --- CI baseline gate --------------------------------------------------------- +# For each concurrency level we list: +# - The expected p50 latency delta between requests with guardrails vs. +# without guardrails. +# - The allowed plus/minus tolerance in CI. Benchmark jobs whose p50 +# latency exceeds this tolerance will fail. + +# Concurrency levels we check in CI. +CONCURRENCIES_TO_VALIDATE: list[int] = [1, 2, 4, 8, 16, 32] + +# Tolerance (ms) used for every concurrency level unless overridden below. +DEFAULT_DELTA_P50_TOLERANCE_MS: int = 150 + +# Looser tolerance (ms) for higher concurrencies. With more requests in +# flight at once, they contend for shared resources (the IGW event loop, +# the mock-LLM workers, the CI runner's CPU), so we see more variance in +# latency values. +DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 300} + +# Estimated expected delta_p50 (ms) at each concurrency level, based on +# a few sample runs in CI. +DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int] = { + 1: 1070, + 2: 1110, + 4: 1190, + 8: 1230, + 16: 1390, + 32: 2430, +} + log = logging.getLogger(__name__) _LATENCY_METRIC = "Request Latency (ms)" @@ -134,7 +164,18 @@ def format_table(rows: list[ComparisonRow]) -> str: if not rows: return "No comparable sweep results found (need both variants to share concurrency levels)." - header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg") + header = ( + "conc", + "with p50", + "w/o p50", + "delta p50", + "with p90", + "w/o p90", + "delta p90", + "with avg", + "w/o avg", + "delta avg", + ) fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" header_line = fmt.format(*header) lines = [header_line, "-" * len(header_line)] @@ -154,7 +195,7 @@ def format_table(rows: list[ComparisonRow]) -> str: ) ) lines.append("") - lines.append("All values in milliseconds. 'Δ' = with-guardrails minus without-guardrails.") + lines.append("All values in milliseconds. 'delta' columns = with-guardrails minus without-guardrails.") return "\n".join(lines) @@ -162,13 +203,24 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: """Render a table with mock-LLM time subtracted from p50/p90/avg. Isolates NMP + IGW + shim + middleware overhead from the much larger - mock sleeps. The Δ columns are the middleware's own cost over the bare - path. + mock sleeps. The delta columns are the middleware's own cost over the + bare path. """ if not rows: return "No comparable sweep results found (need both variants to share concurrency levels)." - header = ("conc", "with p50", "w/o p50", "Δ p50", "with p90", "w/o p90", "Δ p90", "with avg", "w/o avg", "Δ avg") + header = ( + "conc", + "with p50", + "w/o p50", + "delta p50", + "with p90", + "w/o p90", + "delta p90", + "with avg", + "w/o avg", + "delta avg", + ) fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" header_line = fmt.format(*header) lines = [header_line, "-" * len(header_line)] @@ -201,7 +253,7 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; " f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)." ) - lines.append("'Δ' columns are the middleware's own overhead over the bare NMP+IGW path.") + lines.append("'delta' columns are the middleware's own overhead over the bare NMP+IGW path.") return "\n".join(lines) @@ -227,6 +279,100 @@ def analyze_run(run_dir: Path) -> str: return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}" +def _load_comparison_rows(run_dir: Path) -> list[ComparisonRow]: + """Reload comparison rows from a run dir; returns ``[]`` if either variant is absent.""" + aiperf_dir = run_dir / "aiperf_results" + with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) + without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) + if not with_guardrails or not without_guardrails: + return [] + return compare(with_guardrails, without_guardrails) + + +@dataclass(frozen=True) +class LatencyReport: + """Latency results for a single concurrency level, rendered as one row of the report. + + Each instance represents a single concurrency level from the benchmark + run: what we measured (observed_ms), what we expected from the + baseline (baseline_ms), and how much they're allowed to differ + (tolerance_ms). + The check passes when |observed_ms - baseline_ms| <= tolerance_ms. + """ + + concurrency: int + metric: str + baseline_ms: float + observed_ms: float + tolerance_ms: float + + @property + def diff_ms(self) -> float: + return self.observed_ms - self.baseline_ms + + @property + def passed(self) -> bool: + return abs(self.diff_ms) <= self.tolerance_ms + + +def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]: + """Compare the delta_p50 for each concurrency level against the baseline latencies. + + Returns ``(report_text, failed_count)``. Concurrencies missing from + either the run or the baseline are skipped with a note. + """ + rows_by_concurrency = {r.concurrency: r for r in rows} + + latency_reports: list[LatencyReport] = [] + skipped_concurrencies: list[int] = [] + + for concurrency in sorted(CONCURRENCIES_TO_VALIDATE): + if concurrency not in rows_by_concurrency or concurrency not in DELTA_P50_BASELINE_BY_CONCURRENCY: + skipped_concurrencies.append(concurrency) + continue + latency_reports.append( + LatencyReport( + concurrency=concurrency, + metric="delta_p50", + baseline_ms=float(DELTA_P50_BASELINE_BY_CONCURRENCY[concurrency]), + observed_ms=rows_by_concurrency[concurrency].delta_p50, + tolerance_ms=float(DELTA_P50_TOLERANCE_OVERRIDES_MS.get(concurrency, DEFAULT_DELTA_P50_TOLERANCE_MS)), + ) + ) + + fmt = "{:>9} {:>4} {:>10} {:>10} {:>9} {:>11} {:>6}" + header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status") + lines = [ + "Check vs baseline (see DELTA_P50_BASELINE_BY_CONCURRENCY in analyze.py):", + header_line, + "-" * len(header_line), + ] + failed_count = 0 + for report in latency_reports: + status = "PASS" if report.passed else "FAIL" + if not report.passed: + failed_count += 1 + lines.append( + fmt.format( + report.metric, + report.concurrency, + f"{report.baseline_ms:.0f}", + f"{report.observed_ms:.0f}", + f"{report.diff_ms:+.0f}", + f"±{report.tolerance_ms:.0f}ms", + status, + ) + ) + if skipped_concurrencies: + lines.append("") + lines.append(f"Skipped (missing from results or baseline): {skipped_concurrencies}") + if failed_count: + lines.append("") + lines.append(f"FAIL: {failed_count} of {len(latency_reports)} check(s) exceeded tolerance.") + + return "\n".join(lines), failed_count + + def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str: """Render one variant's table when the other variant didn't run.""" fmt = "{:>4} {:>9} {:>9} {:>9} {:>9}" @@ -300,7 +446,12 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument( "run_dir", type=Path, - help=("Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs//`."), + help="Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs//`.", + ) + parser.add_argument( + "--strict", + action="store_true", + help="Exit non-zero when any baseline check exceeds tolerance. CI sets this; local runs default off so you can iterate without the gate failing.", ) parser.add_argument( "--log-level", @@ -317,7 +468,16 @@ def main(argv: list[str] | None = None) -> int: return 2 print(analyze_run(run_dir)) - return 0 + + rows = _load_comparison_rows(run_dir) + if not rows: + print("Skipping baseline check: no comparable rows from this run.", file=sys.stderr) + return 0 if not args.strict else 2 + + report, failed_count = check_against_baseline(rows) + print() + print(report) + return 1 if (args.strict and failed_count) else 0 if __name__ == "__main__": From 98f5d457c84218eb12e3e41d3c492983ab976ac6 Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Tue, 23 Jun 2026 10:09:36 -0400 Subject: [PATCH 6/8] Clean up CI config and baselines Signed-off-by: Jash Gulabrai --- .github/actions/changes/action.yaml | 6 + .github/workflows/ci.yaml | 36 ++-- plugins/nemo-guardrails/benchmarks/README.md | 5 +- .../results/local-baseline-2026-06-16.md | 154 ------------------ .../benchmarks/analyze.py | 20 +-- 5 files changed, 39 insertions(+), 182 deletions(-) delete mode 100644 plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml index f6a7a27d2e..948d73015d 100644 --- a/.github/actions/changes/action.yaml +++ b/.github/actions/changes/action.yaml @@ -40,6 +40,9 @@ outputs: k8s-smoke: description: "'true' if Kubernetes smoke test support files changed" value: ${{ steps.filter.outputs.k8s-smoke }} + guardrails-benchmark: + description: "'true' if the nemo-guardrails plugin or guardrails service changed" + value: ${{ steps.filter.outputs.guardrails-benchmark }} cpu-smoke: description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed" value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }} @@ -97,3 +100,6 @@ runs: - 'e2e/k8s/values/**' - 'e2e/test_jobs.py' - '.github/actions/free-disk-space/action.yaml' + guardrails-benchmark: + - 'plugins/nemo-guardrails/**' + - 'services/guardrails/**' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b87ba8ccdd..8f1ac2e718 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -44,6 +44,7 @@ jobs: docker: ${{ steps.changes.outputs.docker }} helm: ${{ steps.changes.outputs.helm }} cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }} + guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }} steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ./.github/actions/changes @@ -1071,12 +1072,17 @@ jobs: retention-days: 7 path: web/packages/studio/playwright-report/ - benchmark-guardrails: + guardrails-benchmark: # Parallel matrix jobs (one NMP per variant) so the two sweeps don't - # share mocks or contend on :8080. `benchmark-guardrails-analyze` merges + # share mocks or contend on :8080. `guardrails-benchmark-analyze` merges # the artifacts and prints the comparison. - name: Guardrails plugin benchmark (${{ matrix.variant }}) - # TODO(jgulabrai): gate back to `workflow_dispatch` once we have a CI baseline. + name: nemo-guardrails plugin benchmark (${{ matrix.variant }}) + needs: [changes] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.guardrails-benchmark == 'true' + ) runs-on: ubuntu-latest timeout-minutes: 30 strategy: @@ -1125,16 +1131,20 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: # Ensure we use a unique artifact name per benchmark vaiant. - name: benchmark-guardrails-results-${{ matrix.variant }} + name: guardrails-benchmark-results-${{ matrix.variant }} retention-days: 30 path: | nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - benchmark-guardrails-analyze: + guardrails-benchmark-analyze: # Merge both variant artifacts and print the comparison table. - name: Guardrails benchmark analysis - needs: [benchmark-guardrails] - if: ${{ !cancelled() }} + name: nemo-guardrails plugin benchmark analysis + needs: [changes, guardrails-benchmark] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.guardrails-benchmark == 'true' + ) runs-on: ubuntu-latest timeout-minutes: 10 steps: @@ -1148,13 +1158,13 @@ jobs: continue-on-error: true uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - name: benchmark-guardrails-results-with-guardrails + name: guardrails-benchmark-results-with-guardrails path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - name: Download without-guardrails artifact continue-on-error: true uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - name: benchmark-guardrails-results-without-guardrails + name: guardrails-benchmark-results-without-guardrails path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ - name: Print benchmark comparison working-directory: nemo-platform @@ -1169,7 +1179,7 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: # Single artifact so baseline collection is one download per run. - name: benchmark-guardrails-results-merged + name: guardrails-benchmark-results-merged retention-days: 30 path: | nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ @@ -1320,8 +1330,6 @@ jobs: - web-sdk-gen - web-studio-deps - web-studio-e2e - - benchmark-guardrails - - benchmark-guardrails-analyze - opa-policy-test if: always() runs-on: ubuntu-latest diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md index cb103d2529..f9084971c8 100644 --- a/plugins/nemo-guardrails/benchmarks/README.md +++ b/plugins/nemo-guardrails/benchmarks/README.md @@ -16,7 +16,6 @@ plugins/nemo-guardrails/benchmarks/ configs/ nmp_igw_guardrails_sweep_concurrency.yaml # AIPerf sweep template mock_llm/ # in-repo mock LLM env files - results/ # historical baseline notes artifacts/ # per-run outputs (gitignored) plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/ run.py # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run` @@ -186,10 +185,10 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs// Two jobs in `.github/workflows/ci.yaml`: -- `benchmark-guardrails` — matrix of two parallel jobs, one per variant +- `guardrails-benchmark` — matrix of two parallel jobs, one per variant (`with-guardrails`, `without-guardrails`), each on its own NMP instance. Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`). -- `benchmark-guardrails-analyze` — joins the two matrix jobs, downloads both +- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both artifacts, prints a side-by-side comparison via `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check (see below). Fails the build on a latency regression beyond tolerance. The diff --git a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md b/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md deleted file mode 100644 index 509c9e8c62..0000000000 --- a/plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md +++ /dev/null @@ -1,154 +0,0 @@ -# Local baseline results — 2026-06-16 - -Three back-to-back runs of `make benchmark-guardrails` on a local MacBook Pro -(Apple Silicon), no other heavy workloads running. Goal: characterize the -run-to-run variance of the new with-guardrails / without-guardrails harness so -we can decide what's gateable in CI. - -## Hardware / setup - -- Host: MacBook Pro, Apple Silicon, on AC power -- NMP, mocks, shim: all on localhost -- Mock LLM config: in-repo defaults (`plugins/nemo-guardrails/benchmarks/configs/mock_llm/`) - - app LLM: 4.0s e2e latency, std 0 - - content-safety LLM: 0.5s e2e latency, std 0 -- AIPerf sweep: concurrency `[1, 2, 4, 8, 16, 32, 64]`, `benchmark_duration: 60s`, - `warmup_request_count: 10`, non-streaming chat completions -- Mock workers: 4 (default) -- Three runs in the same afternoon, NMP data dir reused across runs - -## Run inventory - -| Run | Run dir | Notes | -|---|---|---| -| 1 | `20260616_123851` | first run after the with/without harness change | -| 2 | `20260616_145058` | identical config | -| 3 | `20260616_152834` | identical config | - -All three runs completed with 7/7 sweeps passing per variant, exit code 0. - -## Δp50 (with-guardrails − without-guardrails), milliseconds - -This is the headline metric: how much wall-clock time the guardrails middleware -adds on top of the bare NMP+IGW path, including the two content-safety LLM -round-trips that the rails cause but don't do themselves. - -| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | -|---------|-----:|-----:|-----:|-----:|-----:|-----:|--------:| -| Run 1 | 1029 | 1071 | 1068 | 1104 | 1145 | 1260 | 778 | -| Run 2 | 1027 | 1062 | 1096 | 1105 | 1226 | 1256 | -2896 | -| Run 3 | 1030 | 1062 | 1079 | 1070 | 1118 | 1201 | -2077 | -| **mean**| **1029** | **1065** | **1081** | **1093** | **1163** | **1239** | **−1398** | -| range | 3 | 9 | 28 | 35 | 108 | 59 | 3674 | -| range % | 0.3% | 0.8% | 2.6% | 3.2% | 9.3% | 4.8% | n/a | - -## with-guardrails p50 (absolute), milliseconds - -Useful as a sanity check that nothing catastrophic shifted in the absolute -numbers — even if Δp50 stays steady, both variants could slow down together. - -| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | -|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| -| Run 1 | 5049 | 5101 | 5114 | 5152 | 5201 | 5318 | 6164 | -| Run 2 | 5048 | 5093 | 5125 | 5137 | 5255 | 5279 | 5614 | -| Run 3 | 5050 | 5094 | 5123 | 5146 | 5163 | 5250 | 5486 | -| **mean**| **5049** | **5096** | **5121** | **5145** | **5206** | **5282** | **5755** | -| range | 2 | 8 | 11 | 15 | 92 | 68 | 678 | -| range % | 0.0% | 0.2% | 0.2% | 0.3% | 1.8% | 1.3% | 11.8%| - -## without-guardrails p50 (absolute), milliseconds - -For completeness. This is the variant that's wildly unstable at c=64. - -| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | -|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| -| Run 1 | 4020 | 4030 | 4045 | 4048 | 4056 | 4058 | 5386 | -| Run 2 | 4020 | 4031 | 4029 | 4032 | 4029 | 4023 | 8510 | -| Run 3 | 4020 | 4032 | 4044 | 4076 | 4045 | 4049 | 7563 | -| **mean**| **4020** | **4031** | **4039** | **4052** | **4043** | **4043** | **7153** | -| range | 0 | 2 | 16 | 44 | 27 | 35 | 3124 | - -The app mock sleeps for exactly 4.0s. The ~20–80 ms above 4000 across c=1–c=32 -is pure NMP+IGW+shim overhead. At c=64 the mock saturates (4 workers × 1 req/4s -= 4 RPS ceiling, vs. 64 requested in-flight) and requests queue. - -## p90 — informational only - -p90 is much noisier than p50 across runs. Not gateable with three samples. - -### Δp90, milliseconds - -| Run | c=1 | c=2 | c=4 | c=8 | c=16 | c=32 | c=64 | -|-------|-----:|-----:|-----:|-----:|-----:|-----:|------:| -| Run 1 | 1039 | 1099 | 1162 | 1025 | 911 | 604 | 3009 | -| Run 2 | 1028 | 1115 | 1160 | 1262 | 783 | 641 | 1015 | -| Run 3 | 1023 | 1076 | 1189 | 1085 | 1209 | 18 | 1998 | - -## Observations - -### What's stable enough to gate on - -**c=1, 2, 4, 8.** The Δp50 ranges are 3–35 ms, well under any tolerance we'd -realistically write. The absolute with-guardrails p50 is even tighter (2–15 ms -across three runs). This is the regime where the harness is genuinely measuring -what we want: NMP+middleware overhead on top of fixed-latency mocks. - -### What's borderline - -**c=16.** Δp50 range is 9.3%. Gateable with a generous tolerance (~10%+) but -adds limited signal beyond c=8. - -### What's not gateable - -**c=32.** ~5% Δp50 range. Still bounded, but the run-to-run distance is -several times larger than at c=1–c=8 and the absolute numbers wobble too. - -**c=64.** Unusable. Δp50 swings from +778 to −2896 across three runs. -Root cause is the app mock's 4-worker saturation at this load level: the -without-guardrails path fires app requests as fast as it can and the mock queues -unpredictably. The with-guardrails path's CS-mock work paces requests enough to -hide most of this. This is a test-rig artifact, not an NMP behavior. - -### Side observation: middleware overhead is small - -Of the ~1029 ms Δp50 at c=1: -- ~1000 ms is the two content-safety mock round-trips (0.5s each, mandatory). -- ~29 ms is the middleware's *own* work (rails orchestration, request/response - shaping, etc.) plus bare NMP+IGW overhead delta vs. without-guardrails. - -The without-guardrails baseline of ~4020 ms at c=1 against a 4000 ms mock means -**bare NMP+IGW+shim overhead is ~20 ms** at idle. - -## Recommendation for the CI gate - -Based on the variance data above: - -| Concurrency | Gate Δp50? | Gate absolute with-guardrails p50? | Notes | -|---|---|---|---| -| 1 | yes | yes | tightest signal | -| 2 | yes | yes | | -| 4 | yes | yes | | -| 8 | yes | yes | | -| 16 | informational | informational | record but don't fail | -| 32 | informational | informational | record but don't fail | -| 64 | exclude | exclude | mock saturation, not gateable | - -Proposed tolerance bands (`max(absolute_ms, relative_%)`): -- Δp50: `max(±100 ms, ±5%)` -- with-guardrails p50: `max(±150 ms, ±3%)` - -Both bands are ~3× the observed local run-to-run range, leaving headroom for -CI hardware noise being noisier than a quiet laptop. - -## Open questions / followups - -- **Local baselines won't transfer to CI hardware.** These numbers should seed - the baseline file but be replaced once we have N runs from the actual CI - runner class. -- **Three samples is a small N.** Worth one more local run (Run 4) before we - treat the means above as canonical, but the c=1–c=8 numbers are unlikely - to budge meaningfully. -- **c=64 instability is downstream of NMP.** Hypothesis: app mock's 4 workers - saturate at concurrency 64 (4 RPS ceiling on 4.0s sleep). Easy to test by - running with `--mock-workers 16`. Not blocking the gate work since c=64 is - excluded anyway. diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py index 8e7671edcb..7413a074cc 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -43,7 +43,7 @@ # flight at once, they contend for shared resources (the IGW event loop, # the mock-LLM workers, the CI runner's CPU), so we see more variance in # latency values. -DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 300} +DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 450} # Estimated expected delta_p50 (ms) at each concurrency level, based on # a few sample runs in CI. @@ -53,7 +53,7 @@ 4: 1190, 8: 1230, 16: 1390, - 32: 2430, + 32: 2110, } log = logging.getLogger(__name__) @@ -178,7 +178,7 @@ def format_table(rows: list[ComparisonRow]) -> str: ) fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" header_line = fmt.format(*header) - lines = [header_line, "-" * len(header_line)] + lines = ["Measured Latencies (ms), with and without guardrails:", header_line, "-" * len(header_line)] for r in rows: lines.append( fmt.format( @@ -195,7 +195,7 @@ def format_table(rows: list[ComparisonRow]) -> str: ) ) lines.append("") - lines.append("All values in milliseconds. 'delta' columns = with-guardrails minus without-guardrails.") + lines.append("delta = with-guardrails minus without-guardrails.") return "\n".join(lines) @@ -223,7 +223,7 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: ) fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" header_line = fmt.format(*header) - lines = [header_line, "-" * len(header_line)] + lines = ["Platform Overhead (ms), with and without guardrails:", header_line, "-" * len(header_line)] for r in rows: with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS @@ -248,12 +248,10 @@ def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: ) lines.append("") lines.append( - "All values in milliseconds, with mock-LLM time subtracted " - f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms = 1× app + " - f"{_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST}× content-safety; " - f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms = 1× app)." + f"Minus mock-LLM time " + f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms; " + f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms)." ) - lines.append("'delta' columns are the middleware's own overhead over the bare NMP+IGW path.") return "\n".join(lines) @@ -343,7 +341,7 @@ def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]: fmt = "{:>9} {:>4} {:>10} {:>10} {:>9} {:>11} {:>6}" header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status") lines = [ - "Check vs baseline (see DELTA_P50_BASELINE_BY_CONCURRENCY in analyze.py):", + "Guardrails Overhead vs. Baseline (ms):", header_line, "-" * len(header_line), ] From 8a55a3ac14f695c38fc7964113ef33b9fbdcbaa2 Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Tue, 23 Jun 2026 10:15:45 -0400 Subject: [PATCH 7/8] Minor ReadME cleanup Signed-off-by: Jash Gulabrai --- .../benchmarks/configs/mock_llm/README.md | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md index 1cd5cc3dd4..87c1563b59 100644 --- a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md @@ -1,21 +1,15 @@ # Mock LLM configurations -These `.env` files configure the upstream `benchmark.mock_llm_server.run_server` -(from the `NeMo-Guardrails` checkout) for the IGW guardrails benchmark. +These `.env` files configure the behavior of the mock LLMs, used by the upstream +`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`. -We keep our own copies (instead of pointing at the upstream checkout's -`benchmark/mock_llm_server/configs/`) so: +The library stores these files, but we keep our own copies so: -- We can change mock latency without touching the upstream repo. The original - motivation was tuning `E2E_LATENCY_*` to isolate NMP+middleware overhead - from mandatory NIM sleep (see the benchmark README for the full rationale). +- We can change mock latency without touching the upstream repo. - The exact mock behavior we benchmarked against is versioned alongside the results, so historical numbers stay reproducible even if upstream changes its defaults. -Initial contents are a verbatim copy of the upstream files: - +Mapping to upstream files: - `app-llm.env` ← upstream `meta-llama-3.3-70b-instruct.env` - `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env` - -Update either file to change mock behavior for the next benchmark run. From dcaf12f3bfb11326f494832c86463f4fe1e55a3b Mon Sep 17 00:00:00 2001 From: Jash Gulabrai Date: Tue, 23 Jun 2026 10:34:30 -0400 Subject: [PATCH 8/8] Address CodeRabbit Signed-off-by: Jash Gulabrai --- .github/workflows/ci.yaml | 3 +++ plugins/nemo-guardrails/benchmarks/README.md | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8f1ac2e718..9daeb9c1c2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1095,11 +1095,13 @@ jobs: uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: path: nemo-platform + persist-credentials: false - name: Checkout NeMo-Guardrails uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: repository: NVIDIA/NeMo-Guardrails path: NeMo-Guardrails + persist-credentials: false - name: Install uv uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 with: @@ -1152,6 +1154,7 @@ jobs: uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: path: nemo-platform + persist-credentials: false - name: Download with-guardrails artifact # If a variant failed entirely it may have uploaded no artifact; # the analyzer handles the single-variant case so don't fail here. diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md index f9084971c8..f0894902ae 100644 --- a/plugins/nemo-guardrails/benchmarks/README.md +++ b/plugins/nemo-guardrails/benchmarks/README.md @@ -224,9 +224,9 @@ across the two variants. (in ms) per concurrency level. Edit by hand when a real change shifts the numbers. -Worked example: at c=16 the override is 300 ms, so a run with observed -delta_p50 = 1689 (diff +299 from baseline 1390) passes; observed -delta_p50 = 1691 (diff +301) fails. +Worked example: at c=16 the override is 200 ms, so a run with observed +delta_p50 = 1689 (diff +199 from baseline 1390) passes; observed +delta_p50 = 1691 (diff +201) fails. Notes on the current values: