diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml index f6a7a27d2e..948d73015d 100644 --- a/.github/actions/changes/action.yaml +++ b/.github/actions/changes/action.yaml @@ -40,6 +40,9 @@ outputs: k8s-smoke: description: "'true' if Kubernetes smoke test support files changed" value: ${{ steps.filter.outputs.k8s-smoke }} + guardrails-benchmark: + description: "'true' if the nemo-guardrails plugin or guardrails service changed" + value: ${{ steps.filter.outputs.guardrails-benchmark }} cpu-smoke: description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed" value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }} @@ -97,3 +100,6 @@ runs: - 'e2e/k8s/values/**' - 'e2e/test_jobs.py' - '.github/actions/free-disk-space/action.yaml' + guardrails-benchmark: + - 'plugins/nemo-guardrails/**' + - 'services/guardrails/**' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f8643c9cb5..9daeb9c1c2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -44,6 +44,7 @@ jobs: docker: ${{ steps.changes.outputs.docker }} helm: ${{ steps.changes.outputs.helm }} cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }} + guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }} steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ./.github/actions/changes @@ -1071,21 +1072,36 @@ jobs: retention-days: 7 path: web/packages/studio/playwright-report/ - benchmark-guardrails: - name: Guardrails plugin benchmark - if: github.event_name == 'workflow_dispatch' + guardrails-benchmark: + # Parallel matrix jobs (one NMP per variant) so the two sweeps don't + # share 
mocks or contend on :8080. `guardrails-benchmark-analyze` merges + # the artifacts and prints the comparison. + name: nemo-guardrails plugin benchmark (${{ matrix.variant }}) + needs: [changes] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.guardrails-benchmark == 'true' + ) runs-on: ubuntu-latest timeout-minutes: 30 + strategy: + # Keep the partial artifact if one variant fails. + fail-fast: false + matrix: + variant: [with-guardrails, without-guardrails] steps: - name: Checkout nemo-platform uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: path: nemo-platform + persist-credentials: false - name: Checkout NeMo-Guardrails uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: repository: NVIDIA/NeMo-Guardrails path: NeMo-Guardrails + persist-credentials: false - name: Install uv uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 with: @@ -1102,7 +1118,13 @@ jobs: PYTORCH_DEPS: cpu - name: Run benchmark sweep working-directory: nemo-platform - run: make benchmark-guardrails + # Pin both variants to the same `--run-id` so when the analyze job + # downloads both artifacts into one `runs/` parent, they merge into + # a single run directory the analyzer can read normally. + run: | + make benchmark-guardrails BENCHMARK_ARGS="\ + --variant ${{ matrix.variant }} \ + --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}" env: NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails _TYPER_FORCE_DISABLE_TERMINAL: "1" @@ -1110,7 +1132,57 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: benchmark-guardrails-results + # Ensure we use a unique artifact name per benchmark variant. 
+ name: guardrails-benchmark-results-${{ matrix.variant }} + retention-days: 30 + path: | + nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + + guardrails-benchmark-analyze: + # Merge both variant artifacts and print the comparison table. + name: nemo-guardrails plugin benchmark analysis + needs: [changes, guardrails-benchmark] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.guardrails-benchmark == 'true' + ) + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout nemo-platform + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + path: nemo-platform + persist-credentials: false + - name: Download with-guardrails artifact + # If a variant failed entirely it may have uploaded no artifact; + # the analyzer handles the single-variant case so don't fail here. + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: guardrails-benchmark-results-with-guardrails + path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + - name: Download without-guardrails artifact + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: guardrails-benchmark-results-without-guardrails + path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ + - name: Print benchmark comparison + working-directory: nemo-platform + # `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap + # step and run it with the runner's `python3` CLI directly. 
+ run: | + RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) + echo "Analyzing run directory: $RUN_DIR" + python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict + - name: Upload merged benchmark artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + # Single artifact so baseline collection is one download per run. + name: guardrails-benchmark-results-merged retention-days: 30 path: | nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/ @@ -1261,7 +1333,6 @@ jobs: - web-sdk-gen - web-studio-deps - web-studio-e2e - - benchmark-guardrails - opa-policy-test if: always() runs-on: ubuntu-latest diff --git a/plugins/nemo-guardrails/benchmarks/README.md b/plugins/nemo-guardrails/benchmarks/README.md index bede28d85e..f0894902ae 100644 --- a/plugins/nemo-guardrails/benchmarks/README.md +++ b/plugins/nemo-guardrails/benchmarks/README.md @@ -15,9 +15,11 @@ benchmark modules with `PYTHONPATH` pointed at that checkout. 
plugins/nemo-guardrails/benchmarks/ configs/ nmp_igw_guardrails_sweep_concurrency.yaml # AIPerf sweep template + mock_llm/ # in-repo mock LLM env files artifacts/ # per-run outputs (gitignored) plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/ run.py # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run` + analyze.py # post-run analysis; checks latencies against baseline values paths.py # filesystem layout constants.py # workspace / VM / provider names processes.py # subprocess supervision (process groups + ExitStack) @@ -181,15 +183,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs// ## CI -A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both -this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and -`make benchmark-guardrails`, and uploads the per-run artifacts directory -(`logs/`, `generated/`, `aiperf_results/`) on success or failure. +Two jobs in `.github/workflows/ci.yaml`: -Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf` -itself exits non-zero or any sweep returns a non-zero exit code. No latency -thresholds are enforced — those can be layered on later by a separate -analyzer that reads the per-sweep CSVs. +- `guardrails-benchmark` — matrix of two parallel jobs, one per variant + (`with-guardrails`, `without-guardrails`), each on its own NMP instance. + Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`). +- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both + artifacts, prints a side-by-side comparison via + `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check + (see below). Fails the build on a latency regression beyond tolerance. The + analyzer is stdlib-only by design, so this job runs on the runner's stock + `python3` without bootstrapping the uv workspace. 
+ +### Baseline and gating + +CI compares the run's delta_p50 (with-guardrails minus without-guardrails +p50, in ms) against a checked-in baseline. The baseline lives as +module-level constants in: + +```text +plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py +``` + +Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50 +isolates the middleware's contribution — shared CI runner noise cancels +across the two variants. + +#### Baseline constants + +- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on. + Other levels still appear in the analyzer's output tables, but pass/fail + is decided only by these. +- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied + to every validated concurrency. A check fails when + `|observed - baseline| > tolerance`. +- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency + tolerance overrides (in ms). Levels without an override fall back to the + default. +- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50 + (in ms) per concurrency level. Edit by hand when a real change shifts + the numbers. + +Worked example: at c=16 the override is 200 ms, so a run with observed +delta_p50 = 1589 (diff +199 from baseline 1390) passes; observed +delta_p50 = 1591 (diff +201) fails. + +Notes on the current values: + +- c=16 and c=32 use wider tolerances than the default because their + absolute delta_p50 is larger. Over time, we can tighten these values + if latencies in CI produce less variance. +- Any change to mock-LLM latencies, the guardrails config, or the runner + class invalidates the current baseline values. The benchmark should be + re-run in CI several times to establish updated baseline values. 
+ +#### Running the analyzer locally + +```bash +python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \ + plugins/nemo-guardrails/benchmarks/artifacts/runs/<run_id>/ +``` + +Local runs print both tables and the baseline-check table. +CI passes `--strict` to make any out-of-tolerance check fail the job. + +#### Updating the baseline + +When a real change shifts the numbers (e.g., a deliberate middleware change, +a mock-LLM config change, or a runner-class change), edit the constants at +the top of `analyze.py` by hand and reference the PR / CI run that +justifies it in the commit. ## Cleanup diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md new file mode 100644 index 0000000000..87c1563b59 --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md @@ -0,0 +1,15 @@ +# Mock LLM configurations + +These `.env` files configure the behavior of the mock LLMs, used by the upstream +`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`. + +The upstream library ships these files, but we keep our own copies so that: + +- We can change mock latency without touching the upstream repo. +- The exact mock behavior we benchmarked against is versioned alongside the + results, so historical numbers stay reproducible even if upstream changes + its defaults. + +Mapping to upstream files: +- `app-llm.env` ← upstream `meta-llama-3.3-70b-instruct.env` +- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env` diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env new file mode 100644 index 0000000000..0cec095b38 --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env @@ -0,0 +1,19 @@ +MODEL="meta/llama-3.3-70b-instruct" +UNSAFE_PROBABILITY=0.0 +UNSAFE_TEXT="I can't help with that. 
Is there anything else I can assist you with?" +SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=4.0 +E2E_LATENCY_MAX_SECONDS=4.0 +E2E_LATENCY_MEAN_SECONDS=4.0 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.3 +TTFT_MAX_SECONDS=0.3 +TTFT_MEAN_SECONDS=0.3 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env new file mode 100644 index 0000000000..9e2467b1db --- /dev/null +++ b/plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env @@ -0,0 +1,19 @@ +MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety" +UNSAFE_PROBABILITY=0.0 +UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}" +SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}" +# End-to-end latency +E2E_LATENCY_MIN_SECONDS=0.5 +E2E_LATENCY_MAX_SECONDS=0.5 +E2E_LATENCY_MEAN_SECONDS=0.5 +E2E_LATENCY_STD_SECONDS=0.0 +# Streaming latency: Time to First Token (TTFT) +TTFT_MIN_SECONDS=0.2 +TTFT_MAX_SECONDS=0.2 +TTFT_MEAN_SECONDS=0.2 +TTFT_STD_SECONDS=0.0 +# Streaming latency: Chunk Latency (ITL) +CHUNK_LATENCY_MIN_SECONDS=0.015 +CHUNK_LATENCY_MAX_SECONDS=0.015 +CHUNK_LATENCY_MEAN_SECONDS=0.015 +CHUNK_LATENCY_STD_SECONDS=0.0 diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py 
b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py index f7ff6729c1..20963e11f3 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py @@ -60,17 +60,15 @@ def prepare_runtime_aiperf_config( template_path: Path, runtime_config_path: Path, aiperf_output_dir: Path, + model_ref: str | None = None, ) -> dict[str, Any]: """Materialize the AIPerf config this run will use. - Reads the checked-in ``template_path`` config, overrides its - ``output_base_dir`` to point inside the current run's directory, and writes - the result to ``runtime_config_path``. AIPerf is later invoked with - ``--config-file `` so every artifact lands under a - separate per-run directory. - - Returns the parsed config dict so callers can log fields (sweep params, - benchmark_duration) without re-reading the file. + Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf + artifacts nest under this run) and optionally ``base_config.model`` + (so one template can target multiple VirtualModels), and writes the + result to ``runtime_config_path``. Returns the parsed config so + callers can log sweep params without re-reading the file. """ if not template_path.is_file(): raise FileNotFoundError(f"AIPerf template not found: {template_path}") @@ -82,6 +80,11 @@ def prepare_runtime_aiperf_config( # Point AIPerf's output_base_dir at this run's directory so its results # nest under our per-run artifacts tree. 
config["output_base_dir"] = str(aiperf_output_dir) + if model_ref is not None: + base_config = config.get("base_config") + if not isinstance(base_config, dict): + raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}") + base_config["model"] = model_ref runtime_config_path.parent.mkdir(parents=True, exist_ok=True) runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8") diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py new file mode 100644 index 0000000000..7413a074cc --- /dev/null +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py @@ -0,0 +1,482 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Post-run analyzer for the nemo-guardrails IGW benchmark. + +Reads ``profile_export_aiperf.csv`` files from both variants in one run dir +and prints a with-vs-without latency comparison. The delta isolates +middleware overhead since the only difference between variants is whether +middleware is attached to the targeted VirtualModel. + +Used both as a script (``python -m ... ``) and auto-invoked from +``run.py`` after a multi-variant sweep. +""" + +from __future__ import annotations + +import argparse +import csv +import logging +import sys +from dataclasses import dataclass +from pathlib import Path + +# Duplicated from `constants.py` so this module stays import-free and can +# run on bare `python3` in CI without bootstrapping the uv workspace. +VARIANT_WITH_GUARDRAILS = "with-guardrails" +VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" + +# --- CI baseline gate --------------------------------------------------------- +# For each concurrency level we list: +# - The expected p50 latency delta between requests with guardrails vs. +# without guardrails. 
+# - The allowed plus/minus tolerance in CI. Benchmark jobs whose p50 +# latency exceeds this tolerance will fail. + +# Concurrency levels we check in CI. +CONCURRENCIES_TO_VALIDATE: list[int] = [1, 2, 4, 8, 16, 32] + +# Tolerance (ms) used for every concurrency level unless overridden below. +DEFAULT_DELTA_P50_TOLERANCE_MS: int = 150 + +# Looser tolerance (ms) for higher concurrencies. With more requests in +# flight at once, they contend for shared resources (the IGW event loop, +# the mock-LLM workers, the CI runner's CPU), so we see more variance in +# latency values. +DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int] = {16: 200, 32: 450} + +# Estimated expected delta_p50 (ms) at each concurrency level, based on +# a few sample runs in CI. +DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int] = { + 1: 1070, + 2: 1110, + 4: 1190, + 8: 1230, + 16: 1390, + 32: 2110, +} + +log = logging.getLogger(__name__) + +_LATENCY_METRIC = "Request Latency (ms)" + +# Mock-LLM time per request, subtracted to isolate platform overhead. Mirrors +# `E2E_LATENCY_MEAN_SECONDS` in configs/mock_llm/*.env and the 2 CS calls +# (input + output rails) of `content_safety_local`. Update in lock-step. 
+_APP_MOCK_LATENCY_MS = 4000.0 +_CONTENT_SAFETY_MOCK_LATENCY_MS = 500.0 +_CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST = 2 +_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS = _APP_MOCK_LATENCY_MS +_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS = ( + _APP_MOCK_LATENCY_MS + _CONTENT_SAFETY_CALLS_PER_GUARDED_REQUEST * _CONTENT_SAFETY_MOCK_LATENCY_MS +) + + +@dataclass(frozen=True) +class LatencyRow: + """Per-concurrency latency stats parsed from one AIPerf CSV.""" + + concurrency: int + avg: float + p50: float + p90: float + p99: float + std: float + + +@dataclass(frozen=True) +class ComparisonRow: + """Side-by-side comparison of one concurrency level across variants.""" + + concurrency: int + with_guardrails: LatencyRow + without_guardrails: LatencyRow + + @property + def delta_p50(self) -> float: + return self.with_guardrails.p50 - self.without_guardrails.p50 + + @property + def delta_p90(self) -> float: + return self.with_guardrails.p90 - self.without_guardrails.p90 + + @property + def delta_avg(self) -> float: + return self.with_guardrails.avg - self.without_guardrails.avg + + +def load_variant_results(variant_output_dir: Path) -> dict[int, LatencyRow]: + """Load per-concurrency latency stats for one variant. + + Walks the ``//concurrency/`` layout produced by + ``collect_sweep_results``. Missing CSVs are skipped, not raised, so + partial runs still produce a table. 
+ """ + if not variant_output_dir.is_dir(): + return {} + + latency_by_concurrency: dict[int, LatencyRow] = {} + for batch_dir in sorted(p for p in variant_output_dir.iterdir() if p.is_dir()): + for timestamp_dir in sorted(p for p in batch_dir.iterdir() if p.is_dir()): + for sweep_dir in sorted(p for p in timestamp_dir.iterdir() if p.is_dir()): + concurrency = _parse_concurrency_from_label(sweep_dir.name) + if concurrency is None: + continue + csv_path = sweep_dir / "profile_export_aiperf.csv" + row = _read_latency_row(csv_path, concurrency) + if row is not None: + latency_by_concurrency[concurrency] = row + return latency_by_concurrency + + +def compare( + latency_by_concurrency_with_guardrails: dict[int, LatencyRow], + latency_by_concurrency_without_guardrails: dict[int, LatencyRow], +) -> list[ComparisonRow]: + """Build per-concurrency comparison rows, sorted by concurrency. + + Only levels present in both variants are compared; asymmetric levels are + logged at WARNING and excluded. 
+ """ + concurrencies_with_guardrails = set(latency_by_concurrency_with_guardrails) + concurrencies_without_guardrails = set(latency_by_concurrency_without_guardrails) + concurrencies_in_both_variants = sorted(concurrencies_with_guardrails & concurrencies_without_guardrails) + + concurrencies_in_only_one_variant = sorted(concurrencies_with_guardrails ^ concurrencies_without_guardrails) + if concurrencies_in_only_one_variant: + log.warning( + "Concurrency levels present in only one variant, excluded from comparison: %s", + concurrencies_in_only_one_variant, + ) + + return [ + ComparisonRow( + concurrency, + latency_by_concurrency_with_guardrails[concurrency], + latency_by_concurrency_without_guardrails[concurrency], + ) + for concurrency in concurrencies_in_both_variants + ] + + +def format_table(rows: list[ComparisonRow]) -> str: + """Render the comparison as a fixed-width text table.""" + if not rows: + return "No comparable sweep results found (need both variants to share concurrency levels)." + + header = ( + "conc", + "with p50", + "w/o p50", + "delta p50", + "with p90", + "w/o p90", + "delta p90", + "with avg", + "w/o avg", + "delta avg", + ) + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" + header_line = fmt.format(*header) + lines = ["Measured Latencies (ms), with and without guardrails:", header_line, "-" * len(header_line)] + for r in rows: + lines.append( + fmt.format( + r.concurrency, + f"{r.with_guardrails.p50:.0f}", + f"{r.without_guardrails.p50:.0f}", + f"{r.delta_p50:+.0f}", + f"{r.with_guardrails.p90:.0f}", + f"{r.without_guardrails.p90:.0f}", + f"{r.delta_p90:+.0f}", + f"{r.with_guardrails.avg:.0f}", + f"{r.without_guardrails.avg:.0f}", + f"{r.delta_avg:+.0f}", + ) + ) + lines.append("") + lines.append("delta = with-guardrails minus without-guardrails.") + return "\n".join(lines) + + +def format_platform_overhead_table(rows: list[ComparisonRow]) -> str: + """Render a table with mock-LLM time subtracted from p50/p90/avg. 
+ + Isolates NMP + IGW + shim + middleware overhead from the much larger + mock sleeps. The delta columns are the middleware's own cost over the + bare path. + """ + if not rows: + return "No comparable sweep results found (need both variants to share concurrency levels)." + + header = ( + "conc", + "with p50", + "w/o p50", + "delta p50", + "with p90", + "w/o p90", + "delta p90", + "with avg", + "w/o avg", + "delta avg", + ) + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9} {:>9}" + header_line = fmt.format(*header) + lines = ["Platform Overhead (ms), with and without guardrails:", header_line, "-" * len(header_line)] + + for r in rows: + with_p50 = r.with_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_p50 = r.without_guardrails.p50 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + with_p90 = r.with_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_p90 = r.without_guardrails.p90 - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + with_avg = r.with_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS + without_avg = r.without_guardrails.avg - _MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS + lines.append( + fmt.format( + r.concurrency, + f"{with_p50:+.0f}", + f"{without_p50:+.0f}", + f"{with_p50 - without_p50:+.0f}", + f"{with_p90:+.0f}", + f"{without_p90:+.0f}", + f"{with_p90 - without_p90:+.0f}", + f"{with_avg:+.0f}", + f"{without_avg:+.0f}", + f"{with_avg - without_avg:+.0f}", + ) + ) + lines.append("") + lines.append( + f"Minus mock-LLM time " + f"(with-guardrails: {_MOCK_TIME_PER_REQUEST_WITH_GUARDRAILS_MS:.0f} ms; " + f"without-guardrails: {_MOCK_TIME_PER_REQUEST_WITHOUT_GUARDRAILS_MS:.0f} ms)." + ) + return "\n".join(lines) + + +def analyze_run(run_dir: Path) -> str: + """Read both variants from one run dir and return a printable report. + + Output is the raw comparison table followed by a platform-overhead table + (mock time subtracted). 
Falls back to a single-variant table if only one + variant has results. + """ + aiperf_dir = run_dir / "aiperf_results" + latency_by_concurrency_with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) + latency_by_concurrency_without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) + + if not latency_by_concurrency_with_guardrails and not latency_by_concurrency_without_guardrails: + return f"No AIPerf results found under {aiperf_dir}" + if not latency_by_concurrency_with_guardrails or not latency_by_concurrency_without_guardrails: + if latency_by_concurrency_with_guardrails: + return _format_single_variant(VARIANT_WITH_GUARDRAILS, latency_by_concurrency_with_guardrails) + return _format_single_variant(VARIANT_WITHOUT_GUARDRAILS, latency_by_concurrency_without_guardrails) + + rows = compare(latency_by_concurrency_with_guardrails, latency_by_concurrency_without_guardrails) + return f"{format_table(rows)}\n\n{format_platform_overhead_table(rows)}" + + +def _load_comparison_rows(run_dir: Path) -> list[ComparisonRow]: + """Reload comparison rows from a run dir; returns ``[]`` if either variant is absent.""" + aiperf_dir = run_dir / "aiperf_results" + with_guardrails = load_variant_results(aiperf_dir / VARIANT_WITH_GUARDRAILS) + without_guardrails = load_variant_results(aiperf_dir / VARIANT_WITHOUT_GUARDRAILS) + if not with_guardrails or not without_guardrails: + return [] + return compare(with_guardrails, without_guardrails) + + +@dataclass(frozen=True) +class LatencyReport: + """Latency results for a single concurrency level, rendered as one row of the report. + + Each instance represents a single concurrency level from the benchmark + run: what we measured (observed_ms), what we expected from the + baseline (baseline_ms), and how much they're allowed to differ + (tolerance_ms). + The check passes when |observed_ms - baseline_ms| <= tolerance_ms. 
+ """ + + concurrency: int + metric: str + baseline_ms: float + observed_ms: float + tolerance_ms: float + + @property + def diff_ms(self) -> float: + return self.observed_ms - self.baseline_ms + + @property + def passed(self) -> bool: + return abs(self.diff_ms) <= self.tolerance_ms + + +def check_against_baseline(rows: list[ComparisonRow]) -> tuple[str, int]: + """Compare the delta_p50 for each concurrency level against the baseline latencies. + + Returns ``(report_text, failed_count)``. Concurrencies missing from + either the run or the baseline are skipped with a note. + """ + rows_by_concurrency = {r.concurrency: r for r in rows} + + latency_reports: list[LatencyReport] = [] + skipped_concurrencies: list[int] = [] + + for concurrency in sorted(CONCURRENCIES_TO_VALIDATE): + if concurrency not in rows_by_concurrency or concurrency not in DELTA_P50_BASELINE_BY_CONCURRENCY: + skipped_concurrencies.append(concurrency) + continue + latency_reports.append( + LatencyReport( + concurrency=concurrency, + metric="delta_p50", + baseline_ms=float(DELTA_P50_BASELINE_BY_CONCURRENCY[concurrency]), + observed_ms=rows_by_concurrency[concurrency].delta_p50, + tolerance_ms=float(DELTA_P50_TOLERANCE_OVERRIDES_MS.get(concurrency, DEFAULT_DELTA_P50_TOLERANCE_MS)), + ) + ) + + fmt = "{:>9} {:>4} {:>10} {:>10} {:>9} {:>11} {:>6}" + header_line = fmt.format("metric", "conc", "baseline", "observed", "diff", "tolerance", "status") + lines = [ + "Guardrails Overhead vs. 
Baseline (ms):", + header_line, + "-" * len(header_line), + ] + failed_count = 0 + for report in latency_reports: + status = "PASS" if report.passed else "FAIL" + if not report.passed: + failed_count += 1 + lines.append( + fmt.format( + report.metric, + report.concurrency, + f"{report.baseline_ms:.0f}", + f"{report.observed_ms:.0f}", + f"{report.diff_ms:+.0f}", + f"±{report.tolerance_ms:.0f}ms", + status, + ) + ) + if skipped_concurrencies: + lines.append("") + lines.append(f"Skipped (missing from results or baseline): {skipped_concurrencies}") + if failed_count: + lines.append("") + lines.append(f"FAIL: {failed_count} of {len(latency_reports)} check(s) exceeded tolerance.") + + return "\n".join(lines), failed_count + + +def _format_single_variant(variant: str, latency_by_concurrency: dict[int, LatencyRow]) -> str: + """Render one variant's table when the other variant didn't run.""" + fmt = "{:>4} {:>9} {:>9} {:>9} {:>9}" + header_line = fmt.format("conc", "avg", "p50", "p90", "std") + lines = [ + f"Only one variant present: {variant}", + header_line, + "-" * len(header_line), + ] + for concurrency in sorted(latency_by_concurrency): + row = latency_by_concurrency[concurrency] + lines.append(fmt.format(concurrency, f"{row.avg:.0f}", f"{row.p50:.0f}", f"{row.p90:.0f}", f"{row.std:.0f}")) + lines.append("") + lines.append("All values in milliseconds.") + return "\n".join(lines) + + +def _parse_concurrency_from_label(label: str) -> int | None: + """Extract N from a sweep label like ``concurrency16``; ``None`` otherwise.""" + if not label.startswith("concurrency"): + return None + try: + return int(label.removeprefix("concurrency")) + except ValueError: + return None + + +def _read_latency_row(csv_path: Path, concurrency: int) -> LatencyRow | None: + """Pull the ``Request Latency (ms)`` row from an AIPerf CSV's first block.""" + if not csv_path.is_file(): + log.debug("Missing CSV at %s; skipping", csv_path) + return None + + try: + with csv_path.open(encoding="utf-8") 
as f: + reader = csv.reader(f) + header = next(reader, None) + if not header or header[0] != "Metric": + log.warning("Unexpected header in %s: %s", csv_path, header) + return None + try: + col = {name: header.index(name) for name in ("avg", "p50", "p90", "p99", "std")} + except ValueError as exc: + log.warning("Missing expected column in %s: %s", csv_path, exc) + return None + for row in reader: + if not row: + break # end of first block + if row[0] == _LATENCY_METRIC: + return LatencyRow( + concurrency=concurrency, + avg=float(row[col["avg"]]), + p50=float(row[col["p50"]]), + p90=float(row[col["p90"]]), + p99=float(row[col["p99"]]), + std=float(row[col["std"]]), + ) + except (OSError, ValueError, IndexError) as exc: + log.warning("Failed to parse %s: %s", csv_path, exc) + return None + + log.warning("Did not find '%s' row in %s", _LATENCY_METRIC, csv_path) + return None + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="nemo-guardrails-benchmark-analyze", + description=__doc__, + ) + parser.add_argument( + "run_dir", + type=Path, + help="Path to a run directory under `plugins/nemo-guardrails/benchmarks/artifacts/runs//`.", + ) + parser.add_argument( + "--strict", + action="store_true", + help="Exit non-zero when any baseline check exceeds tolerance. 
CI sets this; local runs default off so you can iterate without the gate failing.", + ) + parser.add_argument( + "--log-level", + default="INFO", + choices=("DEBUG", "INFO", "WARNING", "ERROR"), + ) + args = parser.parse_args(argv) + + logging.basicConfig(level=args.log_level, format="%(levelname)s %(message)s") + + run_dir: Path = args.run_dir.resolve() + if not run_dir.is_dir(): + print(f"Not a directory: {run_dir}", file=sys.stderr) + return 2 + + print(analyze_run(run_dir)) + + rows = _load_comparison_rows(run_dir) + if not rows: + print("Skipping baseline check: no comparable rows from this run.", file=sys.stderr) + return 0 if not args.strict else 2 + + report, failed_count = check_against_baseline(rows) + print() + print(report) + return 1 if (args.strict and failed_count) else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py index cbe4b19960..14db3561f4 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/constants.py @@ -8,6 +8,17 @@ WORKSPACE = "benchmark" GUARDRAIL_CONFIG = "content-safety-local" VM_NAME = "guardrails-vm" +# Control VirtualModel with no middleware attached. Used by the benchmark +# harness to measure NMP+IGW latency *without* the guardrails middleware so +# the with-vs-without delta isolates middleware overhead. +NO_GUARDRAILS_VM_NAME = "no-guardrails-vm" + +# Logical identifiers for the two benchmark variants. Used as subdirectory +# names under `aiperf_results/` and `logs/`, and as the value of the +# harness's `--variant` flag. 
+VARIANT_WITH_GUARDRAILS = "with-guardrails" +VARIANT_WITHOUT_GUARDRAILS = "without-guardrails" +ALL_VARIANTS = (VARIANT_WITH_GUARDRAILS, VARIANT_WITHOUT_GUARDRAILS) # ModelProvider that proxies requests to the mock main model APP_PROVIDER = "benchmark-app-llm" diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py index b1113fa2c2..00af57580a 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/paths.py @@ -54,6 +54,10 @@ class RunPaths: nmp_data_dir: Path # Checked-in YAML template for the AIPerf sweep config. config_template: Path + # In-repo mock-LLM `.env` files. Versioned with the benchmark so mock + # behavior is independent of the NeMo-Guardrails checkout. + mock_app_env: Path + mock_content_safety_env: Path # Per-run materialized copy of `config_template` with `output_base_dir` # overridden to `aiperf_output_dir`. AIPerf is invoked against this file. 
runtime_config: Path @@ -75,6 +79,18 @@ def ensure_directories(self) -> None: ): path.mkdir(parents=True, exist_ok=True) + def aiperf_output_dir_for(self, variant: str) -> Path: + """Per-variant AIPerf output dir; keeps side-by-side sweeps from colliding.""" + return self.aiperf_output_dir / variant + + def runtime_config_for(self, variant: str) -> Path: + """Per-variant materialized AIPerf config under ``generated/``.""" + return self.generated_dir / f"aiperf_config_{variant}.yaml" + + def aiperf_log_for(self, variant: str) -> Path: + """Per-variant AIPerf stdout/stderr log under ``logs/``.""" + return self.log_dir / f"aiperf_{variant}.log" + def _now_run_id() -> str: return dt.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -120,4 +136,6 @@ def build_run_paths( config_template=benchmark_dir / "configs" / "nmp_igw_guardrails_sweep_concurrency.yaml", runtime_config=run_dir / "generated" / "nmp_igw_guardrails_sweep_concurrency.yaml", aiperf_venv_dir=artifacts_dir / "venvs" / "aiperf", + mock_app_env=benchmark_dir / "configs" / "mock_llm" / "app-llm.env", + mock_content_safety_env=benchmark_dir / "configs" / "mock_llm" / "content-safety-llm.env", ) diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py index 47114e0195..2fc8e7421e 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/run.py @@ -25,21 +25,27 @@ import sys import time from contextlib import ExitStack +from dataclasses import dataclass from pathlib import Path from nemo_guardrails_plugin.benchmarks.aiperf_runner import ( + SweepRunResult, collect_sweep_results, prepare_runtime_aiperf_config, run_aiperf_sweep, ) +from nemo_guardrails_plugin.benchmarks.analyze import analyze_run from nemo_guardrails_plugin.benchmarks.bootstrap import ensure_aiperf_venv from nemo_guardrails_plugin.benchmarks.constants import ( 
AIPERF_SHIM_BASE_URL, + ALL_VARIANTS, APP_PROVIDER_URL, CS_PROVIDER_URL, IGW_CHAT_PATH, NMP_BASE_URL, NMP_HEALTH_PATH, + VARIANT_WITH_GUARDRAILS, + VARIANT_WITHOUT_GUARDRAILS, WORKSPACE, ) from nemo_guardrails_plugin.benchmarks.paths import ( @@ -66,8 +72,6 @@ Path("benchmark/aiperf/__main__.py"), Path("benchmark/aiperf/run_aiperf.py"), Path("benchmark/mock_llm_server/run_server.py"), - Path("benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env"), - Path("benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env"), Path("examples/configs/content_safety_local/config.yml"), Path("examples/configs/content_safety_local/prompts.yml"), ) @@ -91,6 +95,18 @@ def _validate_nemoguardrails_repo(nemoguardrails_repo_root: Path) -> None: ) +def _validate_in_repo_mock_configs(paths: RunPaths) -> None: + """Fail fast if the in-repo mock LLM env files are missing. + + These live in this repo (not upstream) so we control mock behavior + independently of the NeMo-Guardrails checkout. + """ + missing = [p for p in (paths.mock_app_env, paths.mock_content_safety_env) if not p.is_file()] + if missing: + bullet = "\n - ".join(str(p) for p in missing) + raise FileNotFoundError(f"In-repo mock LLM config files missing:\n - {bullet}") + + def _build_mock_nim_processes(paths: RunPaths, workers: int) -> list[SupervisedProcess]: """Spawn ``python -m benchmark.mock_llm_server.run_server`` for both mocks. @@ -122,20 +138,18 @@ def spec(name: str, port: int, env_file: Path, *, health_url: str) -> Supervised health_timeout_seconds=_MOCK_HEALTH_TIMEOUT_SECONDS, ) + # Env files come from this repo, rather than the upstream library. 
return [ - # Main LLM mock server spec( "mock-app-llm", 8000, - paths.nemoguardrails_repo_root / "benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env", + paths.mock_app_env, health_url=f"{APP_PROVIDER_URL}/health", ), - # Content-safety LLM mock server spec( "mock-content-safety-llm", 8001, - paths.nemoguardrails_repo_root - / "benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env", + paths.mock_content_safety_env, health_url=f"{CS_PROVIDER_URL}/health", ), ] @@ -205,6 +219,109 @@ def _smoke_test(client: NeMoPlatform, seeded: SeededResources) -> None: raise RuntimeError(f"Smoke test failed after 60 attempts: {last_error}") +@dataclass(frozen=True) +class BenchmarkOutcome: + """Outcome of a single benchmark variant's AIPerf sweep, used by the summary.""" + + variant: str + aiperf_exit: int + output_dir: Path + sweep_results: list[SweepRunResult] + + @property + def failures(self) -> int: + return sum(1 for r in self.sweep_results if not r.passed) + + @property + def passed(self) -> bool: + return self.aiperf_exit == 0 and bool(self.sweep_results) and self.failures == 0 + + +def _vm_ref_for_variant(variant: str, seeded: SeededResources) -> str: + """Pick which seeded VirtualModel a benchmark variant should target.""" + if variant == VARIANT_WITH_GUARDRAILS: + return seeded.vm_ref + if variant == VARIANT_WITHOUT_GUARDRAILS: + return seeded.no_guardrails_vm_ref + raise ValueError(f"Unknown variant: {variant!r}") + + +def _run_benchmark( + *, + variant: str, + paths: RunPaths, + seeded: SeededResources, + aiperf_python: Path, +) -> BenchmarkOutcome: + """Materialize a per-variant AIPerf config, run the sweep, collect results.""" + vm_ref = _vm_ref_for_variant(variant, seeded) + runtime_config = paths.runtime_config_for(variant) + aiperf_output_dir = paths.aiperf_output_dir_for(variant) + aiperf_log = paths.aiperf_log_for(variant) + + sweep_config = prepare_runtime_aiperf_config( + template_path=paths.config_template, + 
runtime_config_path=runtime_config, + aiperf_output_dir=aiperf_output_dir, + model_ref=vm_ref, + ) + log.info( + "Benchmark %s: targeting %s; concurrency=%s, duration=%ss", + variant, + vm_ref, + sweep_config.get("sweeps", {}).get("concurrency"), + sweep_config.get("base_config", {}).get("benchmark_duration"), + ) + log.info( + "Starting AIPerf sweep [%s] against %s -> shim -> %s%s", + variant, + AIPERF_SHIM_BASE_URL, + NMP_BASE_URL, + IGW_CHAT_PATH, + ) + + aiperf_exit = run_aiperf_sweep( + nemoguardrails_repo_root=paths.nemoguardrails_repo_root, + runtime_config=runtime_config, + log_path=aiperf_log, + python_executable=str(aiperf_python), + venv_bin_path=paths.aiperf_venv_dir / "bin", + ) + + sweep_results = collect_sweep_results(aiperf_output_dir) + return BenchmarkOutcome( + variant=variant, + aiperf_exit=aiperf_exit, + output_dir=aiperf_output_dir, + sweep_results=sweep_results, + ) + + +def _summarize_benchmark_results(outcomes: list[BenchmarkOutcome]) -> int: + """Log per-benchmark + overall summary; return process exit code.""" + overall_failed = False + for outcome in outcomes: + if not outcome.sweep_results: + log.error( + "Benchmark %s: aiperf exited with code %d and produced no per-sweep results in %s", + outcome.variant, + outcome.aiperf_exit, + outcome.output_dir, + ) + else: + log.info( + "Benchmark %s: %d run(s), %d failure(s); per-sweep outputs under %s", + outcome.variant, + len(outcome.sweep_results), + outcome.failures, + outcome.output_dir, + ) + if not outcome.passed: + overall_failed = True + + return 1 if overall_failed else 0 + + def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser( prog="nemo-guardrails-benchmark", @@ -244,10 +361,28 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=None, help="Override the per-run directory name (default: current timestamp).", ) + parser.add_argument( + "--variant", + choices=(*ALL_VARIANTS, "all"), + default="all", + help=( + 
"Which sweep to run. 'all' (default) runs both variants sequentially " + "against the same NMP; the with-vs-without delta isolates middleware " + "overhead. In CI, run the two variants as parallel jobs against " + "separate NMP instances." + ), + ) parser.add_argument("--verbose", "-v", action="store_true") return parser.parse_args(argv) +def _resolve_variants(variant_arg: str) -> tuple[str, ...]: + """Translate the ``--variant`` CLI argument into the ordered list to run.""" + if variant_arg == "all": + return ALL_VARIANTS + return (variant_arg,) + + def main(argv: list[str] | None = None) -> int: args = parse_args(argv) _configure_logging(args.verbose) @@ -266,20 +401,18 @@ def main(argv: list[str] | None = None) -> int: run_id=args.run_id, ) paths.ensure_directories() + _validate_in_repo_mock_configs(paths) log.info("Created directory for benchmark results at: %s", paths.run_dir) - - sweep_config = prepare_runtime_aiperf_config( - template_path=paths.config_template, - runtime_config_path=paths.runtime_config, - aiperf_output_dir=paths.aiperf_output_dir, - ) log.info( - "AIPerf sweep: concurrency=%s, duration=%ss", - sweep_config.get("sweeps", {}).get("concurrency"), - sweep_config.get("base_config", {}).get("benchmark_duration"), + "Mock LLM configs: app=%s, content-safety=%s", + paths.mock_app_env, + paths.mock_content_safety_env, ) + variants = _resolve_variants(args.variant) + log.info("Will run %d variant(s): %s", len(variants), ", ".join(variants)) + # Ensure the dedicated aiperf venv exists *before* we start any supervised # processes. 
aiperf_python = ensure_aiperf_venv(paths.aiperf_venv_dir) @@ -318,43 +451,38 @@ def main(argv: list[str] | None = None) -> int: log.info("Waiting for VirtualModel %s to be ready...", seeded.vm_ref) _smoke_test(client, seeded) - log.info( - "Starting AIPerf sweep against %s -> shim -> %s%s", - AIPERF_SHIM_BASE_URL, - NMP_BASE_URL, - IGW_CHAT_PATH, - ) - aiperf_exit = run_aiperf_sweep( - nemoguardrails_repo_root=paths.nemoguardrails_repo_root, - runtime_config=paths.runtime_config, - log_path=paths.log_dir / "aiperf.log", - python_executable=str(aiperf_python), - venv_bin_path=paths.aiperf_venv_dir / "bin", - ) + # Variants run sequentially against the same NMP; only the targeted + # VirtualModel differs, so the delta isolates middleware overhead. + outcomes: list[BenchmarkOutcome] = [] + for variant in variants: + outcomes.append( + _run_benchmark( + variant=variant, + paths=paths, + seeded=seeded, + aiperf_python=aiperf_python, + ) + ) - sweep_results = collect_sweep_results(paths.aiperf_output_dir) - failures = sum(1 for r in sweep_results if not r.passed) + exit_code = _summarize_benchmark_results(outcomes) + _maybe_print_analysis(paths.run_dir, outcomes) + return exit_code - if not sweep_results: - # AIPerf exited before producing any per-sweep dirs. Surface that - # explicitly so the log isn't ambiguous about why we're failing. - log.error( - "aiperf exited with code %d and produced no per-sweep results in %s", - aiperf_exit, - paths.aiperf_output_dir, - ) - else: - log.info( - "Sweep summary: %d run(s), %d failure(s); per-sweep outputs under %s", - len(sweep_results), - failures, - paths.aiperf_output_dir, - ) - if failures or aiperf_exit != 0 or not sweep_results: - return 1 +def _maybe_print_analysis(run_dir: Path, outcomes: list[BenchmarkOutcome]) -> None: + """Print the analyzer's comparison table when at least one variant has results. 
- return 0 + Wrapped in a broad try/except: the analyzer is post-processing only and + must not change the harness's exit code or hide a real benchmark failure. + """ + if not any(o.sweep_results for o in outcomes): + return + try: + report = analyze_run(run_dir) + except Exception as exc: + log.warning("Analyzer failed; skipping summary table: %s", exc) + return + log.info("Benchmark analysis:\n%s", report) if __name__ == "__main__": diff --git a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py index 96cd152f0f..cde4f31b3f 100644 --- a/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py +++ b/plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/seeding.py @@ -28,6 +28,7 @@ GUARDRAIL_CONFIG, GUARDRAILS_MIDDLEWARE_CONFIG_TYPE, GUARDRAILS_MIDDLEWARE_NAME, + NO_GUARDRAILS_VM_NAME, VM_NAME, WORKSPACE, ) @@ -52,11 +53,17 @@ class SeededResources: cs_model_entity: str guardrail_config_name: str vm_name: str + # Control VM with no middleware; otherwise identical to the guardrails VM. + no_guardrails_vm_name: str @property def vm_ref(self) -> str: return f"{self.workspace}/{self.vm_name}" + @property + def no_guardrails_vm_ref(self) -> str: + return f"{self.workspace}/{self.no_guardrails_vm_name}" + @property def guardrail_config_ref(self) -> str: return f"{self.workspace}/{self.guardrail_config_name}" @@ -171,6 +178,20 @@ def seed_benchmark( ) _dump_model(generated_dir / "virtual_model.json", vm) + # Control VM: identical to the guardrails VM but no middleware, so the + # with-vs-without delta isolates middleware overhead. 
+ log.info("Creating control VirtualModel %s/%s", WORKSPACE, NO_GUARDRAILS_VM_NAME) + no_guardrails_vm = client.inference.virtual_models.create( + workspace=WORKSPACE, + name=NO_GUARDRAILS_VM_NAME, + default_model_entity=app_entity, + models=vm_models, + request_middleware=[], + response_middleware=[], + exist_ok=True, + ) + _dump_model(generated_dir / "virtual_model_no_guardrails.json", no_guardrails_vm) + return SeededResources( workspace=WORKSPACE, app_provider_name=APP_PROVIDER, @@ -179,6 +200,7 @@ def seed_benchmark( cs_model_entity=cs_entity, guardrail_config_name=GUARDRAIL_CONFIG, vm_name=VM_NAME, + no_guardrails_vm_name=NO_GUARDRAILS_VM_NAME, ) diff --git a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py index f0ad57aeaa..8342315e13 100644 --- a/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py +++ b/plugins/nemo-guardrails/tests/unit/benchmarks/test_seeding.py @@ -14,6 +14,7 @@ CS_MODEL_NAME, CS_PROVIDER, GUARDRAIL_CONFIG, + NO_GUARDRAILS_VM_NAME, VM_NAME, WORKSPACE, ) @@ -145,12 +146,18 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path cs_entity = seeded.cs_model_entity assert gc_call.kwargs["data"]["models"][0]["model"] == cs_entity - # VirtualModel uses the discovered app entity and points middleware at the - # guardrail config we just created. - vm_call = fake_client.inference.virtual_models.create.call_args - assert vm_call.kwargs["name"] == VM_NAME - assert vm_call.kwargs["default_model_entity"] == seeded.app_model_entity - assert vm_call.kwargs["models"] == [{"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"}] + # Two VirtualModels are created: the guardrails VM (with middleware) and + # a control VM (no middleware) used by the without-guardrails benchmark + # variant. 
+ vm_calls = fake_client.inference.virtual_models.create.call_args_list + assert len(vm_calls) == 2 + + guardrails_vm_call = vm_calls[0] + assert guardrails_vm_call.kwargs["name"] == VM_NAME + assert guardrails_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity + assert guardrails_vm_call.kwargs["models"] == [ + {"model": seeded.app_model_entity, "backend_format": "OPENAI_CHAT"} + ] expected_middleware = [ { "name": "nemo-guardrails", @@ -158,8 +165,14 @@ def test_calls_sdk_with_expected_payloads(self, fake_client: MagicMock, tmp_path "config_id": f"{WORKSPACE}/{GUARDRAIL_CONFIG}", } ] - assert vm_call.kwargs["request_middleware"] == expected_middleware - assert vm_call.kwargs["response_middleware"] == expected_middleware + assert guardrails_vm_call.kwargs["request_middleware"] == expected_middleware + assert guardrails_vm_call.kwargs["response_middleware"] == expected_middleware + + control_vm_call = vm_calls[1] + assert control_vm_call.kwargs["name"] == NO_GUARDRAILS_VM_NAME + assert control_vm_call.kwargs["default_model_entity"] == seeded.app_model_entity + assert control_vm_call.kwargs["request_middleware"] == [] + assert control_vm_call.kwargs["response_middleware"] == [] def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path: Path) -> None: ng_root = tmp_path / "NeMo-Guardrails" @@ -176,6 +189,7 @@ def test_generated_dir_contains_artifacts(self, fake_client: MagicMock, tmp_path assert (generated_dir / "app_provider.json").is_file() assert (generated_dir / "content_safety_provider.json").is_file() assert (generated_dir / "virtual_model.json").is_file() + assert (generated_dir / "virtual_model_no_guardrails.json").is_file() request_payload = json.loads( (generated_dir / "content_safety_local_nmp_request.json").read_text(encoding="utf-8") @@ -197,6 +211,7 @@ def test_returns_seeded_resources(self, fake_client: MagicMock, tmp_path: Path) assert seeded.workspace == WORKSPACE assert seeded.vm_ref == f"{WORKSPACE}/{VM_NAME}" 
+ assert seeded.no_guardrails_vm_name == NO_GUARDRAILS_VM_NAME assert seeded.guardrail_config_ref == f"{WORKSPACE}/{GUARDRAIL_CONFIG}" def test_raises_if_served_models_never_populated(self, tmp_path: Path) -> None: