Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/actions/changes/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ outputs:
k8s-smoke:
description: "'true' if Kubernetes smoke test support files changed"
value: ${{ steps.filter.outputs.k8s-smoke }}
guardrails-benchmark:
description: "'true' if the nemo-guardrails plugin or guardrails service changed"
value: ${{ steps.filter.outputs.guardrails-benchmark }}
cpu-smoke:
description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed"
value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }}
Expand Down Expand Up @@ -97,3 +100,6 @@ runs:
- 'e2e/k8s/values/**'
- 'e2e/test_jobs.py'
- '.github/actions/free-disk-space/action.yaml'
guardrails-benchmark:
- 'plugins/nemo-guardrails/**'
- 'services/guardrails/**'
83 changes: 77 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ jobs:
docker: ${{ steps.changes.outputs.docker }}
helm: ${{ steps.changes.outputs.helm }}
cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }}
guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }}
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
- uses: ./.github/actions/changes
Expand Down Expand Up @@ -1071,21 +1072,36 @@ jobs:
retention-days: 7
path: web/packages/studio/playwright-report/

benchmark-guardrails:
name: Guardrails plugin benchmark
if: github.event_name == 'workflow_dispatch'
guardrails-benchmark:
# Parallel matrix jobs (one NMP per variant) so the two sweeps don't
# share mocks or contend on :8080. `guardrails-benchmark-analyze` merges
# the artifacts and prints the comparison.
name: nemo-guardrails plugin benchmark (${{ matrix.variant }})
needs: [changes]
if: >
!cancelled() && (
github.event_name == 'workflow_dispatch' ||
needs.changes.outputs.guardrails-benchmark == 'true'
)
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
# Keep the partial artifact if one variant fails.
fail-fast: false
matrix:
variant: [with-guardrails, without-guardrails]
steps:
- name: Checkout nemo-platform
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
with:
path: nemo-platform
persist-credentials: false
- name: Checkout NeMo-Guardrails
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
with:
repository: NVIDIA/NeMo-Guardrails
path: NeMo-Guardrails
persist-credentials: false
- name: Install uv
uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
with:
Expand All @@ -1102,15 +1118,71 @@ jobs:
PYTORCH_DEPS: cpu
- name: Run benchmark sweep
working-directory: nemo-platform
run: make benchmark-guardrails
# Pin both variants to the same `--run-id` so when the analyze job
# downloads both artifacts into one `runs/` parent, they merge into
# a single run directory the analyzer can read normally.
run: |
make benchmark-guardrails BENCHMARK_ARGS="\
--variant ${{ matrix.variant }} \
--run-id ci-${{ github.run_id }}-${{ github.run_attempt }}"
env:
NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails
_TYPER_FORCE_DISABLE_TERMINAL: "1"
- name: Upload benchmark artifacts
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: benchmark-guardrails-results
# Ensure we use a unique artifact name per benchmark variant.
name: guardrails-benchmark-results-${{ matrix.variant }}
retention-days: 30
path: |
nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/

guardrails-benchmark-analyze:
# Merge both variant artifacts and print the comparison table.
name: nemo-guardrails plugin benchmark analysis
needs: [changes, guardrails-benchmark]
if: >
!cancelled() && (
github.event_name == 'workflow_dispatch' ||
needs.changes.outputs.guardrails-benchmark == 'true'
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- name: Checkout nemo-platform
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
with:
path: nemo-platform
Comment thread
coderabbitai[bot] marked this conversation as resolved.
persist-credentials: false
- name: Download with-guardrails artifact
# If a variant failed entirely it may have uploaded no artifact;
# the analyzer handles the single-variant case so don't fail here.
continue-on-error: true
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: guardrails-benchmark-results-with-guardrails
path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
- name: Download without-guardrails artifact
continue-on-error: true
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: guardrails-benchmark-results-without-guardrails
path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
- name: Print benchmark comparison
working-directory: nemo-platform
# `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap
# step and run it with the runner's `python3` CLI directly.
run: |
RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
echo "Analyzing run directory: $RUN_DIR"
python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict
- name: Upload merged benchmark artifacts
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
# Single artifact so baseline collection is one download per run.
name: guardrails-benchmark-results-merged
retention-days: 30
path: |
nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
Expand Down Expand Up @@ -1261,7 +1333,6 @@ jobs:
- web-sdk-gen
- web-studio-deps
- web-studio-e2e
- benchmark-guardrails
- opa-policy-test
if: always()
runs-on: ubuntu-latest
Expand Down
79 changes: 71 additions & 8 deletions plugins/nemo-guardrails/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ benchmark modules with `PYTHONPATH` pointed at that checkout.
plugins/nemo-guardrails/benchmarks/
configs/
nmp_igw_guardrails_sweep_concurrency.yaml # AIPerf sweep template
mock_llm/ # in-repo mock LLM env files
artifacts/ # per-run outputs (gitignored)
plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/
run.py # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run`
analyze.py # post-run analysis; checks latencies against baseline values
paths.py # filesystem layout
constants.py # workspace / VM / provider names
processes.py # subprocess supervision (process groups + ExitStack)
Expand Down Expand Up @@ -181,15 +183,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/

## CI

A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both
this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and
`make benchmark-guardrails`, and uploads the per-run artifacts directory
(`logs/`, `generated/`, `aiperf_results/`) on success or failure.
Two jobs in `.github/workflows/ci.yaml`:

Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf`
itself exits non-zero or any sweep returns a non-zero exit code. No latency
thresholds are enforced — those can be layered on later by a separate
analyzer that reads the per-sweep CSVs.
- `guardrails-benchmark` — matrix of two parallel jobs, one per variant
(`with-guardrails`, `without-guardrails`), each on its own NMP instance.
Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`).
- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both
artifacts, prints a side-by-side comparison via
`nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check
(see below). Fails the build on a latency regression beyond tolerance. The
analyzer is stdlib-only by design, so this job runs on the runner's stock
`python3` without bootstrapping the uv workspace.

### Baseline and gating

CI compares the run's delta_p50 (with-guardrails minus without-guardrails
p50, in ms) against a checked-in baseline. The baseline lives as
module-level constants in:

```text
plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
```

Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50
isolates the middleware's contribution — shared CI runner noise cancels
across the two variants.

#### Baseline constants

- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on.
Other levels still appear in the analyzer's output tables, but pass/fail
is decided only by these.
- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied
to every validated concurrency. A check fails when
`|observed - baseline| > tolerance`.
- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency
tolerance overrides (in ms). Levels without an override fall back to the
default.
- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50
(in ms) per concurrency level. Edit by hand when a real change shifts
the numbers.

Worked example: at c=16 the override is 200 ms, so a run with observed
delta_p50 = 1589 (diff +199 from baseline 1390) passes; observed
delta_p50 = 1591 (diff +201) fails.

Notes on the current values:

- c=16 and c=32 use wider tolerances than the default because their
absolute delta_p50 is larger. Over time, we can tighten these values
if latencies in CI produce less variance.
- Any change to mock-LLM latencies, the guardrails config, or the runner
class invalidates the current baseline values. The benchmark should be
re-run in CI several times to establish updated baseline values.

#### Running the analyzer locally

```bash
python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \
plugins/nemo-guardrails/benchmarks/artifacts/runs/<run-id>
```

Local runs print both tables and the baseline-check table.
CI passes `--strict` to make any out-of-tolerance check fail the job.

#### Updating the baseline

When a real change shifts the numbers (e.g., a deliberate middleware change,
a mock-LLM config change, or a runner-class change), edit the constants at
the top of `analyze.py` by hand and reference the PR / CI run that
justifies it in the commit.

## Cleanup

Expand Down
15 changes: 15 additions & 0 deletions plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Mock LLM configurations

These `.env` files configure the behavior of the mock LLMs, used by the upstream
`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`.

The upstream library ships these files, but we keep our own copies so that:

- We can change mock latency without touching the upstream repo.
- The exact mock behavior we benchmarked against is versioned alongside the
results, so historical numbers stay reproducible even if upstream changes
its defaults.

Mapping to upstream files:
- `app-llm.env` ← upstream `meta-llama-3.3-70b-instruct.env`
- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
19 changes: 19 additions & 0 deletions plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Settings for the mock application LLM.
# Each latency group below sets MIN, MAX and MEAN to the same value with
# STD 0.0; all latency values are in seconds (per the key names).
MODEL="meta/llama-3.3-70b-instruct"
# NOTE(review): 0.0 presumably means the mock never answers with UNSAFE_TEXT;
# confirm against the mock server's sampling logic.
UNSAFE_PROBABILITY=0.0
UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
# End-to-end latency
E2E_LATENCY_MIN_SECONDS=4.0
E2E_LATENCY_MAX_SECONDS=4.0
E2E_LATENCY_MEAN_SECONDS=4.0
E2E_LATENCY_STD_SECONDS=0.0
# Streaming latency: Time to First Token (TTFT)
TTFT_MIN_SECONDS=0.3
TTFT_MAX_SECONDS=0.3
TTFT_MEAN_SECONDS=0.3
TTFT_STD_SECONDS=0.0
# Streaming latency: Chunk Latency (ITL)
CHUNK_LATENCY_MIN_SECONDS=0.015
CHUNK_LATENCY_MAX_SECONDS=0.015
CHUNK_LATENCY_MEAN_SECONDS=0.015
CHUNK_LATENCY_STD_SECONDS=0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Settings for the mock content-safety LLM.
# Each latency group below sets MIN, MAX and MEAN to the same value with
# STD 0.0; all latency values are in seconds (per the key names).
MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
# NOTE(review): 0.0 presumably means the mock never answers with UNSAFE_TEXT;
# confirm against the mock server's sampling logic.
UNSAFE_PROBABILITY=0.0
# Both canned responses are JSON documents encoded as escaped strings.
UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
# End-to-end latency
E2E_LATENCY_MIN_SECONDS=0.5
E2E_LATENCY_MAX_SECONDS=0.5
E2E_LATENCY_MEAN_SECONDS=0.5
E2E_LATENCY_STD_SECONDS=0.0
# Streaming latency: Time to First Token (TTFT)
TTFT_MIN_SECONDS=0.2
TTFT_MAX_SECONDS=0.2
TTFT_MEAN_SECONDS=0.2
TTFT_STD_SECONDS=0.0
# Streaming latency: Chunk Latency (ITL)
CHUNK_LATENCY_MIN_SECONDS=0.015
CHUNK_LATENCY_MAX_SECONDS=0.015
CHUNK_LATENCY_MEAN_SECONDS=0.015
CHUNK_LATENCY_STD_SECONDS=0.0
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,15 @@ def prepare_runtime_aiperf_config(
template_path: Path,
runtime_config_path: Path,
aiperf_output_dir: Path,
model_ref: str | None = None,
) -> dict[str, Any]:
"""Materialize the AIPerf config this run will use.

Reads the checked-in ``template_path`` config, overrides its
``output_base_dir`` to point inside the current run's directory, and writes
the result to ``runtime_config_path``. AIPerf is later invoked with
``--config-file <runtime_config_path>`` so every artifact lands under a
separate per-run directory.

Returns the parsed config dict so callers can log fields (sweep params,
benchmark_duration) without re-reading the file.
Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf
artifacts nest under this run) and optionally ``base_config.model``
(so one template can target multiple VirtualModels), and writes the
result to ``runtime_config_path``. Returns the parsed config so
callers can log sweep params without re-reading the file.
"""
if not template_path.is_file():
raise FileNotFoundError(f"AIPerf template not found: {template_path}")
Expand All @@ -82,6 +80,11 @@ def prepare_runtime_aiperf_config(
# Point AIPerf's output_base_dir at this run's directory so its results
# nest under our per-run artifacts tree.
config["output_base_dir"] = str(aiperf_output_dir)
if model_ref is not None:
base_config = config.get("base_config")
if not isinstance(base_config, dict):
raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}")
base_config["model"] = model_ref
runtime_config_path.parent.mkdir(parents=True, exist_ok=True)
runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")

Expand Down
Loading
Loading