From cf890fd5ab7b8a8025e1167f2578aef12a93cca9 Mon Sep 17 00:00:00 2001 From: Sam Oluwalana Date: Tue, 16 Jun 2026 15:24:31 -0600 Subject: [PATCH 1/3] feat(customizer): expand skill to help evaluate lift on adapters trained Signed-off-by: Sam Oluwalana --- .../skills/nemo-customizer/SKILL.md | 81 +- .../references/dataset-formats.md | 12 + .../references/eval_helpers.py | 737 ++++++++++++++++++ .../references/post-training-eval.md | 209 +++++ .../tests/test_eval_helpers.py | 221 ++++++ 5 files changed, 1254 insertions(+), 6 deletions(-) create mode 100644 plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py create mode 100644 plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/post-training-eval.md create mode 100644 plugins/nemo-customizer/tests/test_eval_helpers.py diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md index b907063734..895ba73b17 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md @@ -141,6 +141,8 @@ Training never runs inside the `nemo` CLI process. After `submit`, the platform' - For submit/image/plugin errors (both backends), read `references/troubleshooting.md`. Unsloth needs the `nmp-unsloth-training` container image on the **platform host's** Docker daemon (see `docker/unsloth/README.md`). - **Missing training image on a remote platform** — if the user gave a non-localhost `NEMO_BASE_URL` / `NMP_BASE_URL` (e.g. `10.0.0.51:8080`) and the job errors with `Failed to pull image`, `manifest unknown`, or missing `nmp-unsloth-training` / automodel training image: **do not** run `docker build`, `docker pull`, or `docker buildx bake` on the agent machine. 
Report with **Report to user** (use **Output adapter fileset (planned):** on error), then append on-target build steps from `references/troubleshooting.md` § **Missing training images**. - **Gated HuggingFace models** (Llama, Gemma, …) — confirm `hf-token` + fileset `token_secret` before submit; download fails with `Failed to access upstream storage` / 502 when missing. See **HuggingFace token (gated models)** and `references/troubleshooting.md` § **Gated HuggingFace models**. +- **Post-training eval format** — use the same CHAT `messages` JSONL as training. **Do not** flatten rows to `prompt`/`expected` for the evaluator. Send `messages[:-1]` at inference (exclude final assistant label); score against `messages[-1].content`. See `references/post-training-eval.md` and `references/eval_helpers.py`. +- **LoRA adapters load automatically for eval** — when a job completes, the adapter is registered on the model entity and hot-reloaded on any **READY** deployment with `lora_enabled: true`. **Do not** update deployments or providers before eval. **Do** route LoRA eval through the **provider** gateway (`/provider/{provider}/-/v1` with `model: default--{adapter}`); the model-entity path (`/model/{entity}/-/v1`) always hits the base model. See `references/post-training-eval.md` § **Request routing (base vs LoRA)**. 
## Workflow @@ -162,12 +164,14 @@ Common steps then **branch by plugin pick**: - [ ] nemo customization automodel submit /tmp/job.json --workspace default - [ ] Poll until top-level terminal (`poll_customization_job.sh`; default 15s interval, or 30–60s manual polls) - [ ] Report using output template below +- [ ] Optional: compare base vs adapter on validation — `references/eval_helpers.py …` (CHAT format; adapters hot-reload automatically; see `references/post-training-eval.md`) # unsloth branch (submit → Docker GPU job) - [ ] Write /tmp/job.json using the UnslothJobInput shape (see Fast path — unsloth) - [ ] nemo customization unsloth submit /tmp/job.json --workspace default [--profile ] - [ ] Poll until top-level terminal (`poll_customization_job.sh unsloth-`; default 15s interval) - [ ] Report using output template below +- [ ] Optional: compare base vs adapter on validation — `references/eval_helpers.py …` (CHAT format; adapters hot-reload automatically; see `references/post-training-eval.md`) ``` ## Fast path — automodel @@ -513,7 +517,7 @@ After polling reaches a **terminal** status (`completed`, `error`, or `cancelled | Status | Notes | |--------|-------| -| `completed` | Brief success summary (e.g. adapter registered on model entity). When `metrics.train_loss` has ≥2 entries, add a loss-drop sentence: *Loss dropped from \ at step 1 to \ at step \; validation loss was \.* | +| `completed` | Brief success summary (e.g. adapter registered on model entity). When `metrics.train_loss` has ≥2 entries, add a loss-drop sentence: *Loss dropped from \ at step 1 to \ at step \; validation loss was \.* Always append **Using the adapter** with discovered provider name and concrete gateway URLs (see below). | | `error` | Quote `error_details.message` or the failing step; note setup that succeeded before the failure (auth, dataset upload, submit). | | `cancelled` | Cancellation reason if available. 
| @@ -580,21 +584,85 @@ After polling reaches a **terminal** status (`completed`, `error`, or `cancelled | Output save method | lora | ``` -**Using the adapter (`completed` only)** — after **Training configuration**, run `nemo models get --workspace default` (parse stdout only) to confirm the adapter is listed under `adapters`. Append this section: +**Using the adapter (`completed` only)** — after **Training configuration**, run these discovery commands (parse stdout only; do not pipe `2>&1` into JSON parsers): + +1. `nemo models get --workspace default` — confirm `` appears under `adapters` with `enabled: true`. +2. `nemo inference providers list --workspace default -f json` — pick a **READY** provider whose `served_models` includes `default/` (base or LoRA composite). Record its `name` as `` (often matches the deployment name). + +On a deployment with `lora_enabled: true`, the adapter is **hot-reloaded automatically** — no deployment update or provider reconfiguration is required before inference or post-training eval. Append this section with **concrete URLs and provider name** from discovery: ```markdown ### Using the adapter -The adapter `` is attached to `default/`. List adapters with: +The adapter `` is registered on `default/`. Weights are hot-reloaded on LoRA-enabled deployments — no deployment or provider update is required after training. + +#### Request routing (base vs LoRA) + +| Target | Gateway path | OpenAI base URL | Request `"model"` field | +|--------|--------------|-----------------|-------------------------| +| **Base** weights | model-entity | `/apis/inference-gateway/v2/workspaces/default/model//-/v1` | `default/` | +| **LoRA adapter** | **provider** | `/apis/inference-gateway/v2/workspaces/default/provider//-/v1` | `default--` | + +**Common mistake:** posting to the model-entity URL with `"model": "default--"` still runs the **base** model. Base-vs-adapter eval will look identical until LoRA requests use the **provider** URL above. 
See `references/post-training-eval.md` § **Request routing (base vs LoRA)**. + +#### Chat inference (CHAT-trained models) + +Match training context at inference — send **`messages[:-1]`** (all turns except the final assistant label). Single-turn rows are just the user message; multi-turn rows keep prior user/assistant history. + +| Setting | Value | Why | +|---------|-------|-----| +| `messages` | All turns except the final assistant label from the JSONL row | Same decode path as SFT | +| `max_tokens` | `64` for short assistant labels | Training targets are brief (e.g. MCQA choice text) | +| `temperature` | `0` | Reproducible eval / regression checks | +| `chat_template_kwargs.enable_thinking` | `false` for Qwen3 short-answer SFT | Thinking mode needs extra tokens and changes output shape vs training | + +#### Example — LoRA adapter via provider + +\`\`\`bash +export NEMO_BASE_URL= # omit when using default localhost +nemo inference gateway provider post v1/chat/completions --workspace default \\ + --body '{ + "model": "default--", + "messages": [], + "max_tokens": 64, + "temperature": 0, + "chat_template_kwargs": {"enable_thinking": false} + }' +\`\`\` + +#### Example — base model via model-entity (comparison) + +\`\`\`bash +export NEMO_BASE_URL= +nemo inference gateway model post v1/chat/completions --workspace default \\ + --body '{ + "model": "default/", + "messages": [], + "max_tokens": 64, + "temperature": 0, + "chat_template_kwargs": {"enable_thinking": false} + }' +\`\`\` + +#### Post-training eval (optional) + +Validation loss from training is **not** accuracy. 
To compare base vs adapter on the validation split with correct routing: \`\`\`bash -export NEMO_BASE_URL= # omit line when using default localhost cd /path/to/nemo-platform -nemo models get --workspace default +uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \\ + --base-url \\ + --model-entity \\ + --adapter \\ + --provider \\ + --dataset-fileset \\ + --split validation.jsonl \`\`\` + +Uses CHAT `messages` rows unchanged from the training fileset (`messages[:-1]` at inference). Repeat `--adapter` for multi-adapter compare. `--provider` is optional when a READY provider is auto-discovered. ``` -Use the user's platform URL in `NEMO_BASE_URL` when they overrode it; omit the export line for default `http://127.0.0.1:8080`. The JSON `adapters` array shows `name`, `fileset`, `finetuning_type`, and `lora_config` for each registered adapter. +Use the user's platform URL in `NEMO_BASE_URL` when they overrode it; omit the export line for default `http://127.0.0.1:8080`. Substitute ``, ``, and entity/adapter names with values from discovery — do not leave generic placeholders in the user-facing report. Do **not** tell the user to update the deployment or add the adapter to a provider before calling it — registration on the model entity is sufficient. **Save report to `/tmp`** — unless the user opts out, write the full Markdown report (header, **Training configuration**, **Using the adapter** when `completed`, and **Resources created** when a slug or new filesets were used) to `/tmp/fine-tune-result-.md`. Use the random slug from the run when one was assigned; otherwise use the job id suffix (e.g. `a925b07ff678`). 
@@ -626,5 +694,6 @@ For other terminal errors, keep the same header template; put remediation detail | W&B / MLflow field reference | `references/hyperparameters.md` § **Integrations (automodel + unsloth)** | | W&B secret + MLflow local server + jobs-launcher | `references/integrations-setup.md` | | Gated HF model auth (`hf-token`, fileset `token_secret`) | `references/troubleshooting.md` § **Gated HuggingFace models** | +| Post-training eval (base vs LoRA, CHAT format parity) | `references/post-training-eval.md`, `references/eval_helpers.py` | Related: `plugins/nemo-automodel/README.md`, `plugins/nemo-unsloth/README.md`, `plugins/nemo-customizer/docs/CUSTOMIZATION.md`, skills **`nemo-files`**, **`nemo-status`**, **`nemo-secrets`**. diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md index b03b43e12c..4677f68474 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md @@ -54,3 +54,15 @@ Optional fields on the unsloth `dataset` block: - The automodel SFT format `{"prompt": "...", "completion": "..."}` is **not** directly consumable by unsloth — unsloth has no built-in `prompt`/`completion` concatenation. Convert to either messages or pre-rendered text before upload. EMBEDDING and CUSTOM (automodel-only schemas) are not supported by unsloth today. + +## Post-training evaluation + +Eval rows must use the **same CHAT `messages` shape** as training. Do not flatten to `prompt`/`expected` for the evaluator. 
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Post-training evaluation helpers — keep eval dataset shape aligned with CHAT training JSONL.

LoRA adapters registered on the model entity are hot-reloaded automatically on
deployments with ``lora_enabled: true`` — no deployment update before eval.

Run from the nemo-platform git root::

    uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \\
        --model-entity qwen3-1.7b --adapter lora-a --adapter lora-b \\
        --provider qwen3-1.7b-csqa-lora-deploy --dataset-fileset commonsense_qa --split validation.jsonl

Import in agent scripts (add references/ to sys.path or run via uv from repo root).
"""

from __future__ import annotations

import argparse
import json
import re
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable, Sequence

# --- Train/eval format contract (CHAT JSONL) --------------------------------

CHAT_ROW_KEYS = frozenset({"messages"})

# Inference: all turns except the final assistant label (single- or multi-turn).
CHAT_USER_PROMPT_TEMPLATE: dict[str, Any] = {
    "messages": "{{ item.messages[:-1] }}",
}

# Metric reference: final assistant turn (the label to predict).
CHAT_REFERENCE_TEMPLATE = "{{ item.messages[-1].content }}"

# Back-compat alias for single-turn MCQA docs/snippets.
CHAT_SINGLE_TURN_USER_PROMPT_TEMPLATE: dict[str, Any] = {
    "messages": [{"role": "user", "content": "{{ item.messages[0].content }}"}],
}

# Patterns used by normalize_mcqa_answer, compiled once at import time.
_BOLD_CHOICE_RE = re.compile(r"\*\*(?:[A-E]\.\s*)?([^*]+)\*\*")
_LETTER_PREFIX_RE = re.compile(r"^[A-E]\.\s*")
_BOLD_SPAN_RE = re.compile(r"\*\*([^*]+)\*\*")


def assert_chat_row(row: dict[str, Any], *, index: int | None = None) -> None:
    """Validate one dataset row matches automodel/unsloth CHAT training shape.

    Args:
        row: Parsed JSONL row.
        index: Optional row/line number used only to label error messages.

    Raises:
        ValueError: If ``messages`` is missing, is not a list of at least two
            turns, contains a non-object turn, does not start with a ``user``
            turn, or does not end with an ``assistant`` turn.
    """
    label = f"row {index}" if index is not None else "row"
    if "messages" not in row:
        raise ValueError(
            f"{label}: expected CHAT format with 'messages' array; got keys {sorted(row)}. "
            "Do not flatten to prompt/expected — use references/post-training-eval.md."
        )
    messages = row["messages"]
    if not isinstance(messages, list) or len(messages) < 2:
        raise ValueError(f"{label}: messages must be a list with at least one prompt turn + final assistant label")
    # Guard before calling .get(): a bare string/number turn would otherwise
    # surface as AttributeError instead of the documented ValueError.
    if not all(isinstance(turn, dict) for turn in messages):
        raise ValueError(f"{label}: every entry in messages must be an object with 'role' and 'content'")
    if messages[0].get("role") != "user":
        raise ValueError(f"{label}: expected messages[0]=user")
    if messages[-1].get("role") != "assistant":
        raise ValueError(f"{label}: expected final messages[-1]=assistant (the label to score)")


def reference_content(row: dict[str, Any]) -> str:
    """Return the assistant label for a CHAT row (final turn).

    Raises:
        ValueError: If the row fails :func:`assert_chat_row`.
    """
    assert_chat_row(row)
    return row["messages"][-1]["content"]


def _parse_chat_jsonl_lines(lines: Iterable[str]) -> list[dict[str, Any]]:
    """Parse JSONL lines into validated CHAT rows, skipping blank lines.

    Line numbers in error labels are 1-based and count blank lines.
    """
    rows: list[dict[str, Any]] = []
    for index, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        row = json.loads(line)
        assert_chat_row(row, index=index)
        rows.append(row)
    return rows


def load_chat_jsonl(path: Path | str) -> list[dict[str, Any]]:
    """Load JSONL rows from a local file; validate CHAT shape; return rows unchanged."""
    with Path(path).open(encoding="utf-8") as handle:
        return _parse_chat_jsonl_lines(handle)


def load_chat_jsonl_from_platform(
    *,
    base_url: str,
    workspace: str,
    fileset: str,
    remote_path: str,
) -> list[dict[str, Any]]:
    """Download a JSONL split from a platform fileset and validate CHAT rows.

    Args:
        base_url: Platform base URL (a trailing slash is tolerated).
        workspace: Workspace that owns the fileset.
        fileset: Fileset name.
        remote_path: Path of the split inside the fileset (leading slash tolerated).

    Raises:
        ValueError: If any row is not a valid CHAT row (``json.JSONDecodeError``
            for malformed JSON is a ``ValueError`` subclass).
        urllib.error.URLError: If the download fails.
    """
    url = (
        f"{base_url.rstrip('/')}/apis/files/v2/workspaces/{workspace}/filesets/"
        f"{fileset}/-/{remote_path.lstrip('/')}"
    )
    with urllib.request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    # Split on "\n" only. str.splitlines() also breaks on U+2028/U+2029/U+0085,
    # which are legal unescaped characters inside JSON strings and would cut a
    # row in half. A trailing "\r" is whitespace to json.loads.
    return _parse_chat_jsonl_lines(content.split("\n"))


def chat_metrics():
    """Build default metrics for CHAT SFT eval (exact match + ROUGE + BLEU).

    ``nemo_evaluator_sdk`` is imported lazily so the pure helpers in this module
    stay importable without the SDK installed.
    """
    from nemo_evaluator_sdk import BLEUMetric, ROUGEMetric
    from nemo_evaluator_sdk.metrics.exact_match import ExactMatchMetric

    ref = CHAT_REFERENCE_TEMPLATE
    return [
        ExactMatchMetric(reference=ref),
        ROUGEMetric(reference=ref),
        BLEUMetric(references=[ref]),
    ]


def normalize_mcqa_answer(text: str) -> str:
    """Normalize MCQA model output for comparison with bare choice-text references.

    Keeps the first ``**bold**`` span when present, drops a leading ``A.``–``E.``
    choice letter, removes remaining bold markers, then strips and lower-cases.
    """
    text = text.strip()
    bold = _BOLD_CHOICE_RE.search(text)
    if bold:
        text = bold.group(1)
    text = _LETTER_PREFIX_RE.sub("", text)
    text = _BOLD_SPAN_RE.sub(r"\1", text)
    return text.strip().lower()


def served_model_name(*, workspace: str, entity_or_adapter: str, finetuning: str = "base") -> str:
    """Return the ``model`` field for base entity or LoRA adapter requests.

    Base requests use ``{workspace}/{entity}``; LoRA requests use
    ``{workspace}--{adapter}``.

    Raises:
        ValueError: If ``finetuning`` is neither ``"base"`` nor ``"lora"``.
    """
    if finetuning == "base":
        return f"{workspace}/{entity_or_adapter}"
    if finetuning == "lora":
        return f"{workspace}--{entity_or_adapter}"
    raise ValueError("finetuning must be 'base' or 'lora'")


def adapter_composite_entity_name(
    *, model_entity: str, workspace: str, adapter_name: str
) -> str:
    """LoRA composite model-entity path segment (for reference / OpenAI-route body only).

    The model-entity proxy path ``model/{composite}/-/v1`` requires a dedicated
    VirtualModel per composite and typically 404s on stock deployments. Prefer
    :func:`provider_gateway_url` for adapter eval.
    """
    return f"{model_entity}&adapters/{workspace}/{adapter_name}"


def model_entity_gateway_url(*, base_url: str, workspace: str, model_entity: str) -> str:
    """OpenAI-compatible inference-gateway URL for a registered base model entity."""
    return (
        f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/"
        f"model/{model_entity}/-/v1"
    )


def provider_gateway_url(*, base_url: str, workspace: str, provider_name: str) -> str:
    """OpenAI-compatible inference-gateway URL for a model provider (LoRA eval route)."""
    return (
        f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/"
        f"provider/{provider_name}/-/v1"
    )


def gateway_path_from_url(url: str) -> str:
    """Return ``model-entity``, ``provider``, or ``unknown`` for a gateway base URL.

    ``/provider/`` is checked first, so a URL containing both segments is
    classified as ``provider``.
    """
    if "/provider/" in url:
        return "provider"
    if "/model/" in url:
        return "model-entity"
    return "unknown"


def _platform_get_json(url: str) -> dict[str, Any]:
    """GET ``url`` and decode the UTF-8 JSON response body."""
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode("utf-8"))


def find_ready_provider_for_model_entity(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
) -> str | None:
    """Return a READY provider name that serves ``workspace/model_entity`` (base or LoRA).

    Only the first page (100 providers) is inspected. When several providers
    match, the alphabetically first name is returned so the choice is stable
    across calls. Returns ``None`` when nothing matches.
    """
    url = (
        f"{base_url.rstrip('/')}/apis/models/v2/workspaces/{workspace}/providers"
        f"?page_size=100&filter.status=READY"
    )
    payload = _platform_get_json(url)
    base_entity_id = f"{workspace}/{model_entity}"
    composite_prefix = f"{base_entity_id}&adapters/"
    matches: set[str] = set()
    for provider in payload.get("data", []):
        # Re-check status client-side in case the server ignores the filter.
        if provider.get("status") != "READY":
            continue
        name = provider.get("name")
        if not name:
            continue
        for served in provider.get("served_models") or []:
            entity_id = served.get("model_entity_id") or ""
            if entity_id == base_entity_id or entity_id.startswith(composite_prefix):
                matches.add(name)
                break
    return min(matches) if matches else None


@dataclass
class JobAdapterInfo:
    """Adapter output and training metadata resolved from one customization job."""

    job_name: str
    adapter_name: str
    epochs: int | None
    backend: str
    model_entity: str
    dataset_ref: str
    status: str
    created_at: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return {
            "job_name": self.job_name,
            "adapter_name": self.adapter_name,
            "epochs": self.epochs,
            "backend": self.backend,
            "model_entity": self.model_entity,
            "dataset_ref": self.dataset_ref,
            "status": self.status,
            "created_at": self.created_at,
        }


def _backend_from_job_name(job_name: str) -> str:
    """Return the text before the first ``-`` in a job name, or ``unknown`` if there is none."""
    return job_name.split("-", 1)[0] if "-" in job_name else "unknown"


def _job_spec_fields(spec: dict[str, Any]) -> tuple[Any, Any, Any]:
    """Extract ``(output adapter name, model name, dataset ref)`` from a job spec.

    ``spec.model`` may be a string or an object with ``name``; the dataset ref is
    taken from ``dataset.path`` and falls back to ``dataset.training``.
    """
    output_name = (spec.get("output") or {}).get("name") or spec.get("name")
    model = spec.get("model")
    model_name = model.get("name", "") if isinstance(model, dict) else (model or "")
    dataset = spec.get("dataset") or {}
    dataset_ref = dataset.get("path") or dataset.get("training") or ""
    return output_name, model_name, dataset_ref


def adapter_from_completed_job(
    *,
    base_url: str,
    workspace: str,
    job_name: str,
) -> JobAdapterInfo:
    """Resolve adapter output name and training epochs from a platform job.

    Raises:
        ValueError: If the job does not exist (HTTP 404), the platform returns
            any other HTTP error, or the job spec carries no output adapter name.
    """
    url = f"{base_url.rstrip('/')}/apis/jobs/v2/workspaces/{workspace}/jobs/{job_name}"
    try:
        job = _platform_get_json(url)
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            raise ValueError(f"Job not found: {workspace}/{job_name}") from exc
        # Auth failures and server errors are not "not found"; report what happened.
        raise ValueError(
            f"Failed to fetch job {workspace}/{job_name}: HTTP {exc.code} {exc.reason}"
        ) from exc
    spec = job.get("spec") or {}
    output_name, model_entity, dataset_ref = _job_spec_fields(spec)
    if not output_name:
        raise ValueError(f"Job {job_name} has no output adapter name in spec")
    if model_entity.startswith(f"{workspace}/"):
        model_entity = model_entity.split("/", 1)[1]
    return JobAdapterInfo(
        job_name=job_name,
        adapter_name=output_name,
        epochs=(spec.get("schedule") or {}).get("epochs"),
        backend=_backend_from_job_name(job_name),
        model_entity=model_entity,
        dataset_ref=dataset_ref,
        status=job.get("status", "unknown"),
        created_at=job.get("created_at"),
    )


def list_completed_job_adapters(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
    dataset_fileset: str | None = None,
    page_size: int = 500,
) -> list[JobAdapterInfo]:
    """List completed customization jobs and their output adapter names.

    Only the first page (``page_size`` jobs) is inspected. Jobs are kept when
    ``{workspace}/{model_entity}`` occurs in the job's model reference and, if
    ``dataset_fileset`` is given, ``{workspace}/{dataset_fileset}`` occurs in the
    job's dataset reference. Results are sorted newest first by ``created_at``.
    """
    url = (
        f"{base_url.rstrip('/')}/apis/jobs/v2/workspaces/{workspace}/jobs"
        f"?page_size={page_size}&filter.status=completed"
    )
    payload = _platform_get_json(url)
    dataset_ref = f"{workspace}/{dataset_fileset}" if dataset_fileset else None
    model_ref = f"{workspace}/{model_entity}"
    results: list[JobAdapterInfo] = []
    for job in payload.get("data", []):
        # Re-check status client-side in case the server ignores the filter.
        if job.get("status") != "completed":
            continue
        spec = job.get("spec") or {}
        out, job_model, job_ds = _job_spec_fields(spec)
        if not out:
            continue
        # NOTE(review): substring matching also accepts longer names sharing the
        # prefix (e.g. "qwen3-1.7b" vs "qwen3-1.7b-instruct"); kept as-is because
        # the exact reference format stored by the platform is not visible here.
        if model_ref not in str(job_model):
            continue
        if dataset_ref and dataset_ref not in str(job_ds):
            continue
        results.append(
            JobAdapterInfo(
                job_name=job["name"],
                adapter_name=out,
                epochs=(spec.get("schedule") or {}).get("epochs"),
                backend=_backend_from_job_name(job["name"]),
                model_entity=model_entity,
                dataset_ref=job_ds,
                status=job.get("status", "completed"),
                created_at=job.get("created_at"),
            )
        )
    results.sort(key=lambda item: item.created_at or "", reverse=True)
    return results


def build_online_eval_config(
    *,
    max_tokens: int = 64,
    temperature: float = 0,
    parallelism: int = 8,
    enable_thinking: bool = False,
    limit_samples: int | None = None,
):
    """RunConfigOnlineModel defaults aligned with Qwen3 CHAT SFT eval.

    ``chat_template_kwargs.enable_thinking=false`` is sent only when thinking is
    disabled; with ``enable_thinking=True`` no ``extra_body`` is sent.
    """
    from nemo_evaluator_sdk.values import InferenceParams, RunConfigOnlineModel

    inference_kwargs: dict[str, Any] = {"max_tokens": max_tokens, "temperature": temperature}
    if not enable_thinking:
        inference_kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}
    return RunConfigOnlineModel(
        parallelism=parallelism,
        limit_samples=limit_samples,
        inference=InferenceParams(**inference_kwargs),
    )


def build_platform_model_target(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
    adapter_name: str | None = None,
    provider_name: str | None = None,
):
    """SDK Model target for base entity or LoRA adapter on the platform gateway.

    Base weights use the **model-entity** proxy
    (``/model/{entity}/-/v1``). LoRA adapters must use the **provider** proxy
    (``/provider/{name}/-/v1``) with ``model: {workspace}--{adapter}`` — the
    model-entity path always routes to the base VirtualModel and ignores adapter
    names in the request body.

    When ``adapter_name`` is set and ``provider_name`` is omitted, a READY
    provider is discovered with one platform request.

    Raises:
        ValueError: If an adapter target is requested and no READY provider
            serves the model entity.
    """
    from nemo_evaluator_sdk.enums import ModelFormat
    from nemo_evaluator_sdk.values.models import Model

    if adapter_name:
        resolved_provider = provider_name or find_ready_provider_for_model_entity(
            base_url=base_url,
            workspace=workspace,
            model_entity=model_entity,
        )
        if not resolved_provider:
            raise ValueError(
                f"No READY inference provider serves {workspace}/{model_entity}. "
                "Deploy the base model with lora_enabled: true or pass --provider ."
            )
        return Model(
            url=provider_gateway_url(
                base_url=base_url,
                workspace=workspace,
                provider_name=resolved_provider,
            ),
            name=served_model_name(
                workspace=workspace, entity_or_adapter=adapter_name, finetuning="lora"
            ),
            format=ModelFormat.NVIDIA_NIM,
        )

    return Model(
        url=model_entity_gateway_url(base_url=base_url, workspace=workspace, model_entity=model_entity),
        name=served_model_name(workspace=workspace, entity_or_adapter=model_entity, finetuning="base"),
        format=ModelFormat.NVIDIA_NIM,
    )


@dataclass
class EvalSummary:
    """Per-target eval summary: routing metadata plus accuracy figures."""

    target: str
    model_name: str
    gateway_url: str
    gateway_path: str
    num_samples: int
    raw_exact_match: float
    normalized_accuracy: float
    aggregate_metrics: dict[str, dict[str, float | None]]

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; ``aggregate_metrics`` is emitted as ``metrics``."""
        return {
            "target": self.target,
            "model_name": self.model_name,
            "gateway_url": self.gateway_url,
            "gateway_path": self.gateway_path,
            "num_samples": self.num_samples,
            "raw_exact_match": self.raw_exact_match,
            "normalized_accuracy": self.normalized_accuracy,
            "metrics": self.aggregate_metrics,
        }


def _row_output_text(row_score) -> str:
    """Return the model output for one scored row, treating a missing or null value as empty."""
    return row_score.sample.get("output_text") or ""


def summarize_chat_eval_result(*, target: str, model_name: str, gateway_url: str, result) -> EvalSummary:
    """Summarize Evaluator benchmark result for CHAT rows.

    Reads per-row outputs from ``result.per_metric["exact-match"]``, so the
    result must include that metric. Raw exact match compares stripped text;
    normalized accuracy compares :func:`normalize_mcqa_answer` of both sides.
    Rows whose ``output_text`` is missing or ``None`` count as incorrect.
    """
    em_rows = result.per_metric["exact-match"].row_scores
    num_samples = len(em_rows)
    raw_correct = 0
    norm_correct = 0
    for rs in em_rows:
        output = _row_output_text(rs)
        expected = reference_content(rs.item)
        raw_correct += output.strip() == expected.strip()
        norm_correct += normalize_mcqa_answer(output) == normalize_mcqa_answer(expected)
    aggregate_metrics: dict[str, dict[str, float | None]] = {}
    for metric_name, metric_result in result.per_metric.items():
        aggregate_metrics[metric_name] = {
            score.name.split(".")[-1]: round(score.mean, 4) if score.mean is not None else None
            for score in metric_result.aggregate_scores.scores
        }
    return EvalSummary(
        target=target,
        model_name=model_name,
        gateway_url=gateway_url,
        gateway_path=gateway_path_from_url(gateway_url),
        num_samples=num_samples,
        raw_exact_match=round(raw_correct / num_samples, 4) if num_samples else 0.0,
        normalized_accuracy=round(norm_correct / num_samples, 4) if num_samples else 0.0,
        aggregate_metrics=aggregate_metrics,
    )


def run_chat_online_eval(
    *,
    rows: Sequence[dict[str, Any]],
    target,
    config,
    metrics=None,
    prompt_template: dict[str, Any] | None = None,
):
    """Run online eval on CHAT rows using shared templates.

    Every row is validated with :func:`assert_chat_row` first. ``metrics``
    defaults to :func:`chat_metrics` and ``prompt_template`` to
    ``CHAT_USER_PROMPT_TEMPLATE``.

    Raises:
        ValueError: If any row is not a valid CHAT row.
    """
    from nemo_evaluator_sdk import Evaluator

    for index, row in enumerate(rows):
        assert_chat_row(row, index=index)
    if metrics is None:
        metrics = chat_metrics()
    return Evaluator().run_sync(
        metrics=metrics,
        dataset=list(rows),
        target=target,
        prompt_template=prompt_template or CHAT_USER_PROMPT_TEMPLATE,
        config=config,
    )


def _eval_target(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
    adapter_name: str | None,
    provider_name: str | None,
    rows: Sequence[dict[str, Any]],
    config,
    target_label: str,
) -> EvalSummary:
    """Build the gateway target for one model, run the eval, and summarize it."""
    target = build_platform_model_target(
        base_url=base_url,
        workspace=workspace,
        model_entity=model_entity,
        adapter_name=adapter_name,
        provider_name=provider_name,
    )
    result = run_chat_online_eval(rows=rows, target=target, config=config)
    return summarize_chat_eval_result(
        target=target_label,
        model_name=target.name,
        gateway_url=target.url,
        result=result,
    )


def compare_adapters(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
    adapter_names: Sequence[str],
    rows: Sequence[dict[str, Any]],
    include_base: bool = True,
    provider_name: str | None = None,
    max_tokens: int = 64,
    enable_thinking: bool = False,
    parallelism: int = 8,
    limit_samples: int | None = None,
) -> list[EvalSummary]:
    """Compare base (optional) and one or more LoRA adapters on the same CHAT rows.

    Returns one summary per target, base first (labelled ``"base"``), then one
    per adapter in the given order (labelled with the adapter name).
    """
    config = build_online_eval_config(
        max_tokens=max_tokens,
        enable_thinking=enable_thinking,
        parallelism=parallelism,
        limit_samples=limit_samples,
    )
    # (adapter_name, label) pairs; adapter_name=None selects the base model route.
    targets: list[tuple[str | None, str]] = [(None, "base")] if include_base else []
    targets.extend((adapter_name, adapter_name) for adapter_name in adapter_names)
    return [
        _eval_target(
            base_url=base_url,
            workspace=workspace,
            model_entity=model_entity,
            adapter_name=adapter_name,
            provider_name=provider_name,
            rows=rows,
            config=config,
            target_label=label,
        )
        for adapter_name, label in targets
    ]


def compare_base_vs_adapter(
    *,
    base_url: str,
    workspace: str,
    model_entity: str,
    adapter_name: str,
    rows: Sequence[dict[str, Any]],
    provider_name: str | None = None,
    max_tokens: int = 64,
    enable_thinking: bool = False,
    parallelism: int = 8,
    limit_samples: int | None = None,
) -> list[EvalSummary]:
    """Compare base model vs one LoRA adapter on the same CHAT validation rows.

    The adapter summary is relabelled ``"lora"`` so callers get a fixed
    ``base`` / ``lora`` pair regardless of the adapter name.
    """
    summaries = compare_adapters(
        base_url=base_url,
        workspace=workspace,
        model_entity=model_entity,
        adapter_names=[adapter_name],
        rows=rows,
        include_base=True,
        provider_name=provider_name,
        max_tokens=max_tokens,
        enable_thinking=enable_thinking,
        parallelism=parallelism,
        limit_samples=limit_samples,
    )
    if len(summaries) == 2:
        summaries[1].target = "lora"
    return summaries


def lift_vs_base(summaries: Sequence[EvalSummary]) -> dict[str, float]:
    """Normalized accuracy delta vs the base summary (if present).

    Returns an empty dict when no summary is labelled ``"base"``.
    """
    base = next((summary for summary in summaries if summary.target == "base"), None)
    if base is None:
        return {}
    return {
        summary.target: round(summary.normalized_accuracy - base.normalized_accuracy, 4)
        for summary in summaries
        if summary.target != "base"
    }


def routing_sanity_warnings(
    summaries: Sequence[EvalSummary],
    *,
    routing_tolerance_pp: float = 0.015,
) -> list[str]:
    """Return human-readable warnings when LoRA routing or scores look suspicious.

    Args:
        summaries: Eval summaries; the one labelled ``"base"`` is the reference.
        routing_tolerance_pp: Tolerance expressed as a **fraction** (``0.015`` is
            1.5 percentage points). An adapter whose normalized accuracy is
            within this distance of base gets a "verify provider routing" warning.
    """
    warnings: list[str] = []
    base = next((summary for summary in summaries if summary.target == "base"), None)
    for summary in summaries:
        if summary.target == "base":
            if summary.gateway_path != "model-entity":
                warnings.append(
                    f"base eval used {summary.gateway_path} route; expected model-entity "
                    f"({summary.gateway_url})"
                )
            continue
        if summary.gateway_path != "provider":
            warnings.append(
                f"{summary.target}: LoRA eval used {summary.gateway_path} route "
                f"({summary.gateway_url}); expected provider gateway — scores may match base"
            )
        if base and abs(summary.normalized_accuracy - base.normalized_accuracy) <= routing_tolerance_pp:
            warnings.append(
                f"{summary.target}: normalized accuracy {summary.normalized_accuracy:.1%} is within "
                f"{routing_tolerance_pp:.1%} of base ({base.normalized_accuracy:.1%}) — verify provider routing"
            )
    return warnings


def build_eval_payload(
    *,
    summaries: Sequence[EvalSummary],
    base_url: str,
    workspace: str,
    model_entity: str,
    adapter_names: Sequence[str],
    provider_name: str | None,
) -> dict[str, Any]:
    """Assemble CLI/programmatic JSON output with routing metadata and warnings.

    Routing entries for adapters are rebuilt with
    :func:`build_platform_model_target`, so this needs ``nemo_evaluator_sdk``
    and, when ``provider_name`` is ``None``, one provider-discovery request per
    adapter. The ``warnings`` key is present only when there is something to
    report.
    """
    routing: dict[str, Any] = {}
    if any(summary.target == "base" for summary in summaries):
        routing["base"] = {
            "gateway_path": "model-entity",
            "url": model_entity_gateway_url(
                base_url=base_url, workspace=workspace, model_entity=model_entity
            ),
            "model_field": served_model_name(
                workspace=workspace, entity_or_adapter=model_entity, finetuning="base"
            ),
        }
    for adapter_name in adapter_names:
        target = build_platform_model_target(
            base_url=base_url,
            workspace=workspace,
            model_entity=model_entity,
            adapter_name=adapter_name,
            provider_name=provider_name,
        )
        routing[adapter_name] = {
            "gateway_path": "provider",
            "url": target.url,
            "model_field": target.name,
        }
    warnings = routing_sanity_warnings(summaries)
    payload: dict[str, Any] = {
        "dataset_format": "chat (messages)",
        "prompt_template": CHAT_USER_PROMPT_TEMPLATE,
        "reference_template": CHAT_REFERENCE_TEMPLATE,
        "routing": routing,
        "results": [summary.to_dict() for summary in summaries],
        "lift_vs_base": lift_vs_base(summaries),
        "primary_metric": "normalized_accuracy",
    }
    if warnings:
        payload["warnings"] = warnings
    return payload


def _parse_args() -> argparse.Namespace:
    """Parse the compare CLI arguments."""
    parser = argparse.ArgumentParser(description="Compare base vs LoRA on CHAT validation JSONL")
    parser.add_argument("--base-url", default="http://127.0.0.1:8080")
    parser.add_argument("--workspace", default="default")
    parser.add_argument("--model-entity", required=True)
    parser.add_argument(
        "--adapter",
        action="append",
        required=True,
        help="Adapter name(s) registered on the model entity (repeat for multi-adapter compare)",
    )
    parser.add_argument(
        "--provider",
        default=None,
        help="Inference provider name for LoRA requests (auto-discovered when omitted)",
    )
    parser.add_argument("--dataset-fileset", required=True)
    parser.add_argument("--split", default="validation.jsonl")
    parser.add_argument("--max-tokens", type=int, default=64)
    parser.add_argument("--enable-thinking", action="store_true")
    parser.add_argument("--limit-samples", type=int, default=None)
    parser.add_argument("--output", type=Path, default=None)
    parser.add_argument(
        "--no-base",
        action="store_true",
        help="Skip base-model eval (adapter-only comparison)",
    )
    return parser.parse_args()


def main() -> int:
    """CLI entry point: download the split, run the comparison, print (and optionally save) JSON."""
    args = _parse_args()
    rows = load_chat_jsonl_from_platform(
        base_url=args.base_url,
        workspace=args.workspace,
        fileset=args.dataset_fileset,
        remote_path=args.split,
    )
    summaries = compare_adapters(
        base_url=args.base_url,
        workspace=args.workspace,
        model_entity=args.model_entity,
        adapter_names=args.adapter,
        rows=rows,
        include_base=not args.no_base,
        provider_name=args.provider,
        max_tokens=args.max_tokens,
        enable_thinking=args.enable_thinking,
        limit_samples=args.limit_samples,
    )
    payload = build_eval_payload(
        summaries=summaries,
        base_url=args.base_url,
        workspace=args.workspace,
        model_entity=args.model_entity,
        adapter_names=args.adapter,
        provider_name=args.provider,
    )
    text = json.dumps(payload, indent=2)
    print(text)
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(text, encoding="utf-8")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+ +| Do | Don't | +|----|-------| +| Pass rows with `messages` unchanged from the training fileset | Flatten to `prompt` / `expected` or `prompt` / `completion` for eval | +| Send **`messages[:-1]`** at inference (exclude only the final assistant label) | Pass full `messages` including the label turn, or use `{"messages": "{{ item.messages }}"}` unfiltered | +| Score against **`messages[-1].content`** (final assistant turn) | Score against a renamed `expected` field unless you also keep `messages` | + +Single-turn MCQA (user + assistant) is the degenerate case: `messages[:-1]` is just the user turn. + +Automodel and unsloth both train on this shape when `has_chat` is true (see `hf-conversion.md`, `dataset-formats.md`). + +## Evaluator templates (required) + +```python +CHAT_USER_PROMPT_TEMPLATE = { + "messages": "{{ item.messages[:-1] }}", +} +CHAT_REFERENCE_TEMPLATE = "{{ item.messages[-1].content }}" +``` + +Import from `references/eval_helpers.py` — do not re-type these in one-off scripts. + +## Inference defaults (Qwen3 / thinking models) + +| Setting | Recommended | Avoid | +|---------|-------------|-------| +| `enable_thinking` | `false` via `chat_template_kwargs` for short-answer SFT | Thinking on without enough tokens — model never closes `<think>`, strip hook fails | +| `max_tokens` | `64` (short assistant labels) | `16` with thinking on; `1024` thinking on without strip (verbose prose) | +| System prompt | Omit unless user asks — matches training | Extra system prompt changes decode path vs SFT | + +For thinking-enabled eval, set `reasoning=ReasoningParams(end_token="</think>")` **and** ensure `max_tokens` is large enough for the model to emit the end token before generating the answer. + +## Inference after customization (wrap-up) + +Include this in the **Using the adapter** section of every completed customization report. Agents must discover `<provider>` from `nemo inference providers list --workspace default -f json` and fill concrete URLs — do not leave placeholders. 
+ +### LoRA adapters load automatically + +After a customization job reaches **`completed`**, the platform registers the adapter on the base **model entity**. On a deployment with **`lora_enabled: true`**, enabled adapters are **hot-reloaded automatically** (adapter sidecar → vLLM). **Do not** update the deployment, re-create providers, or add the adapter to a `served_models` list before post-training eval — run eval as soon as the job completes. + +| Prerequisite (one-time) | Per-adapter step after training | +|-------------------------|----------------------------------| +| A **READY** inference deployment for the base model entity with `lora_enabled: true` | Confirm adapter appears under `nemo models get <entity>` → `adapters` | +| Gateway reachable at the model-entity URL below | Target the adapter by name in the eval request (see table) | + +### Request routing (base vs LoRA) + +The model-entity proxy path **always** resolves to the base VirtualModel. Setting `"model": "default--<adapter>"` on `/model/<entity>/-/v1` does **not** select the adapter — gateway logs will show only the base path. + +| Target | Gateway route | URL pattern | Request `model` field | +|--------|---------------|-------------|------------------------| +| Base entity | **Model entity** | `{NEMO_BASE_URL}/apis/inference-gateway/v2/workspaces/default/model/<entity>/-/v1` | `default/<entity>` | +| LoRA adapter | **Provider** | `{NEMO_BASE_URL}/apis/inference-gateway/v2/workspaces/default/provider/<provider>/-/v1` | `default--<adapter>` | + +`eval_helpers.py` auto-discovers a READY provider that serves the base entity (or pass `--provider <provider>`). Adapter weights still hot-reload on the deployment — no provider update per adapter. 
+ +Optional sanity checks: + +- `nemo models get <entity> --workspace default` — adapter listed with `enabled: true` +- `nemo inference providers list --workspace default` — provider status **READY** +- LoRA eval/inference logs should show `path=…/provider/<provider>/-/v1/chat/completions`, **not** `…/model/<entity>/-/v1` +- JSON output includes `warnings` when routing looks wrong or adapter scores match base within ~1.5 pp + +### Why earlier evals looked wrong + +If base and LoRA scores were identical (~99% same outputs), the adapter was almost certainly called through the **model-entity** path. That path always resolves to the base VirtualModel — the `"model": "default--<adapter>"` field in the body is ignored. Fix: route LoRA through the **provider** URL with the same `model` field. `eval_helpers.build_platform_model_target()` and the CLI implement this split automatically. + +### MCQA metric interpretation + +For commonsense_qa-style MCQA, treat **`normalized_accuracy`** as the primary metric (`normalize_mcqa_answer` strips `A. foo` / markdown). + +| Observation | Likely meaning | +|-------------|----------------| +| Base & LoRA both ~59% normalized, within ~1 pp | LoRA hit **model-entity** path (base only) — check `warnings` and gateway logs | +| Base raw exact 0%, normalized ~59% | Normal for base on MCQA (formatted prose answers) | +| LoRA normalized >> base (e.g. 
76% vs 59%) | Correct provider routing and real adapter lift | +| Train loss dropped sharply but eval flat | Wrong eval routing or need more epochs — val loss ≠ accuracy | + +### Epoch / adapter ablations + +Resolve adapter names from completed job specs instead of guessing: + +```python +from eval_helpers import list_completed_job_adapters, compare_adapters, build_eval_payload + +jobs = list_completed_job_adapters( + base_url="http://10.0.0.51:8080", + workspace="default", + model_entity="qwen3-1.7b", + dataset_fileset="commonsense_qa", +) +# jobs[0].epochs, jobs[0].adapter_name, jobs[0].backend — sorted newest first + +summaries = compare_adapters( + base_url="...", + workspace="default", + model_entity="qwen3-1.7b", + adapter_names=[jobs[0].adapter_name, jobs[2].adapter_name], + rows=rows, +) +payload = build_eval_payload(..., summaries=summaries, adapter_names=[...]) +# payload["lift_vs_base"], payload.get("warnings") +``` + +When comparing adapters from **different backends** (automodel vs unsloth) or batch configs, note confounds — epoch count alone may not explain the gap. 
+ +### Production chat requests (same rules as eval) + +| Piece | LoRA adapter | Base model | +|-------|--------------|------------| +| HTTP base URL | `…/provider//-/v1` | `…/model//-/v1` | +| `"model"` | `default--` | `default/` | +| `messages` | `messages[:-1]` from the training row (exclude final assistant label) | Same | +| Qwen3 short SFT | `"chat_template_kwargs": {"enable_thinking": false}` | Same | +| `max_tokens` / `temperature` | `64` / `0` typical for short labels | Same | + +CLI shortcuts (substitute names from the job): + +```bash +# LoRA +nemo inference gateway provider post v1/chat/completions --workspace default \ + --body '{"model":"default--","messages":[{"role":"user","content":"…"}],"max_tokens":64,"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' + +# Base +nemo inference gateway model post v1/chat/completions --workspace default \ + --body '{"model":"default/","messages":[{"role":"user","content":"…"}],"max_tokens":64,"temperature":0,"chat_template_kwargs":{"enable_thinking":false}}' +``` + +## Metrics + +| Task | Metrics | +|------|---------| +| MCQA / exact label | `ExactMatchMetric` + `normalize_mcqa_answer()` when models return `A. foo` or markdown | +| Similarity | `ROUGEMetric`, `BLEUMetric` with `CHAT_REFERENCE_TEMPLATE` | + +Val loss from training is **not** accuracy — always run a generation eval for user-facing quality. 
+ +## Helper script + +From **nemo-platform** git root: + +```bash +# Base vs one adapter +uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \ + --base-url http://10.0.0.51:8080 \ + --model-entity qwen3-1.7b \ + --adapter qwen3-1.7b-csqa-unsloth-jun16-e3 \ + --provider qwen3-1.7b-csqa-lora-deploy \ + --dataset-fileset commonsense_qa \ + --split validation.jsonl \ + --output /tmp/fine-tune-eval.json + +# Base vs multiple adapters (epoch ablation) +uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \ + --base-url http://10.0.0.51:8080 \ + --model-entity qwen3-1.7b \ + --adapter qwen3-1.7b-commonsense-qa-lora-jun12-v2 \ + --adapter qwen3-1.7b-csqa-unsloth-jun16-e3 \ + --dataset-fileset commonsense_qa \ + --split validation.jsonl \ + --output /tmp/fine-tune-eval-multi.json +``` + +Programmatic use: + +```python +from eval_helpers import ( + load_chat_jsonl_from_platform, + compare_adapters, + compare_base_vs_adapter, + build_eval_payload, + list_completed_job_adapters, + routing_sanity_warnings, + CHAT_USER_PROMPT_TEMPLATE, +) +``` + +(Add `references/` to `sys.path` or run via `uv run python` from repo root.) + +## Report to user + +After compare, report for **base and each adapter**: + +- **Normalized accuracy** (primary for MCQA) +- Raw exact match (strict string — often 0% on base for formatted answers) +- Lift vs base (`lift_vs_base` in JSON output) +- ROUGE / BLEU aggregates if requested +- Any `warnings` from routing sanity checks +- Inference settings (`enable_thinking`, `max_tokens`) and dataset fileset ref + +Uses the **nemo-evaluator SDK** (`Evaluator`, metrics, `RunConfigOnlineModel`) under the hood — no separate evaluator skill doc required. For general BYOB/rubric eval outside customization, use the **nemo-evaluator** skill. 
diff --git a/plugins/nemo-customizer/tests/test_eval_helpers.py b/plugins/nemo-customizer/tests/test_eval_helpers.py new file mode 100644 index 0000000000..26755d53dc --- /dev/null +++ b/plugins/nemo-customizer/tests/test_eval_helpers.py @@ -0,0 +1,221 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +import pytest + +SKILL_REFERENCES = ( + Path(__file__).resolve().parents[1] + / "src" + / "nemo_customizer" + / "skills" + / "nemo-customizer" + / "references" +) +sys.path.insert(0, str(SKILL_REFERENCES)) + +import eval_helpers # noqa: E402 + + +def test_served_model_names() -> None: + assert eval_helpers.served_model_name(workspace="default", entity_or_adapter="qwen3-1.7b") == "default/qwen3-1.7b" + assert ( + eval_helpers.served_model_name(workspace="default", entity_or_adapter="my-lora", finetuning="lora") + == "default--my-lora" + ) + + +def test_adapter_composite_entity_name() -> None: + assert ( + eval_helpers.adapter_composite_entity_name( + model_entity="qwen3-1.7b", + workspace="default", + adapter_name="my-lora", + ) + == "qwen3-1.7b&adapters/default/my-lora" + ) + + +def test_build_platform_model_target_routes_lora_via_provider() -> None: + target = eval_helpers.build_platform_model_target( + base_url="http://10.0.0.51:8080", + workspace="default", + model_entity="qwen3-1.7b", + adapter_name="my-lora", + provider_name="my-provider", + ) + assert "/provider/my-provider/-/v1" in target.url + assert "/model/qwen3-1.7b/-/v1" not in target.url + assert target.name == "default--my-lora" + + +def test_build_platform_model_target_routes_base_via_model_entity() -> None: + target = eval_helpers.build_platform_model_target( + base_url="http://10.0.0.51:8080", + workspace="default", + model_entity="qwen3-1.7b", + ) + assert "/model/qwen3-1.7b/-/v1" in target.url + assert 
target.name == "default/qwen3-1.7b" + + +def test_gateway_path_from_url() -> None: + assert eval_helpers.gateway_path_from_url("http://x/provider/p/-/v1") == "provider" + assert eval_helpers.gateway_path_from_url("http://x/model/m/-/v1") == "model-entity" + + +def test_normalize_mcqa_answer() -> None: + assert eval_helpers.normalize_mcqa_answer("bank") == "bank" + assert eval_helpers.normalize_mcqa_answer("A. bank") == "bank" + assert eval_helpers.normalize_mcqa_answer("The correct answer is: **A. bank**") == "bank" + + +def test_assert_chat_row_rejects_flattened() -> None: + with pytest.raises(ValueError, match="messages"): + eval_helpers.assert_chat_row({"prompt": "hi", "expected": "bye"}) + + +def test_assert_chat_row_accepts_single_turn() -> None: + row = { + "messages": [ + {"role": "user", "content": "Question?"}, + {"role": "assistant", "content": "yes"}, + ] + } + eval_helpers.assert_chat_row(row) + + +def test_assert_chat_row_accepts_multi_turn() -> None: + row = { + "messages": [ + {"role": "user", "content": "Turn 1"}, + {"role": "assistant", "content": "Reply 1"}, + {"role": "user", "content": "Turn 2"}, + {"role": "assistant", "content": "final label"}, + ] + } + eval_helpers.assert_chat_row(row) + assert eval_helpers.reference_content(row) == "final label" + + +def test_assert_chat_row_rejects_missing_final_assistant() -> None: + row = { + "messages": [ + {"role": "user", "content": "Turn 1"}, + {"role": "assistant", "content": "Reply 1"}, + {"role": "user", "content": "Turn 2"}, + ] + } + with pytest.raises(ValueError, match="assistant"): + eval_helpers.assert_chat_row(row) + + +def test_load_chat_jsonl(tmp_path: Path) -> None: + path = tmp_path / "val.jsonl" + path.write_text( + json.dumps( + { + "messages": [ + {"role": "user", "content": "Q"}, + {"role": "assistant", "content": "A"}, + ] + } + ) + + "\n", + encoding="utf-8", + ) + rows = eval_helpers.load_chat_jsonl(path) + assert len(rows) == 1 + assert rows[0]["messages"][-1]["content"] == "A" + 
+ +def test_chat_templates_use_messages_slice() -> None: + assert "item.messages[:-1]" in eval_helpers.CHAT_USER_PROMPT_TEMPLATE["messages"] + assert "item.messages[-1]" in eval_helpers.CHAT_REFERENCE_TEMPLATE + + +def test_lift_vs_base() -> None: + summaries = [ + eval_helpers.EvalSummary( + target="base", + model_name="default/m", + gateway_url="http://x/model/m/-/v1", + gateway_path="model-entity", + num_samples=10, + raw_exact_match=0.0, + normalized_accuracy=0.5, + aggregate_metrics={}, + ), + eval_helpers.EvalSummary( + target="lora-a", + model_name="default--a", + gateway_url="http://x/provider/p/-/v1", + gateway_path="provider", + num_samples=10, + raw_exact_match=0.7, + normalized_accuracy=0.75, + aggregate_metrics={}, + ), + ] + assert eval_helpers.lift_vs_base(summaries) == {"lora-a": 0.25} + + +def test_routing_sanity_warnings_detects_flat_scores() -> None: + summaries = [ + eval_helpers.EvalSummary( + target="base", + model_name="default/m", + gateway_url="http://x/model/m/-/v1", + gateway_path="model-entity", + num_samples=10, + raw_exact_match=0.0, + normalized_accuracy=0.59, + aggregate_metrics={}, + ), + eval_helpers.EvalSummary( + target="lora-a", + model_name="default--a", + gateway_url="http://x/model/m/-/v1", + gateway_path="model-entity", + num_samples=10, + raw_exact_match=0.0, + normalized_accuracy=0.59, + aggregate_metrics={}, + ), + ] + warnings = eval_helpers.routing_sanity_warnings(summaries) + assert any("provider" in warning for warning in warnings) + assert any("within" in warning for warning in warnings) + + +def test_adapter_from_completed_job_parses_spec(monkeypatch: pytest.MonkeyPatch) -> None: + payload = { + "name": "unsloth-abc", + "status": "completed", + "created_at": "2026-06-16T20:22:09", + "spec": { + "schedule": {"epochs": 3}, + "output": {"name": "my-adapter"}, + "model": {"name": "default/qwen3-1.7b"}, + "dataset": {"path": "default/commonsense_qa"}, + }, + } + + def fake_get(url: str) -> dict: + assert 
url.endswith("/jobs/unsloth-abc") + return payload + + monkeypatch.setattr(eval_helpers, "_platform_get_json", fake_get) + info = eval_helpers.adapter_from_completed_job( + base_url="http://10.0.0.51:8080", + workspace="default", + job_name="unsloth-abc", + ) + assert info.adapter_name == "my-adapter" + assert info.epochs == 3 + assert info.backend == "unsloth" From f75c273453176348429e491870fd11a3fd84f572 Mon Sep 17 00:00:00 2001 From: Sam Oluwalana Date: Tue, 16 Jun 2026 15:45:43 -0600 Subject: [PATCH 2/3] Make the skill more generic, fix lint, fix code-rabbit Signed-off-by: Sam Oluwalana --- .../skills/nemo-customizer/SKILL.md | 17 +- .../references/eval_helpers.py | 88 ++-- .../references/post-training-eval.md | 69 +-- .../tests/test_eval_helpers.py | 31 +- third_party/requirements-main.txt | 472 +++++++++++++++++- 5 files changed, 568 insertions(+), 109 deletions(-) diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md index 895ba73b17..745bc56c10 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md @@ -113,7 +113,7 @@ Training never runs inside the `nemo` CLI process. After `submit`, the platform' - Resolve the CLI per **Pre-flight — CLI resolution** before any `nemo …` command; run from the **nemo-platform** git root, not a plugin subfolder. - Set `NEMO_BASE_URL` (or `NMP_BASE_URL`) only when the user gives a platform URL; default `http://127.0.0.1:8080` (same as `http://localhost:8080`). Track whether the user **overrode** the base URL — see **Platform unreachable** below. - **Platform unreachable** — if any platform API call fails with a connection error (`Connection error`, timeout, refused): - - **User gave a custom URL** (e.g. 
`10.0.0.51:8080`) or you exported a non-default `NEMO_BASE_URL` / `NMP_BASE_URL`: stop and tell the user the platform is not reachable at that address. Do **not** offer to start local services. + - **User gave a custom URL** (e.g. `$NMP_BASE_URL`) or you exported a non-default `NMP_BASE_URL` / `NEMO_BASE_URL`: stop and tell the user the platform is not reachable at that address. Do **not** offer to start local services. - **Default URL only** (no user override): **ask** whether to start the platform locally. If they agree, from the **nemo-platform** git root run in the **background**: ```bash @@ -139,7 +139,7 @@ Training never runs inside the `nemo` CLI process. After `submit`, the platform' - **Do not use local `docker info`** to pick automodel vs unsloth. Run `nemo jobs list-execution-profiles -f json` against the user's platform (login first only if auth is enabled — see **Authentication**; see `references/troubleshooting.md`). Default output is a table — **`-f json` is required** for scripting; parse **stdout only** (do not pipe `2>&1` into `json.load`). - **Do not merge stderr into stdout when parsing JSON** — `submit`, `explain`, and `-f json` commands write **JSON on stdout**; harmless warnings like `Configuration file not found, using defaults` go to **stderr**. Piping with **`2>&1`** before `json.load` raises `JSONDecodeError` even when submit **succeeded** — a common cause of **duplicate jobs** when the agent re-submits after a parse error. Parse stdout only; redirect stderr if needed (`2>/dev/null`). See `references/troubleshooting.md` § **Parsing CLI JSON**. - For submit/image/plugin errors (both backends), read `references/troubleshooting.md`. Unsloth needs the `nmp-unsloth-training` container image on the **platform host's** Docker daemon (see `docker/unsloth/README.md`). -- **Missing training image on a remote platform** — if the user gave a non-localhost `NEMO_BASE_URL` / `NMP_BASE_URL` (e.g. 
`10.0.0.51:8080`) and the job errors with `Failed to pull image`, `manifest unknown`, or missing `nmp-unsloth-training` / automodel training image: **do not** run `docker build`, `docker pull`, or `docker buildx bake` on the agent machine. Report with **Report to user** (use **Output adapter fileset (planned):** on error), then append on-target build steps from `references/troubleshooting.md` § **Missing training images**. +- **Missing training image on a remote platform** — if the user gave a non-localhost `NMP_BASE_URL` / `NEMO_BASE_URL` and the job errors with `Failed to pull image`, `manifest unknown`, or missing `nmp-unsloth-training` / automodel training image: **do not** run `docker build`, `docker pull`, or `docker buildx bake` on the agent machine. Report with **Report to user** (use **Output adapter fileset (planned):** on error), then append on-target build steps from `references/troubleshooting.md` § **Missing training images**. - **Gated HuggingFace models** (Llama, Gemma, …) — confirm `hf-token` + fileset `token_secret` before submit; download fails with `Failed to access upstream storage` / 502 when missing. See **HuggingFace token (gated models)** and `references/troubleshooting.md` § **Gated HuggingFace models**. - **Post-training eval format** — use the same CHAT `messages` JSONL as training. **Do not** flatten rows to `prompt`/`expected` for the evaluator. Send `messages[:-1]` at inference (exclude final assistant label); score against `messages[-1].content`. See `references/post-training-eval.md` and `references/eval_helpers.py`. - **LoRA adapters load automatically for eval** — when a job completes, the adapter is registered on the model entity and hot-reloaded on any **READY** deployment with `lora_enabled: true`. **Do not** update deployments or providers before eval. **Do** route LoRA eval through the **provider** gateway (`/provider//-/v1` with `model: default--`); the model-entity path (`/model//-/v1`) always hits the base model. 
See `references/post-training-eval.md` § **Request routing (base vs LoRA)**. @@ -600,8 +600,8 @@ The adapter `` is registered on `default/`. Weights a | Target | Gateway path | OpenAI base URL | Request `"model"` field | |--------|--------------|-----------------|-------------------------| -| **Base** weights | model-entity | `/apis/inference-gateway/v2/workspaces/default/model//-/v1` | `default/` | -| **LoRA adapter** | **provider** | `/apis/inference-gateway/v2/workspaces/default/provider//-/v1` | `default--` | +| **Base** weights | model-entity | `$NMP_BASE_URL/apis/inference-gateway/v2/workspaces/default/model//-/v1` | `default/` | +| **LoRA adapter** | **provider** | `$NMP_BASE_URL/apis/inference-gateway/v2/workspaces/default/provider//-/v1` | `default--` | **Common mistake:** posting to the model-entity URL with `"model": "default--"` still runs the **base** model. Base-vs-adapter eval will look identical until LoRA requests use the **provider** URL above. See `references/post-training-eval.md` § **Request routing (base vs LoRA)**. @@ -619,7 +619,7 @@ Match training context at inference — send **`messages[:-1]`** (all turns exce #### Example — LoRA adapter via provider \`\`\`bash -export NEMO_BASE_URL= # omit when using default localhost +export NMP_BASE_URL= # omit when using default localhost; NEMO_BASE_URL also works nemo inference gateway provider post v1/chat/completions --workspace default \\ --body '{ "model": "default--", @@ -633,7 +633,7 @@ nemo inference gateway provider post v1/chat/completions --workspace #### Example — base model via model-entity (comparison) \`\`\`bash -export NEMO_BASE_URL= +export NMP_BASE_URL= nemo inference gateway model post v1/chat/completions --workspace default \\ --body '{ "model": "default/", @@ -651,7 +651,6 @@ Validation loss from training is **not** accuracy. 
To compare base vs adapter on \`\`\`bash cd /path/to/nemo-platform uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \\ - --base-url \\ --model-entity \\ --adapter \\ --provider \\ @@ -659,10 +658,10 @@ uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer --split validation.jsonl \`\`\` -Uses CHAT `messages` rows unchanged from the training fileset (`messages[:-1]` at inference). Repeat `--adapter` for multi-adapter compare. `--provider` is optional when a READY provider is auto-discovered. +Uses CHAT `messages` rows unchanged from the training fileset (`messages[:-1]` at inference). Repeat `--adapter` for multi-adapter compare. `--provider` is optional when a READY provider is auto-discovered. Set `NMP_BASE_URL` (or pass `--base-url`) when the platform is not localhost — the helper reads `$NMP_BASE_URL` / `$NEMO_BASE_URL` by default. ``` -Use the user's platform URL in `NEMO_BASE_URL` when they overrode it; omit the export line for default `http://127.0.0.1:8080`. Substitute ``, ``, and entity/adapter names with values from discovery — do not leave generic placeholders in the user-facing report. Do **not** tell the user to update the deployment or add the adapter to a provider before calling it — registration on the model entity is sufficient. +Use the user's platform URL in `NMP_BASE_URL` when they overrode it; omit the export line for default `http://127.0.0.1:8080`. Substitute ``, concrete URLs, and entity/adapter names with values from discovery — do not leave generic placeholders in the user-facing report. Do **not** tell the user to update the deployment or add the adapter to a provider before calling it — registration on the model entity is sufficient. 
**Save report to `/tmp`** — unless the user opts out, write the full Markdown report (header, **Training configuration**, **Using the adapter** when `completed`, and **Resources created** when a slug or new filesets were used) to `/tmp/fine-tune-result-.md`. Use the random slug from the run when one was assigned; otherwise use the job id suffix (e.g. `a925b07ff678`). diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py index dfdddca7e9..4bb2280d8f 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py @@ -6,11 +6,13 @@ LoRA adapters registered on the model entity are hot-reloaded automatically on deployments with ``lora_enabled: true`` — no deployment update before eval. -Run from the nemo-platform git root:: +Run from the nemo-platform git root (reads ``$NMP_BASE_URL`` / ``$NEMO_BASE_URL`` when +``--base-url`` is omitted):: + export NMP_BASE_URL=http://127.0.0.1:8080 uv run python plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/eval_helpers.py \\ - --model-entity qwen3-1.7b --adapter lora-a --adapter lora-b \\ - --provider qwen3-1.7b-csqa-lora-deploy --dataset-fileset commonsense_qa --split validation + --model-entity --adapter --adapter \\ + --provider --dataset-fileset --split validation.jsonl Import in agent scripts (add references/ to sys.path or run via uv from repo root). 
""" @@ -19,6 +21,7 @@ import argparse import json +import os import re import urllib.error import urllib.request @@ -43,6 +46,15 @@ "messages": [{"role": "user", "content": "{{ item.messages[0].content }}"}], } +PLATFORM_HTTP_TIMEOUT_SEC = 60 + + +def _assert_message_turn(turn: Any, *, label: str, index: int | str) -> dict[str, Any]: + """Validate one messages[] element is a dict before reading role/content.""" + if not isinstance(turn, dict): + raise ValueError(f"{label}: messages[{index}] must be an object with role/content, got {type(turn).__name__}") + return turn + def assert_chat_row(row: dict[str, Any], *, index: int | None = None) -> None: """Validate one dataset row matches automodel/unsloth CHAT training shape.""" @@ -55,9 +67,11 @@ def assert_chat_row(row: dict[str, Any], *, index: int | None = None) -> None: messages = row["messages"] if not isinstance(messages, list) or len(messages) < 2: raise ValueError(f"{label}: messages must be a list with at least one prompt turn + final assistant label") - if messages[0].get("role") != "user": + first = _assert_message_turn(messages[0], label=label, index=0) + if first.get("role") != "user": raise ValueError(f"{label}: expected messages[0]=user") - if messages[-1].get("role") != "assistant": + last = _assert_message_turn(messages[-1], label=label, index=-1) + if last.get("role") != "assistant": raise ValueError(f"{label}: expected final messages[-1]=assistant (the label to score)") @@ -88,11 +102,8 @@ def load_chat_jsonl_from_platform( remote_path: str, ) -> list[dict[str, Any]]: """Download a JSONL split from a platform fileset and validate CHAT rows.""" - url = ( - f"{base_url.rstrip('/')}/apis/files/v2/workspaces/{workspace}/filesets/" - f"{fileset}/-/{remote_path.lstrip('/')}" - ) - with urllib.request.urlopen(url) as response: + url = f"{base_url.rstrip('/')}/apis/files/v2/workspaces/{workspace}/filesets/{fileset}/-/{remote_path.lstrip('/')}" + with urllib.request.urlopen(url, 
timeout=PLATFORM_HTTP_TIMEOUT_SEC) as response: content = response.read().decode("utf-8") rows: list[dict[str, Any]] = [] for index, line in enumerate(content.splitlines(), start=1): @@ -137,9 +148,7 @@ def served_model_name(*, workspace: str, entity_or_adapter: str, finetuning: str raise ValueError("finetuning must be 'base' or 'lora'") -def adapter_composite_entity_name( - *, model_entity: str, workspace: str, adapter_name: str -) -> str: +def adapter_composite_entity_name(*, model_entity: str, workspace: str, adapter_name: str) -> str: """LoRA composite model-entity path segment (for reference / OpenAI-route body only). The model-entity proxy path ``model/{composite}/-/v1`` requires a dedicated @@ -151,18 +160,12 @@ def adapter_composite_entity_name( def model_entity_gateway_url(*, base_url: str, workspace: str, model_entity: str) -> str: """OpenAI-compatible inference-gateway URL for a registered base model entity.""" - return ( - f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/" - f"model/{model_entity}/-/v1" - ) + return f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/model/{model_entity}/-/v1" def provider_gateway_url(*, base_url: str, workspace: str, provider_name: str) -> str: """OpenAI-compatible inference-gateway URL for a model provider (LoRA eval route).""" - return ( - f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/" - f"provider/{provider_name}/-/v1" - ) + return f"{base_url.rstrip('/')}/apis/inference-gateway/v2/workspaces/{workspace}/provider/{provider_name}/-/v1" def gateway_path_from_url(url: str) -> str: @@ -175,7 +178,7 @@ def gateway_path_from_url(url: str) -> str: def _platform_get_json(url: str) -> dict[str, Any]: - with urllib.request.urlopen(url) as response: + with urllib.request.urlopen(url, timeout=PLATFORM_HTTP_TIMEOUT_SEC) as response: return json.loads(response.read().decode("utf-8")) @@ -186,10 +189,7 @@ def find_ready_provider_for_model_entity( 
model_entity: str, ) -> str | None: """Return a READY provider name that serves ``workspace/model_entity`` (base or LoRA).""" - url = ( - f"{base_url.rstrip('/')}/apis/models/v2/workspaces/{workspace}/providers" - f"?page_size=100&filter.status=READY" - ) + url = f"{base_url.rstrip('/')}/apis/models/v2/workspaces/{workspace}/providers?page_size=100&filter.status=READY" payload = _platform_get_json(url) base_entity_id = f"{workspace}/{model_entity}" matches: list[str] = [] @@ -276,8 +276,7 @@ def list_completed_job_adapters( ) -> list[JobAdapterInfo]: """List completed customization jobs and their output adapter names.""" url = ( - f"{base_url.rstrip('/')}/apis/jobs/v2/workspaces/{workspace}/jobs" - f"?page_size={page_size}&filter.status=completed" + f"{base_url.rstrip('/')}/apis/jobs/v2/workspaces/{workspace}/jobs?page_size={page_size}&filter.status=completed" ) payload = _platform_get_json(url) dataset_ref = f"{workspace}/{dataset_fileset}" if dataset_fileset else None @@ -373,9 +372,7 @@ def build_platform_model_target( workspace=workspace, provider_name=resolved_provider, ), - name=served_model_name( - workspace=workspace, entity_or_adapter=adapter_name, finetuning="lora" - ), + name=served_model_name(workspace=workspace, entity_or_adapter=adapter_name, finetuning="lora"), format=ModelFormat.NVIDIA_NIM, ) @@ -415,15 +412,12 @@ def summarize_chat_eval_result(*, target: str, model_name: str, gateway_url: str em_rows = result.per_metric["exact-match"].row_scores num_samples = len(em_rows) raw_correct = sum( - 1 - for rs in em_rows - if rs.sample.get("output_text", "").strip() == reference_content(rs.item).strip() + 1 for rs in em_rows if rs.sample.get("output_text", "").strip() == reference_content(rs.item).strip() ) norm_correct = sum( 1 for rs in em_rows - if normalize_mcqa_answer(rs.sample.get("output_text", "")) - == normalize_mcqa_answer(reference_content(rs.item)) + if normalize_mcqa_answer(rs.sample.get("output_text", "")) == 
normalize_mcqa_answer(reference_content(rs.item)) ) aggregate_metrics: dict[str, dict[str, float | None]] = {} for metric_name, metric_result in result.per_metric.items(): @@ -601,8 +595,7 @@ def routing_sanity_warnings( if summary.target == "base": if summary.gateway_path != "model-entity": warnings.append( - f"base eval used {summary.gateway_path} route; expected model-entity " - f"({summary.gateway_url})" + f"base eval used {summary.gateway_path} route; expected model-entity ({summary.gateway_url})" ) continue if summary.gateway_path != "provider": @@ -632,12 +625,8 @@ def build_eval_payload( if any(summary.target == "base" for summary in summaries): routing["base"] = { "gateway_path": "model-entity", - "url": model_entity_gateway_url( - base_url=base_url, workspace=workspace, model_entity=model_entity - ), - "model_field": served_model_name( - workspace=workspace, entity_or_adapter=model_entity, finetuning="base" - ), + "url": model_entity_gateway_url(base_url=base_url, workspace=workspace, model_entity=model_entity), + "model_field": served_model_name(workspace=workspace, entity_or_adapter=model_entity, finetuning="base"), } for adapter_name in adapter_names: target = build_platform_model_target( @@ -667,9 +656,18 @@ def build_eval_payload( return payload +def default_base_url() -> str: + """Platform URL from env or localhost default.""" + return os.environ.get("NMP_BASE_URL") or os.environ.get("NEMO_BASE_URL") or "http://127.0.0.1:8080" + + def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Compare base vs LoRA on CHAT validation JSONL") - parser.add_argument("--base-url", default="http://127.0.0.1:8080") + parser.add_argument( + "--base-url", + default=default_base_url(), + help="Platform URL (default: $NMP_BASE_URL, $NEMO_BASE_URL, or http://127.0.0.1:8080)", + ) parser.add_argument("--workspace", default="default") parser.add_argument("--model-entity", required=True) parser.add_argument( diff --git 
a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/post-training-eval.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/post-training-eval.md index 0972583d18..2e843f1732 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/post-training-eval.md +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/post-training-eval.md @@ -9,8 +9,8 @@ Training and evaluation must use the **same CHAT JSONL row shape**: ```json { "messages": [ - {"role": "user", "content": "Question: …\nChoices:\n…\nAnswer:"}, - {"role": "assistant", "content": "bank"} + {"role": "user", "content": ""}, + {"role": "assistant", "content": "