From 3dd31d2fa1d4dc6f333a4e2f1ca01c7e65706870 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh Date: Wed, 27 May 2026 15:16:25 -0700 Subject: [PATCH 1/3] add LC benchmarks Signed-off-by: Cheng-Ping Hsieh --- README.md | 1 + benchmarks/graphwalks/README.md | 82 +++++++ benchmarks/graphwalks/__init__.py | 0 benchmarks/graphwalks/config.yaml | 28 +++ benchmarks/graphwalks/config_n3_1m.yaml | 28 +++ benchmarks/graphwalks/data/.gitignore | 1 + benchmarks/graphwalks/prepare.py | 175 +++++++++++++++ benchmarks/graphwalks/prepare_n3_1m.py | 49 +++++ benchmarks/longbench_v2/README.md | 42 +++- benchmarks/longbench_v2/config_n3_1m.yaml | 18 ++ benchmarks/longbench_v2/prepare.py | 119 +++++++++-- benchmarks/longbench_v2/prepare_n3_1m.py | 47 ++++ benchmarks/longcodebench/README.md | 35 ++- benchmarks/longcodebench/config_n3_1m.yaml | 18 ++ benchmarks/longcodebench/prepare.py | 111 ++++++++-- benchmarks/longcodebench/prepare_n3_1m.py | 45 ++++ benchmarks/mrcr/README.md | 49 ++++- benchmarks/mrcr/config_n3_128k.yaml | 28 +++ benchmarks/mrcr/config_n3_1m.yaml | 28 +++ benchmarks/mrcr/prepare.py | 122 ++++++++--- benchmarks/mrcr/prepare_n3_128k.py | 45 ++++ benchmarks/mrcr/prepare_n3_1m.py | 45 ++++ resources_servers/graphwalks/README.md | 46 ++++ resources_servers/graphwalks/app.py | 170 +++++++++++++++ .../graphwalks/configs/graphwalks.yaml | 25 +++ resources_servers/graphwalks/data/.gitignore | 6 + .../graphwalks/data/example.jsonl | 5 + .../graphwalks/data/example_metrics.json | 60 ++++++ resources_servers/graphwalks/requirements.txt | 2 + .../graphwalks/tests/__init__.py | 0 .../graphwalks/tests/test_app.py | 200 ++++++++++++++++++ 31 files changed, 1562 insertions(+), 68 deletions(-) create mode 100644 benchmarks/graphwalks/README.md create mode 100644 benchmarks/graphwalks/__init__.py create mode 100644 benchmarks/graphwalks/config.yaml create mode 100644 benchmarks/graphwalks/config_n3_1m.yaml create mode 100644 benchmarks/graphwalks/data/.gitignore create mode 100644 
benchmarks/graphwalks/prepare.py create mode 100644 benchmarks/graphwalks/prepare_n3_1m.py create mode 100644 benchmarks/longbench_v2/config_n3_1m.yaml create mode 100644 benchmarks/longbench_v2/prepare_n3_1m.py create mode 100644 benchmarks/longcodebench/config_n3_1m.yaml create mode 100644 benchmarks/longcodebench/prepare_n3_1m.py create mode 100644 benchmarks/mrcr/config_n3_128k.yaml create mode 100644 benchmarks/mrcr/config_n3_1m.yaml create mode 100644 benchmarks/mrcr/prepare_n3_128k.py create mode 100644 benchmarks/mrcr/prepare_n3_1m.py create mode 100644 resources_servers/graphwalks/README.md create mode 100644 resources_servers/graphwalks/app.py create mode 100644 resources_servers/graphwalks/configs/graphwalks.yaml create mode 100644 resources_servers/graphwalks/data/.gitignore create mode 100644 resources_servers/graphwalks/data/example.jsonl create mode 100644 resources_servers/graphwalks/data/example_metrics.json create mode 100644 resources_servers/graphwalks/requirements.txt create mode 100644 resources_servers/graphwalks/tests/__init__.py create mode 100644 resources_servers/graphwalks/tests/test_app.py diff --git a/README.md b/README.md index c8a379d73..51252e48e 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,7 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace). 
| Genrm Compare | rlhf | GenRM pairwise comparison for RLHF training | Compare multiple candidate responses using GenRM model | - | - | - | genrm_compare.yaml | - | | Google Search | agent | Multi-choice question answering problems with search tools integrated | Improve knowledge-related benchmarks with search tools | ✓ | - | Apache 2.0 | google_search.yaml | Nemotron-RL-knowledge-web_search-mcqa | | Gpqa Diamond | knowledge | GPQA Diamond multiple-choice question answering problems | Evaluate graduate-level scientific reasoning via MCQ verification | ✓ | - | MIT | gpqa_diamond.yaml | - | +| Graphwalks | other | Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks | Improve long-context multi-step graph reasoning and adjacency-list traversal | - | - | - | graphwalks.yaml | - | | Grl Sokoban | games | Single-box Sokoban in Gymnasium API style. | Model emits one move per turn until the puzzle is solved. | - | - | - | grl_sokoban.yaml | - | | Grl Tetris | games | Tetris in Gymnasium API style. Model emits one or more moves per turn. | Multi-step Tetris environment | - | - | - | grl_tetris.yaml | - | | Gymnasium | other | Base class for Gymnasium-style servers. Not a standalone server. | Reusable base class for step/reset style environments | - | - | - | gymnasium.yaml | - | diff --git a/benchmarks/graphwalks/README.md b/benchmarks/graphwalks/README.md new file mode 100644 index 000000000..526a6e915 --- /dev/null +++ b/benchmarks/graphwalks/README.md @@ -0,0 +1,82 @@ +# GraphWalks benchmark + +Benchmark wrapper over the [`graphwalks` resources server](../../resources_servers/graphwalks/README.md) +for the [openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks) dataset. + +Each task supplies an adjacency list and asks the model to either list +the parents of a node (`problem_type: parents`) or return the BFS +frontier at exactly depth N (`problem_type: bfs`). Scoring is F1 over +the predicted node set vs. 
the expected node set, gated on the model +producing a `Final Answer: [...]` line. + +## Variants + +Two preset configs ship alongside this benchmark. Both apply the same +data + Skills prompt fixes (BFS depth disambiguation, self-parent +removal); they differ only in the tokenizer used for the `n_tokens` +column and an optional length filter. + +| Variant | Config | Prepare script | Tokenizer | Max tokens | Output | +|---|---|---|---|---|---| +| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/graphwalks_benchmark.jsonl` | +| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/graphwalks_n3_1m_benchmark.jsonl` | + +The N3 1M variant requires HF auth for the gated NVIDIA repo +(`HF_TOKEN` env or `huggingface-cli login`). + +## Prepare benchmark data + +```bash +# Default (o200k_base, no filter) +ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config.yaml]" + +# N3 1M variant +ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config_n3_1m.yaml]" +``` + +For one-off custom builds (different tokenizer / cap / output path), +invoke `prepare.py` directly: + +```bash +python benchmarks/graphwalks/prepare.py \ + --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \ + --max_context_tokens 131072 \ + --output_fpath benchmarks/graphwalks/data/graphwalks_llama_128k_benchmark.jsonl +``` + +## Start environment + +```bash +ng_run "+config_paths=[benchmarks/graphwalks/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" +``` + +## Collect rollouts + +```bash +# Default variant +ng_collect_rollouts \ + +agent_name=graphwalks_benchmark_simple_agent \ + +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_benchmark.jsonl \ + +output_jsonl_fpath=results/graphwalks_rollouts.jsonl \ + +num_repeats=4 + +# N3 1M variant +ng_collect_rollouts \ + +agent_name=graphwalks_n3_1m_benchmark_simple_agent \ + 
+input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl \ + +output_jsonl_fpath=results/graphwalks_n3_1m_rollouts.jsonl \ + +num_repeats=4 +``` + +## Metrics + +`compute_metrics()` emits `pass@k/accuracy`, `pass@1[avg-of-k]/accuracy` +via `compute_pass_majority_metrics`, plus per-`problem_type` subset +breakdowns via `compute_subset_metrics(subset_key="problem_type")` — +stratified pass@k keys like `problem_type=parents/pass@4/accuracy` and +`problem_type=bfs/pass@4/accuracy`. + +For reasoning models the vLLM server should be started with a +`--reasoning-parser` matching the model (e.g. `nano_v3` for Nemotron-3 +or `deepseek_r1`) so that `<think>...</think>` blocks are stripped +upstream of `Final Answer:` parsing. diff --git a/benchmarks/graphwalks/__init__.py b/benchmarks/graphwalks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/graphwalks/config.yaml b/benchmarks/graphwalks/config.yaml new file mode 100644 index 000000000..9e741fb02 --- /dev/null +++ b/benchmarks/graphwalks/config.yaml @@ -0,0 +1,28 @@ +# Chain to the generic graphwalks resources server + agent config. +config_paths: + - resources_servers/graphwalks/configs/graphwalks.yaml + +# Benchmark-specific overrides via `_inherit_from` so the base graphwalks config +# stays isolated from benchmark use. + +graphwalks_benchmark_resources_server: + _inherit_from: graphwalks_resources_server + +graphwalks_benchmark_simple_agent: + _inherit_from: graphwalks_simple_agent + responses_api_agents: + simple_agent: + resources_server: + name: graphwalks_benchmark_resources_server + datasets: + - name: graphwalks + type: benchmark + jsonl_fpath: benchmarks/graphwalks/data/graphwalks_benchmark.jsonl + prompt_config: null + prepare_script: benchmarks/graphwalks/prepare.py + # Rollouts per task for pass@k variance. + # NOTE: for `type: benchmark` datasets, `num_repeats` here is a + # placeholder — it only triggers row duplication for + # `type: train`/`validation`. 
To actually get N rollouts per task, + # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. + num_repeats: 4 diff --git a/benchmarks/graphwalks/config_n3_1m.yaml b/benchmarks/graphwalks/config_n3_1m.yaml new file mode 100644 index 000000000..746917e97 --- /dev/null +++ b/benchmarks/graphwalks/config_n3_1m.yaml @@ -0,0 +1,28 @@ +# GraphWalks — N3 1M-context variant. +# Same data + Skills prompt fixes as `config.yaml`, but `prepare_n3_1m.py` +# counts tokens with the Nemotron-3-Super HF tokenizer and drops samples +# whose tokenized prompt exceeds 1048576 tokens. +config_paths: + - resources_servers/graphwalks/configs/graphwalks.yaml + +graphwalks_n3_1m_benchmark_resources_server: + _inherit_from: graphwalks_resources_server + +graphwalks_n3_1m_benchmark_simple_agent: + _inherit_from: graphwalks_simple_agent + responses_api_agents: + simple_agent: + resources_server: + name: graphwalks_n3_1m_benchmark_resources_server + datasets: + - name: graphwalks_n3_1m + type: benchmark + jsonl_fpath: benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl + prompt_config: null + prepare_script: benchmarks/graphwalks/prepare_n3_1m.py + # Rollouts per task for pass@k variance. + # NOTE: for `type: benchmark` datasets, `num_repeats` here is a + # placeholder — it only triggers row duplication for + # `type: train`/`validation`. To actually get N rollouts per task, + # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. + num_repeats: 4 diff --git a/benchmarks/graphwalks/data/.gitignore b/benchmarks/graphwalks/data/.gitignore new file mode 100644 index 000000000..b06d45fe6 --- /dev/null +++ b/benchmarks/graphwalks/data/.gitignore @@ -0,0 +1 @@ +*benchmark.jsonl diff --git a/benchmarks/graphwalks/prepare.py b/benchmarks/graphwalks/prepare.py new file mode 100644 index 000000000..cbbd963d8 --- /dev/null +++ b/benchmarks/graphwalks/prepare.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare the GraphWalks benchmark data. + +Source: https://huggingface.co/datasets/openai/graphwalks + +Ported from: + https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/graphwalks/prepare.py + +Two upstream-prompt corrections from Skills are preserved here verbatim: + + 1. The BFS prompt is rewritten to disambiguate "depth N" — without + this rewrite, models often return nodes at intermediate depths. + 2. The parents prompt sometimes includes the target node inside its + own answer set; we strip it. + +Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens`` +field, with no length filter. For an N3 1M-context +variant that filters to fit, see ``prepare_n3_1m.py`` and +``config_n3_1m.yaml``. + +Invocation +---------- + +``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using +the defaults below. 
To build a custom variant, run this script +directly:: + + python benchmarks/graphwalks/prepare.py \\ + --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \\ + --max_context_tokens 131072 +""" + +import argparse +import json +import re +from pathlib import Path +from typing import Callable, Optional + +import tiktoken +from datasets import load_dataset +from tqdm import tqdm + + +BENCHMARK_DIR = Path(__file__).parent +DATA_DIR = BENCHMARK_DIR / "data" +DEFAULT_OUTPUT_FPATH = DATA_DIR / "graphwalks_benchmark.jsonl" + +DEFAULT_TOKENIZER_NAME = "o200k_base" +DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default + +_BFS_PATTERN = re.compile(r"Perform a BFS from node (\S+) with depth (\d+)") +_BFS_REPLACEMENT = ( + r"Perform a BFS from node \1 and return only the nodes at exactly depth \2 " + r"(not nodes at intermediate depths)" +) +_PARENTS_PATTERN = re.compile(r"Find the parents of node ([^\s.]+)\.") + + +def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]: + """Return a ``text -> token_count`` function. + + Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken + encoding, falls back to ``transformers.AutoTokenizer``. 
+ """ + try: + enc = tiktoken.get_encoding(tokenizer_name) + return lambda text: len(enc.encode(text, disallowed_special=())) + except ValueError: + from transformers import AutoTokenizer + + hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False)) + + +def prepare( + tokenizer_name: str = DEFAULT_TOKENIZER_NAME, + max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS, + output_fpath: Path = DEFAULT_OUTPUT_FPATH, +) -> Path: + output_fpath = Path(output_fpath) + output_fpath.parent.mkdir(parents=True, exist_ok=True) + + dataset = load_dataset("openai/graphwalks", split="train") + count_tokens = _build_token_counter(tokenizer_name) + + kept = 0 + skipped_tokens = 0 + skipped_self_parent = 0 + with output_fpath.open("w", encoding="utf-8") as fout: + for entry in tqdm(dataset, desc="Preparing GraphWalks"): + prompt_text = entry["prompt"] + answer_nodes = list(entry["answer_nodes"]) + + # Skills fix #1: disambiguate BFS depth. + prompt_text = _BFS_PATTERN.sub(_BFS_REPLACEMENT, prompt_text) + + # Skills fix #2: strip the queried node from its own parents answer. 
+ m = _PARENTS_PATTERN.search(prompt_text) + target = m.group(1) if m else None + if target is not None and target in answer_nodes: + answer_nodes.remove(target) + skipped_self_parent += 1 + + n_tokens = count_tokens(prompt_text) + if max_context_tokens is not None and n_tokens > max_context_tokens: + skipped_tokens += 1 + continue + + sample = { + "responses_create_params": {"input": [{"role": "user", "content": prompt_text}]}, + "expected_answer": json.dumps(sorted(answer_nodes)), + "problem_type": entry["problem_type"], + "n_tokens": n_tokens, + "prompt_chars": entry["prompt_chars"], + } + fout.write(json.dumps(sample, ensure_ascii=False) + "\n") + kept += 1 + + cap_str = "none" if max_context_tokens is None else str(max_context_tokens) + print( + f"Wrote {kept} samples to {output_fpath} " + f"(tokenizer={tokenizer_name}, cap={cap_str}; " + f"dropped {skipped_tokens} over cap; cleaned {skipped_self_parent} self-parent answers)" + ) + return output_fpath + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--tokenizer_name", + default=DEFAULT_TOKENIZER_NAME, + help=( + "Tokenizer used for token counting. Accepts a tiktoken encoding name " + "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id " + "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). " + f"Default: {DEFAULT_TOKENIZER_NAME}" + ), + ) + parser.add_argument( + "--max_context_tokens", + type=int, + default=DEFAULT_MAX_CONTEXT_TOKENS, + help=( + "Drop samples whose tokenized prompt exceeds this many tokens. " + "Omit (or pass a negative number) for no filter. " + f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}" + ), + ) + parser.add_argument( + "--output_fpath", + type=Path, + default=DEFAULT_OUTPUT_FPATH, + help=f"Output JSONL path. 
Default: {DEFAULT_OUTPUT_FPATH}", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = _parse_args() + cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None + prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath) diff --git a/benchmarks/graphwalks/prepare_n3_1m.py b/benchmarks/graphwalks/prepare_n3_1m.py new file mode 100644 index 000000000..bdb6587ef --- /dev/null +++ b/benchmarks/graphwalks/prepare_n3_1m.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GraphWalks variant: Nemotron-3-Super tokenizer with a 1M token cap. + +Same data + Skills fixes as ``prepare.py``, but counts tokens with the +``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace tokenizer +and drops samples whose tokenized prompt exceeds 1048576 tokens +(Nemotron-3-Super's native 1M context window). + +Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated +NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``). + +The output JSONL lives alongside the default at +``data/graphwalks_n3_1m_benchmark.jsonl`` so both variants can +coexist. 
+""" + +from pathlib import Path + +from .prepare import prepare as _prepare + + +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +MAX_CONTEXT_TOKENS = 1048576 +OUTPUT_FPATH = Path(__file__).parent / "data" / "graphwalks_n3_1m_benchmark.jsonl" + + +def prepare() -> Path: + return _prepare( + tokenizer_name=TOKENIZER_NAME, + max_context_tokens=MAX_CONTEXT_TOKENS, + output_fpath=OUTPUT_FPATH, + ) + + +if __name__ == "__main__": + prepare() diff --git a/benchmarks/longbench_v2/README.md b/benchmarks/longbench_v2/README.md index 41da06adf..61fbd756b 100644 --- a/benchmarks/longbench_v2/README.md +++ b/benchmarks/longbench_v2/README.md @@ -13,26 +13,58 @@ grading; this directory adds only the dataset and prompt. Data source: HuggingFace `THUDM/LongBench-v2` (single "train" split, which is the full eval set). `prepare.py` preserves every Skills field (`index`, `context`, `question`, `choice_A..D`, `expected_answer`, -`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens` via -tiktoken `cl100k_base`) and additionally emits `options` and -`grading_mode` for the mcqa server. +`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens`) and +additionally emits `options` and `grading_mode` for the mcqa server. + +## Variants + +| Variant | Config | Prepare script | Tokenizer | Max tokens | Output | +|---|---|---|---|---|---| +| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longbench_v2_benchmark.jsonl` | +| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longbench_v2_n3_1m_benchmark.jsonl` | + +The N3 1M variant requires HF auth for the gated NVIDIA repo +(`HF_TOKEN` env or `huggingface-cli login`). LongBench-v2 contexts +span 8k-2M words, so the long-bucket rows above 1M tokens are filtered +out under the N3 1M cap. 
+ +For one-off custom builds (different tokenizer / cap / output path), +invoke `prepare.py` directly: + +```bash +python benchmarks/longbench_v2/prepare.py \ + --tokenizer_name cl100k_base \ + --max_context_tokens 131072 \ + --output_fpath benchmarks/longbench_v2/data/longbench_v2_cl100k_128k_benchmark.jsonl +``` ## Example usage ```bash -# Prepare benchmark data +# Prepare benchmark data (default) ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config.yaml]" +# Prepare benchmark data (N3 1M variant) +ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config_n3_1m.yaml]" + # Running servers config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\ benchmarks/longbench_v2/config.yaml" ng_run "+config_paths=[$config_paths]" -# Collecting rollouts +# Collecting rollouts — default ng_collect_rollouts \ +agent_name=longbench_v2_mcqa_simple_agent \ +input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_benchmark.jsonl \ +output_jsonl_fpath=results/longbench_v2_rollouts.jsonl \ +prompt_config=benchmarks/longbench_v2/prompts/default.yaml \ +num_repeats=4 + +# Collecting rollouts — N3 1M +ng_collect_rollouts \ + +agent_name=longbench_v2_n3_1m_mcqa_simple_agent \ + +input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl \ + +output_jsonl_fpath=results/longbench_v2_n3_1m_rollouts.jsonl \ + +prompt_config=benchmarks/longbench_v2/prompts/default.yaml \ + +num_repeats=4 ``` diff --git a/benchmarks/longbench_v2/config_n3_1m.yaml b/benchmarks/longbench_v2/config_n3_1m.yaml new file mode 100644 index 000000000..d532cc078 --- /dev/null +++ b/benchmarks/longbench_v2/config_n3_1m.yaml @@ -0,0 +1,18 @@ +# LongBench-v2 — N3 1M-context variant. +# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts +# `context_tokens` with the Nemotron-3-Super HF tokenizer and drops +# samples whose tokenized context exceeds 1048576 tokens. 
+config_paths: + - resources_servers/mcqa/configs/mcqa.yaml + +longbench_v2_n3_1m_mcqa_simple_agent: + _inherit_from: mcqa_simple_agent + responses_api_agents: + simple_agent: + datasets: + - name: longbench_v2_n3_1m + type: benchmark + jsonl_fpath: benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl + prompt_config: benchmarks/longbench_v2/prompts/default.yaml + prepare_script: benchmarks/longbench_v2/prepare_n3_1m.py + license: Apache 2.0 diff --git a/benchmarks/longbench_v2/prepare.py b/benchmarks/longbench_v2/prepare.py index 020721f8b..702b73878 100644 --- a/benchmarks/longbench_v2/prepare.py +++ b/benchmarks/longbench_v2/prepare.py @@ -28,10 +28,28 @@ Dataset: https://huggingface.co/datasets/THUDM/LongBench-v2 Paper: https://arxiv.org/abs/2412.15204 + +Defaults: tokenizer ``o200k_base`` (tiktoken) for the +``context_tokens`` field, with no length filter. For an N3 1M-context +variant that filters to fit, see ``prepare_n3_1m.py`` and +``config_n3_1m.yaml``. + +Invocation +---------- + +``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using +the defaults below. To build a custom variant, run this script +directly:: + + python benchmarks/longbench_v2/prepare.py \\ + --tokenizer_name cl100k_base \\ + --max_context_tokens 131072 """ +import argparse import json from pathlib import Path +from typing import Callable, Optional import tiktoken from datasets import load_dataset @@ -40,23 +58,53 @@ BENCHMARK_DIR = Path(__file__).parent DATA_DIR = BENCHMARK_DIR / "data" -OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl" +DEFAULT_OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl" + +DEFAULT_TOKENIZER_NAME = "o200k_base" +DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default + -# tiktoken encoding name used by Skills' prepare.py for `context_tokens`. -TOKENIZER_NAME = "cl100k_base" +def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]: + """Return a ``text -> token_count`` function. 
+ Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken + encoding, falls back to ``transformers.AutoTokenizer``. The tiktoken + path uses ``disallowed_special=()`` because LongBench-v2 contexts + sometimes contain raw ``<|endoftext|>`` strings that tiktoken would + otherwise refuse to encode. + """ + try: + enc = tiktoken.get_encoding(tokenizer_name) + return lambda text: len(enc.encode(text, disallowed_special=())) + except ValueError: + from transformers import AutoTokenizer -def prepare() -> Path: + hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True) + return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False)) + + +def prepare( + tokenizer_name: str = DEFAULT_TOKENIZER_NAME, + max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS, + output_fpath: Path = DEFAULT_OUTPUT_FPATH, +) -> Path: """Download LongBench-v2, convert to Gym JSONL, return the output file path.""" - DATA_DIR.mkdir(parents=True, exist_ok=True) + output_fpath = Path(output_fpath) + output_fpath.parent.mkdir(parents=True, exist_ok=True) - print(f"Loading THUDM/LongBench-v2 (split='train', {TOKENIZER_NAME} for context tokens) ...") + print(f"Loading THUDM/LongBench-v2 (split='train', tokenizer='{tokenizer_name}') ...") dataset = load_dataset("THUDM/LongBench-v2", split="train") - encoder = tiktoken.get_encoding(TOKENIZER_NAME) + count_tokens = _build_token_counter(tokenizer_name) + + kept = 0 + skipped = 0 + with open(output_fpath, "w", encoding="utf-8") as out: + for entry in tqdm(dataset, desc=f"Writing {output_fpath.name}"): + context_tokens = count_tokens(entry["context"]) + if max_context_tokens is not None and context_tokens > max_context_tokens: + skipped += 1 + continue - count = 0 - with open(OUTPUT_FPATH, "w", encoding="utf-8") as out: - for entry in tqdm(dataset, desc="Writing longbench_v2_benchmark.jsonl"): record = { # Fields preserved verbatim from Skills' prepare.py "index": entry["_id"], @@ -71,11 +119,7 @@ 
def prepare() -> Path: "sub_domain": entry["sub_domain"], "difficulty": entry["difficulty"], "length": entry["length"], - # disallowed_special=() — some LongBench-v2 contexts contain - # raw `<|endoftext|>` strings that tiktoken would otherwise - # refuse to encode. We only need the count, so encode them - # as plain text. - "context_tokens": len(encoder.encode(entry["context"], disallowed_special=())), + "context_tokens": context_tokens, # Gym-side additions consumed by the `mcqa` resource server. # mcqa's verify() reads `options`, `expected_answer`, `grading_mode`. "options": [ @@ -87,11 +131,48 @@ def prepare() -> Path: "grading_mode": "strict_single_letter_boxed", } out.write(json.dumps(record, ensure_ascii=False) + "\n") - count += 1 + kept += 1 + + cap_str = "none" if max_context_tokens is None else str(max_context_tokens) + print( + f"Wrote {kept} problems to {output_fpath} " + f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped} over cap)" + ) + return output_fpath + - print(f"Wrote {count} problems to {OUTPUT_FPATH}") - return OUTPUT_FPATH +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--tokenizer_name", + default=DEFAULT_TOKENIZER_NAME, + help=( + "Tokenizer used for the context_tokens count and length filter. " + "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') " + "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). " + f"Default: {DEFAULT_TOKENIZER_NAME}" + ), + ) + parser.add_argument( + "--max_context_tokens", + type=int, + default=DEFAULT_MAX_CONTEXT_TOKENS, + help=( + "Drop samples whose tokenized context exceeds this many tokens. " + "Omit (or pass a negative number) for no filter. " + f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}" + ), + ) + parser.add_argument( + "--output_fpath", + type=Path, + default=DEFAULT_OUTPUT_FPATH, + help=f"Output JSONL path. 
Default: {DEFAULT_OUTPUT_FPATH}", + ) + return parser.parse_args() if __name__ == "__main__": - prepare() + args = _parse_args() + cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None + prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath) diff --git a/benchmarks/longbench_v2/prepare_n3_1m.py b/benchmarks/longbench_v2/prepare_n3_1m.py new file mode 100644 index 000000000..b8fd6a64d --- /dev/null +++ b/benchmarks/longbench_v2/prepare_n3_1m.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LongBench-v2 variant: Nemotron-3-Super tokenizer with a 1M context cap. + +Same data + fields as ``prepare.py``, but counts ``context_tokens`` +with the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace +tokenizer and drops samples whose tokenized context exceeds 1048576 +tokens (Nemotron-3-Super's native 1M context window). LongBench-v2 +contexts span 8k-2M words, so the long-bucket rows above 1M tokens +are filtered out. + +Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated +NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``). 
+""" + +from pathlib import Path + +from .prepare import prepare as _prepare + + +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +MAX_CONTEXT_TOKENS = 1048576 +OUTPUT_FPATH = Path(__file__).parent / "data" / "longbench_v2_n3_1m_benchmark.jsonl" + + +def prepare() -> Path: + return _prepare( + tokenizer_name=TOKENIZER_NAME, + max_context_tokens=MAX_CONTEXT_TOKENS, + output_fpath=OUTPUT_FPATH, + ) + + +if __name__ == "__main__": + prepare() diff --git a/benchmarks/longcodebench/README.md b/benchmarks/longcodebench/README.md index 64ca598d1..60192ab36 100644 --- a/benchmarks/longcodebench/README.md +++ b/benchmarks/longcodebench/README.md @@ -12,22 +12,53 @@ the long code prompt plus the postfix; the shared wraps it as a single user message, mirroring NeMo Skills' `prompt_format=openai` behaviour. +## Variants + +| Variant | Config | Prepare script | Tokenizer | Max tokens | Output | +|---|---|---|---|---|---| +| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longcodebench_benchmark.jsonl` | +| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longcodebench_n3_1m_benchmark.jsonl` | + +The N3 1M variant requires HF auth for the gated NVIDIA repo +(`HF_TOKEN` env or `huggingface-cli login`). 
+ +For one-off custom builds (different tokenizer / cap / output path), +invoke `prepare.py` directly: + +```bash +python benchmarks/longcodebench/prepare.py \ + --tokenizer_name cl100k_base \ + --max_context_tokens 131072 \ + --output_fpath benchmarks/longcodebench/data/longcodebench_cl100k_128k_benchmark.jsonl +``` + ## Example usage ```bash -# Prepare benchmark data +# Prepare benchmark data (default) ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config.yaml]" +# Prepare benchmark data (N3 1M variant) +ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config_n3_1m.yaml]" + # Running servers config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\ benchmarks/longcodebench/config.yaml" ng_run "+config_paths=[$config_paths]" -# Collecting rollouts +# Collecting rollouts — default ng_collect_rollouts \ +agent_name=longcodebench_mcqa_simple_agent \ +input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_benchmark.jsonl \ +output_jsonl_fpath=results/longcodebench_rollouts.jsonl \ +prompt_config=benchmarks/prompts/generic/default.yaml \ +num_repeats=4 + +# Collecting rollouts — N3 1M +ng_collect_rollouts \ + +agent_name=longcodebench_n3_1m_mcqa_simple_agent \ + +input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl \ + +output_jsonl_fpath=results/longcodebench_n3_1m_rollouts.jsonl \ + +prompt_config=benchmarks/prompts/generic/default.yaml \ + +num_repeats=4 ``` diff --git a/benchmarks/longcodebench/config_n3_1m.yaml b/benchmarks/longcodebench/config_n3_1m.yaml new file mode 100644 index 000000000..bc1ed0807 --- /dev/null +++ b/benchmarks/longcodebench/config_n3_1m.yaml @@ -0,0 +1,18 @@ +# LongCodeBench — N3 1M-context variant. +# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts +# `n_tokens` with the Nemotron-3-Super HF tokenizer and drops samples +# whose tokenized prompt exceeds 1048576 tokens. 
+config_paths: + - resources_servers/mcqa/configs/mcqa.yaml + +longcodebench_n3_1m_mcqa_simple_agent: + _inherit_from: mcqa_simple_agent + responses_api_agents: + simple_agent: + datasets: + - name: longcodebench_n3_1m + type: benchmark + jsonl_fpath: benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl + prompt_config: benchmarks/prompts/generic/default.yaml + prepare_script: benchmarks/longcodebench/prepare_n3_1m.py + license: Creative Commons Attribution 4.0 International diff --git a/benchmarks/longcodebench/prepare.py b/benchmarks/longcodebench/prepare.py index db8a90cb0..a650c2a53 100644 --- a/benchmarks/longcodebench/prepare.py +++ b/benchmarks/longcodebench/prepare.py @@ -27,20 +27,35 @@ purely to populate the server's `allowed_letters` set; the option text is not used for grading because the postfix forces a `\\boxed{X}` answer. -Skills' prepare also stores a `n_tokens_cl100k_base` field counted with -tiktoken. The mcqa verifier never reads it; we omit it on the Gym side to -avoid pulling tiktoken into Gym's main dependency set just for one -benchmark's metadata column. +Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens`` +field, with no length filter. For an N3 1M-context +variant that filters to fit, see ``prepare_n3_1m.py`` and +``config_n3_1m.yaml``. + +Invocation +---------- + +``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using +the defaults below. 
def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
    """Build a function that maps a piece of text to its token count.

    ``tokenizer_name`` is first looked up as a tiktoken encoding. If
    ``tiktoken.get_encoding`` rejects the name with ``ValueError``, the name
    is handed to ``transformers.AutoTokenizer.from_pretrained`` instead, so a
    HuggingFace model id also works.

    Args:
        tokenizer_name: A tiktoken encoding name or a HuggingFace model id.

    Returns:
        A callable taking a string and returning the number of tokens it
        encodes to.
    """
    try:
        encoding = tiktoken.get_encoding(tokenizer_name)
    except ValueError:
        # Deferred import: transformers is only needed on this fallback path.
        from transformers import AutoTokenizer

        # NOTE(review): trust_remote_code=True allows the named repo to ship
        # and run its own tokenizer code locally; only pass names you trust.
        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)

        def _count_with_hf(text: str) -> int:
            # Special tokens are not added, so only the text itself is counted.
            return len(hf_tokenizer.encode(text, add_special_tokens=False))

        return _count_with_hf
    else:

        def _count_with_tiktoken(text: str) -> int:
            # An empty disallowed_special turns off tiktoken's special-token
            # check, so prompts that contain such text do not raise.
            return len(encoding.encode(text, disallowed_special=()))

        return _count_with_tiktoken
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options used when this script is run directly.

    Three options are defined: ``--tokenizer_name``, ``--max_context_tokens``
    (an int) and ``--output_fpath`` (a ``Path``). Each defaults to the
    matching module-level ``DEFAULT_*`` constant. Negative caps are not
    handled here; the ``__main__`` block maps them to "no filter".

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    tokenizer_help = (
        "Tokenizer used for the n_tokens count and length filter. "
        "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') "
        "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
        f"Default: {DEFAULT_TOKENIZER_NAME}"
    )
    cap_help = (
        "Drop samples whose tokenized prompt exceeds this many tokens. "
        "Omit (or pass a negative number) for no filter. "
        f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
    )
    output_help = f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}"

    # The module docstring doubles as the --help description.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--tokenizer_name", default=DEFAULT_TOKENIZER_NAME, help=tokenizer_help)
    cli.add_argument("--max_context_tokens", type=int, default=DEFAULT_MAX_CONTEXT_TOKENS, help=cap_help)
    cli.add_argument("--output_fpath", type=Path, default=DEFAULT_OUTPUT_FPATH, help=output_help)
    return cli.parse_args()
def prepare() -> Path:
    """Build the LongCodeBench N3 1M benchmark file.

    Calls the shared ``prepare`` from ``.prepare`` (bound here as
    ``_prepare``) with this module's tokenizer name, token cap and output
    path, and returns the path it reports.
    """
    written_path = _prepare(
        tokenizer_name=TOKENIZER_NAME,
        max_context_tokens=MAX_CONTEXT_TOKENS,
        output_fpath=OUTPUT_FPATH,
    )
    return written_path
+ +For one-off custom builds (different tokenizer / cap / output path), +invoke `prepare.py` directly: + +```bash +python benchmarks/mrcr/prepare.py \ + --tokenizer_name nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \ + --max_context_tokens 131072 \ + --output_fpath benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl +``` + ## Prepare benchmark data ```bash +# Default (o200k_base, no filter) ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config.yaml]" -``` -Downloads the HF dataset, token-counts each sample with `tiktoken o200k_base`, -and writes `benchmarks/mrcr/data/mrcr_benchmark.jsonl`. Samples over 200000 -input tokens are dropped to leave headroom for model-side tokenizers (which -can be 7–10% heavier than tiktoken) to stay under a 262144-token native -context. +# N3 128k variant +ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_128k.yaml]" + +# N3 1M variant +ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_1m.yaml]" +``` ## Start environment @@ -29,11 +51,26 @@ ng_run "+config_paths=[benchmarks/mrcr/config.yaml,responses_api_models/vllm_mod ## Collect rollouts ```bash +# Default variant ng_collect_rollouts \ +agent_name=mrcr_benchmark_simple_agent \ +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_benchmark.jsonl \ +output_jsonl_fpath=results/mrcr_rollouts.jsonl \ +num_repeats=4 + +# N3 128k variant +ng_collect_rollouts \ + +agent_name=mrcr_n3_128k_benchmark_simple_agent \ + +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl \ + +output_jsonl_fpath=results/mrcr_n3_128k_rollouts.jsonl \ + +num_repeats=4 + +# N3 1M variant +ng_collect_rollouts \ + +agent_name=mrcr_n3_1m_benchmark_simple_agent \ + +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl \ + +output_jsonl_fpath=results/mrcr_n3_1m_rollouts.jsonl \ + +num_repeats=4 ``` ## Metrics diff --git a/benchmarks/mrcr/config_n3_128k.yaml b/benchmarks/mrcr/config_n3_128k.yaml new file mode 100644 index 000000000..df8648acb --- /dev/null +++ 
b/benchmarks/mrcr/config_n3_128k.yaml @@ -0,0 +1,28 @@ +# MRCR — Nemotron-3-Super 128k-context variant. +# Same data + grading as `config.yaml`, but `prepare_n3_128k.py` counts +# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose +# tokenized conversation exceeds 131072 tokens. +config_paths: + - resources_servers/mrcr/configs/mrcr.yaml + +mrcr_n3_128k_benchmark_resources_server: + _inherit_from: mrcr_resources_server + +mrcr_n3_128k_benchmark_simple_agent: + _inherit_from: mrcr_simple_agent + responses_api_agents: + simple_agent: + resources_server: + name: mrcr_n3_128k_benchmark_resources_server + datasets: + - name: mrcr_n3_128k + type: benchmark + jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl + prompt_config: null + prepare_script: benchmarks/mrcr/prepare_n3_128k.py + # Rollouts per task for pass@k variance. + # NOTE: for `type: benchmark` datasets, `num_repeats` here is a + # placeholder — it only triggers row duplication for + # `type: train`/`validation`. To actually get N rollouts per task, + # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. + num_repeats: 4 diff --git a/benchmarks/mrcr/config_n3_1m.yaml b/benchmarks/mrcr/config_n3_1m.yaml new file mode 100644 index 000000000..7f97967a3 --- /dev/null +++ b/benchmarks/mrcr/config_n3_1m.yaml @@ -0,0 +1,28 @@ +# MRCR — Nemotron-3-Super 1M-context variant. +# Same data + grading as `config.yaml`, but `prepare_n3_1m.py` counts +# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose +# tokenized conversation exceeds 1048576 tokens. 
+config_paths: + - resources_servers/mrcr/configs/mrcr.yaml + +mrcr_n3_1m_benchmark_resources_server: + _inherit_from: mrcr_resources_server + +mrcr_n3_1m_benchmark_simple_agent: + _inherit_from: mrcr_simple_agent + responses_api_agents: + simple_agent: + resources_server: + name: mrcr_n3_1m_benchmark_resources_server + datasets: + - name: mrcr_n3_1m + type: benchmark + jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl + prompt_config: null + prepare_script: benchmarks/mrcr/prepare_n3_1m.py + # Rollouts per task for pass@k variance. + # NOTE: for `type: benchmark` datasets, `num_repeats` here is a + # placeholder — it only triggers row duplication for + # `type: train`/`validation`. To actually get N rollouts per task, + # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. + num_repeats: 4 diff --git a/benchmarks/mrcr/prepare.py b/benchmarks/mrcr/prepare.py index 0bcfad81a..2ef8e767b 100644 --- a/benchmarks/mrcr/prepare.py +++ b/benchmarks/mrcr/prepare.py @@ -20,18 +20,30 @@ https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/mrcr/prepare.py Each row in the upstream dataset has a `prompt` field that is a JSON-stringified -list of OpenAI chat messages. We parse it into `responses_create_params.input`, -count tokens with tiktoken `o200k_base` (same tokenizer used by the official -MRCR grading setup), and filter to samples that fit in the model context. - -The 200000-token cap leaves headroom for tokenizer drift: a model's own -tokenizer can produce ~7-10% more tokens than tiktoken `o200k_base`, so -filtering at 200K tiktoken keeps the model-side worst-case near 220K, which -combined with ~32K generation stays under a 262144-token native context. +list of OpenAI chat messages. We parse it into `responses_create_params.input` +and count tokens by summing the per-message tokenized lengths. + +Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens`` +field, with no length filter. 
def _count_message_tokens(messages: list[dict], count_one: Callable[[str], int]) -> int:
    """Return the total token count of a chat conversation.

    Each message's ``content`` is counted on its own with ``count_one`` and
    the per-message counts are added together. This is the same per-message
    summing used by ``nemo_skills/dataset/mrcr/prepare.py`` and the official
    openai/mrcr grading setup.

    Args:
        messages: Chat messages; every entry must have a ``content`` key.
        count_one: Function mapping one string to its token count.

    Returns:
        The summed count, or ``0`` when ``messages`` is empty.
    """
    total = 0
    for message in messages:
        total += count_one(message["content"])
    return total
" + f"Default: {DEFAULT_TOKENIZER_NAME}" + ), + ) + parser.add_argument( + "--max_context_tokens", + type=int, + default=DEFAULT_MAX_CONTEXT_TOKENS, + help=( + "Drop samples whose tokenized conversation exceeds this many tokens. " + "Omit (or pass a negative number) for no filter. " + f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}" + ), + ) + parser.add_argument( + "--output_fpath", + type=Path, + default=DEFAULT_OUTPUT_FPATH, + help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}", + ) + return parser.parse_args() if __name__ == "__main__": - prepare() + args = _parse_args() + cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None + prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath) diff --git a/benchmarks/mrcr/prepare_n3_128k.py b/benchmarks/mrcr/prepare_n3_128k.py new file mode 100644 index 000000000..6e5ef7400 --- /dev/null +++ b/benchmarks/mrcr/prepare_n3_128k.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MRCR variant: Nemotron-3-Super tokenizer with a 128k token cap. + +Same data + grading as ``prepare.py``, but counts ``n_tokens`` with +the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace +tokenizer and drops samples whose tokenized conversation exceeds +131072 tokens. 
def prepare() -> Path:
    """Build the MRCR N3 128k benchmark file.

    Delegates to the shared ``prepare`` from ``.prepare`` (bound here as
    ``_prepare``), passing this module's ``TOKENIZER_NAME``,
    ``MAX_CONTEXT_TOKENS`` and ``OUTPUT_FPATH``, and returns its result.
    """
    settings = dict(
        tokenizer_name=TOKENIZER_NAME,
        max_context_tokens=MAX_CONTEXT_TOKENS,
        output_fpath=OUTPUT_FPATH,
    )
    return _prepare(**settings)
def prepare() -> Path:
    """Build the MRCR N3 1M benchmark file.

    Hands this module's tokenizer name, token cap and output path to the
    shared ``prepare`` from ``.prepare`` (bound here as ``_prepare``) and
    returns the path that call returns.
    """
    benchmark_path = _prepare(
        tokenizer_name=TOKENIZER_NAME,
        max_context_tokens=MAX_CONTEXT_TOKENS,
        output_fpath=OUTPUT_FPATH,
    )
    return benchmark_path
+ +## Start environment + +```bash +ng_run "+config_paths=[resources_servers/graphwalks/configs/graphwalks.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" +``` + +## Collect example rollouts + +```bash +ng_collect_rollouts \ + +agent_name=graphwalks_simple_agent \ + +input_jsonl_fpath=resources_servers/graphwalks/data/example.jsonl \ + +output_jsonl_fpath=resources_servers/graphwalks/data/example_rollouts.jsonl +``` + +For the full benchmark run see +[`benchmarks/graphwalks/README.md`](../../benchmarks/graphwalks/README.md). + +## Licensing + +- Code: Apache 2.0 +- Data ([openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks)): see upstream license diff --git a/resources_servers/graphwalks/app.py b/resources_servers/graphwalks/app.py new file mode 100644 index 000000000..6ab73f2d5 --- /dev/null +++ b/resources_servers/graphwalks/app.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GraphWalks (OpenAI) resources server. + +Implements F1-over-node-sets grading from the official +[openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks) +benchmark. Each task asks the model either to (a) list the parents of a +node or (b) return BFS-reachable nodes at exactly a given depth. 
class GraphWalksResourcesServerConfig(BaseResourcesServerConfig):
    """Configuration for the GraphWalks server; adds nothing to the base config."""

    pass


class GraphWalksVerifyRequest(BaseVerifyRequest):
    """Verify request carrying the GraphWalks task fields alongside the base request."""

    # Expected node list as a JSON-encoded string; `verify` decodes it with `json.loads`.
    expected_answer: str
    # Task kind; `compute_metrics` uses it as the subset key for the per-type breakdown.
    problem_type: str
    # Optional per-row metadata; not read by the grading code in this module.
    n_tokens: Optional[int] = None
    prompt_chars: Optional[int] = None


class GraphWalksVerifyResponse(GraphWalksVerifyRequest, BaseVerifyResponse):
    """Verify response: the request fields echoed back plus the grading outcome."""

    # F1 between the predicted and expected node sets; `verify` also uses it as the reward.
    f1: float
    # True when no `Final Answer: [...]` line could be parsed from the response.
    parse_failed: bool
    # Nodes parsed from the model's final answer, in the order they appeared.
    predicted_nodes: List[str]
# Matches ``Final Answer: [ ... ]`` and captures the text between the brackets.
_FINAL_ANSWER_RE = re.compile(r"Final Answer:\s*\[(.*)\]")


def _parse_node_list(response: str) -> Tuple[List[str], bool]:
    """Extract the node list from the last non-empty line of a response.

    Only the final non-blank line is inspected; it must contain
    ``Final Answer: [n1, n2, ...]``. The bracket contents are split on commas
    and each item is stripped, with empty items dropped.

    Returns:
        ``(nodes, parse_failed)``. ``parse_failed`` is True when the response
        is empty/``None`` or the final line lacks the expected format. An
        empty ``nodes`` with ``parse_failed=False`` means the model explicitly
        answered with no nodes.

    Reference: https://huggingface.co/datasets/openai/graphwalks
    """
    text = (response or "").strip()

    # Walk backwards to find the last line that is not just whitespace.
    final_line = None
    for candidate in reversed(text.split("\n")):
        if candidate.strip():
            final_line = candidate
            break
    if final_line is None:
        return [], True

    found = _FINAL_ANSWER_RE.search(final_line)
    if found is None:
        return [], True

    nodes: List[str] = []
    for piece in found.group(1).split(","):
        cleaned = piece.strip()
        if cleaned:
            nodes.append(cleaned)
    return nodes, False


def _f1_score(predicted: set, expected: set, parse_failed: bool) -> float:
    """Compute the F1 score between the predicted and expected node sets.

    - parse_failed     → 0.0 (no answer extracted)
    - both sets empty  → 1.0 (model correctly returned nothing)
    - no shared nodes  → 0.0 (this also covers exactly one side being empty)
    - otherwise        → 2·P·R / (P + R)
    """
    if parse_failed:
        return 0.0
    if not predicted and not expected:
        return 1.0

    overlap = len(predicted & expected)
    # An empty intersection also guards the divisions below against zero.
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted)
    recall = overlap / len(expected)
    return 2 * precision * recall / (precision + recall)
+ num_repeats: 1 + license: MIT diff --git a/resources_servers/graphwalks/data/.gitignore b/resources_servers/graphwalks/data/.gitignore new file mode 100644 index 000000000..8d24a9b19 --- /dev/null +++ b/resources_servers/graphwalks/data/.gitignore @@ -0,0 +1,6 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl +*benchmark.jsonl diff --git a/resources_servers/graphwalks/data/example.jsonl b/resources_servers/graphwalks/data/example.jsonl new file mode 100644 index 000000000..45f92fd58 --- /dev/null +++ b/resources_servers/graphwalks/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3\nnode_2 -> node_3, node_4\nnode_3 -> node_5\nnode_4 -> node_5\n\nOperation: Find the parents of node node_3."}]}, "expected_answer": "[\"node_1\", \"node_2\"]", "problem_type": "parents", "n_tokens": 120, "prompt_chars": 480, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. 
Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nalpha -> beta, gamma\nbeta -> delta\ngamma -> delta, epsilon\ndelta -> zeta\nepsilon -> zeta\n\nOperation: Find the parents of node zeta."}]}, "expected_answer": "[\"delta\", \"epsilon\"]", "problem_type": "parents", "n_tokens": 110, "prompt_chars": 430, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3, node_4\nnode_2 -> node_5\nnode_3 -> node_6\nnode_4 -> node_6\nnode_5 -> node_7\n\nOperation: Perform a BFS from node node_0 and return only the nodes at exactly depth 2 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"node_3\", \"node_4\", \"node_5\"]", "problem_type": "bfs", "n_tokens": 150, "prompt_chars": 560, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. 
Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nroot -> a, b\na -> c, d\nb -> e\nc -> f\nd -> f, g\ne -> g\nf -> h\ng -> h\n\nOperation: Perform a BFS from node root and return only the nodes at exactly depth 3 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"f\", \"g\"]", "problem_type": "bfs", "n_tokens": 145, "prompt_chars": 540, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nn0 -> n1\nn1 -> n2, n3\nn2 -> n4\nn3 -> n4, n5\nn4 -> n6\nn5 -> n6\nn6 -> n7\n\nOperation: Find the parents of node n4."}]}, "expected_answer": "[\"n2\", \"n3\"]", "problem_type": "parents", "n_tokens": 130, "prompt_chars": 470, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}} diff --git a/resources_servers/graphwalks/data/example_metrics.json b/resources_servers/graphwalks/data/example_metrics.json new file mode 100644 index 000000000..8a0f7bdb5 --- /dev/null +++ b/resources_servers/graphwalks/data/example_metrics.json @@ -0,0 +1,60 @@ +{ + "name": "example", + "type": "example", + "jsonl_fpath": "resources_servers/graphwalks/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": "MIT", + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 77.2, + "Min": 69.0, + "Max": 90.0, + "Standard deviation": 9.71 + }, + "Number of turns": { + "Total # 
non-null values": 5, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "expected_answer": { + "unique_count": 5, + "total_count": 5 + }, + "problem_type": { + "unique_count": 2, + "total_count": 5 + }, + "n_tokens": { + "Total # non-null values": 5, + "Average": 131.0, + "Min": 110.0, + "Max": 150.0, + "Standard deviation": 16.73 + }, + "prompt_chars": { + "Total # non-null values": 5, + "Average": 496.0, + "Min": 430.0, + "Max": 560.0, + "Standard deviation": 53.2 + } +} \ No newline at end of file diff --git a/resources_servers/graphwalks/requirements.txt b/resources_servers/graphwalks/requirements.txt new file mode 100644 index 000000000..151b4ab7b --- /dev/null +++ b/resources_servers/graphwalks/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ +tiktoken diff --git a/resources_servers/graphwalks/tests/__init__.py b/resources_servers/graphwalks/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/resources_servers/graphwalks/tests/test_app.py b/resources_servers/graphwalks/tests/test_app.py new file mode 100644 index 000000000..6c4cd9a86 --- /dev/null +++ b/resources_servers/graphwalks/tests/test_app.py @@ -0,0 +1,200 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class TestSanity:
    """Smoke test: the server can be constructed from a minimal config."""

    def test_sanity(self) -> None:
        minimal_config = GraphWalksResourcesServerConfig(host="0.0.0.0", port=8080, entrypoint="", name="")
        # A spec'd mock stands in for the real client; construction must not raise.
        stub_client = MagicMock(spec=ServerClient)
        GraphWalksResourcesServer(config=minimal_config, server_client=stub_client)
class TestF1Score:
    """Pins `_f1_score` on each guard branch and on representative overlaps."""

    def test_parse_failed_is_zero(self) -> None:
        # A parse failure zeroes the score even when the two sets agree.
        score = _f1_score({"a"}, {"a"}, parse_failed=True)
        assert score == 0.0

    def test_both_empty_is_one(self) -> None:
        score = _f1_score(set(), set(), parse_failed=False)
        assert score == 1.0

    def test_predicted_empty_expected_nonempty(self) -> None:
        score = _f1_score(set(), {"a"}, parse_failed=False)
        assert score == 0.0

    def test_predicted_nonempty_expected_empty(self) -> None:
        score = _f1_score({"a"}, set(), parse_failed=False)
        assert score == 0.0

    def test_exact_match_is_one(self) -> None:
        nodes = {"a", "b"}
        assert _f1_score(set(nodes), set(nodes), parse_failed=False) == 1.0

    def test_no_overlap_is_zero(self) -> None:
        score = _f1_score({"a"}, {"b"}, parse_failed=False)
        assert score == 0.0

    def test_partial_overlap(self) -> None:
        # One shared node out of two on each side: P = R = 1/2, so F1 = 0.5.
        score = _f1_score({"a", "b"}, {"a", "c"}, parse_failed=False)
        assert math.isclose(score, 0.5)

    def test_unequal_sizes(self) -> None:
        # Three predicted, one expected, one shared: P = 1/3, R = 1, so F1 = 0.5.
        score = _f1_score({"a", "b", "c"}, {"a"}, parse_failed=False)
        assert math.isclose(score, 0.5)
GraphWalksResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + def test_compute_metrics_empty(self, server: GraphWalksResourcesServer) -> None: + assert server.compute_metrics([]) == {} + + def test_compute_metrics_includes_pass_at_k(self, server: GraphWalksResourcesServer) -> None: + tasks = [ + [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}], + [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}], + ] + metrics = server.compute_metrics(tasks) + assert "pass@1/accuracy" in metrics + assert "pass@2/accuracy" in metrics + assert "pass@1[avg-of-2]/accuracy" in metrics + + def test_compute_metrics_includes_subset_breakdown(self, server: GraphWalksResourcesServer) -> None: + """Per-problem-type subset should appear as `problem_type=<value>/...`.""" + tasks = [ + [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}], + [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}], + ] + metrics = server.compute_metrics(tasks) + assert any(k.startswith("problem_type=parents/pass@") for k in metrics) + assert any(k.startswith("problem_type=bfs/pass@") for k in metrics) + # Bare "<value>/..." keys must NOT leak through from compute_subset_metrics. 
class TestGetKeyMetrics:
    """Checks which entries `get_key_metrics` keeps from a full metrics dict."""

    @pytest.fixture
    def server(self) -> GraphWalksResourcesServer:
        server_config = GraphWalksResourcesServerConfig(host="0.0.0.0", port=8080, entrypoint="", name="")
        stub_client = MagicMock(spec=ServerClient)
        return GraphWalksResourcesServer(config=server_config, server_client=stub_client)

    def test_get_key_metrics_picks_highest_k(self, server: GraphWalksResourcesServer) -> None:
        # Lower-k pass@k entries that the test expects to be left out.
        expected_dropped = {
            "pass@1/accuracy": 50.0,
            "pass@2/accuracy": 70.0,
        }
        # Highest-k pass@k entries and token means that the test expects to be kept.
        expected_kept = {
            "pass@4/accuracy": 80.0,
            "pass@1[avg-of-4]/accuracy": 60.0,
            "mean/input_tokens": 1000,
            "mean/output_tokens": 200,
        }
        # Merging in this order reproduces the original input dict's key order.
        agent_metrics = {**expected_dropped, **expected_kept}

        key = server.get_key_metrics(agent_metrics)

        for metric_name, metric_value in expected_kept.items():
            assert key[metric_name] == metric_value
        for metric_name in expected_dropped:
            assert metric_name not in key
a/benchmarks/graphwalks/prepare_n3_1m.py +++ b/benchmarks/graphwalks/prepare_n3_1m.py @@ -32,7 +32,7 @@ from .prepare import prepare as _prepare -TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret MAX_CONTEXT_TOKENS = 1048576 OUTPUT_FPATH = Path(__file__).parent / "data" / "graphwalks_n3_1m_benchmark.jsonl" diff --git a/benchmarks/longbench_v2/prepare_n3_1m.py b/benchmarks/longbench_v2/prepare_n3_1m.py index b8fd6a64d..4959653ab 100644 --- a/benchmarks/longbench_v2/prepare_n3_1m.py +++ b/benchmarks/longbench_v2/prepare_n3_1m.py @@ -30,7 +30,7 @@ from .prepare import prepare as _prepare -TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret MAX_CONTEXT_TOKENS = 1048576 OUTPUT_FPATH = Path(__file__).parent / "data" / "longbench_v2_n3_1m_benchmark.jsonl" diff --git a/benchmarks/longcodebench/prepare_n3_1m.py b/benchmarks/longcodebench/prepare_n3_1m.py index cf2dfcda2..0cb6f9fcb 100644 --- a/benchmarks/longcodebench/prepare_n3_1m.py +++ b/benchmarks/longcodebench/prepare_n3_1m.py @@ -28,7 +28,7 @@ from .prepare import prepare as _prepare -TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret MAX_CONTEXT_TOKENS = 1048576 OUTPUT_FPATH = Path(__file__).parent / "data" / "longcodebench_n3_1m_benchmark.jsonl" diff --git a/benchmarks/mrcr/prepare_n3_128k.py b/benchmarks/mrcr/prepare_n3_128k.py index 6e5ef7400..c75e0faaa 100644 --- a/benchmarks/mrcr/prepare_n3_128k.py +++ b/benchmarks/mrcr/prepare_n3_128k.py @@ -28,7 +28,7 @@ from .prepare import prepare as _prepare -TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret MAX_CONTEXT_TOKENS = 131072 OUTPUT_FPATH = 
Path(__file__).parent / "data" / "mrcr_n3_128k_benchmark.jsonl" diff --git a/benchmarks/mrcr/prepare_n3_1m.py b/benchmarks/mrcr/prepare_n3_1m.py index 28d0fa744..ffcbcdcb6 100644 --- a/benchmarks/mrcr/prepare_n3_1m.py +++ b/benchmarks/mrcr/prepare_n3_1m.py @@ -28,7 +28,7 @@ from .prepare import prepare as _prepare -TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" +TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret MAX_CONTEXT_TOKENS = 1048576 OUTPUT_FPATH = Path(__file__).parent / "data" / "mrcr_n3_1m_benchmark.jsonl" From b4cf8bbd53c6106109d53759825c8e29e57eaab6 Mon Sep 17 00:00:00 2001 From: Cheng-Ping Hsieh Date: Fri, 29 May 2026 14:25:53 -0700 Subject: [PATCH 3/3] save Signed-off-by: Cheng-Ping Hsieh --- benchmarks/graphwalks/config.yaml | 2 +- benchmarks/graphwalks/config_n3_1m.yaml | 2 +- benchmarks/mrcr/config_n3_128k.yaml | 2 +- benchmarks/mrcr/config_n3_1m.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/graphwalks/config.yaml b/benchmarks/graphwalks/config.yaml index 9e741fb02..61746a329 100644 --- a/benchmarks/graphwalks/config.yaml +++ b/benchmarks/graphwalks/config.yaml @@ -25,4 +25,4 @@ graphwalks_benchmark_simple_agent: # placeholder — it only triggers row duplication for # `type: train`/`validation`. To actually get N rollouts per task, # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. - num_repeats: 4 + num_repeats: 1 diff --git a/benchmarks/graphwalks/config_n3_1m.yaml b/benchmarks/graphwalks/config_n3_1m.yaml index 746917e97..cd9ca2ba9 100644 --- a/benchmarks/graphwalks/config_n3_1m.yaml +++ b/benchmarks/graphwalks/config_n3_1m.yaml @@ -25,4 +25,4 @@ graphwalks_n3_1m_benchmark_simple_agent: # placeholder — it only triggers row duplication for # `type: train`/`validation`. To actually get N rollouts per task, # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. 
- num_repeats: 4 + num_repeats: 1 diff --git a/benchmarks/mrcr/config_n3_128k.yaml b/benchmarks/mrcr/config_n3_128k.yaml index df8648acb..362aa9a24 100644 --- a/benchmarks/mrcr/config_n3_128k.yaml +++ b/benchmarks/mrcr/config_n3_128k.yaml @@ -25,4 +25,4 @@ mrcr_n3_128k_benchmark_simple_agent: # placeholder — it only triggers row duplication for # `type: train`/`validation`. To actually get N rollouts per task, # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. - num_repeats: 4 + num_repeats: 1 diff --git a/benchmarks/mrcr/config_n3_1m.yaml b/benchmarks/mrcr/config_n3_1m.yaml index 7f97967a3..c1ab257db 100644 --- a/benchmarks/mrcr/config_n3_1m.yaml +++ b/benchmarks/mrcr/config_n3_1m.yaml @@ -25,4 +25,4 @@ mrcr_n3_1m_benchmark_simple_agent: # placeholder — it only triggers row duplication for # `type: train`/`validation`. To actually get N rollouts per task, # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI. - num_repeats: 4 + num_repeats: 1