diff --git a/README.md b/README.md
index c8a379d73..51252e48e 100644
--- a/README.md
+++ b/README.md
@@ -208,6 +208,7 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace).
| Genrm Compare | rlhf | GenRM pairwise comparison for RLHF training | Compare multiple candidate responses using GenRM model | - | - | - | genrm_compare.yaml | - |
| Google Search | agent | Multi-choice question answering problems with search tools integrated | Improve knowledge-related benchmarks with search tools | ✓ | - | Apache 2.0 | google_search.yaml | Nemotron-RL-knowledge-web_search-mcqa |
| Gpqa Diamond | knowledge | GPQA Diamond multiple-choice question answering problems | Evaluate graduate-level scientific reasoning via MCQ verification | ✓ | - | MIT | gpqa_diamond.yaml | - |
+| Graphwalks | other | Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks | Improve long-context multi-step graph reasoning and adjacency-list traversal | - | - | - | graphwalks.yaml | - |
| Grl Sokoban | games | Single-box Sokoban in Gymnasium API style. | Model emits one move per turn until the puzzle is solved. | - | - | - | grl_sokoban.yaml | - |
| Grl Tetris | games | Tetris in Gymnasium API style. Model emits one or more moves per turn. | Multi-step Tetris environment | - | - | - | grl_tetris.yaml | - |
| Gymnasium | other | Base class for Gymnasium-style servers. Not a standalone server. | Reusable base class for step/reset style environments | - | - | - | gymnasium.yaml | - |
diff --git a/benchmarks/graphwalks/README.md b/benchmarks/graphwalks/README.md
new file mode 100644
index 000000000..526a6e915
--- /dev/null
+++ b/benchmarks/graphwalks/README.md
@@ -0,0 +1,82 @@
+# GraphWalks benchmark
+
+Benchmark wrapper over the [`graphwalks` resources server](../../resources_servers/graphwalks/README.md)
+for the [openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks) dataset.
+
+Each task supplies an adjacency list and asks the model to either list
+the parents of a node (`problem_type: parents`) or return the BFS
+frontier at exactly depth N (`problem_type: bfs`). Scoring is F1 over
+the predicted node set vs. the expected node set, gated on the model
+producing a `Final Answer: [...]` line.
+
+## Variants
+
+Two preset configs ship alongside this benchmark. Both apply the same
+data + Skills prompt fixes (BFS depth disambiguation, self-parent
+removal); they differ only in the tokenizer used for the `n_tokens`
+column and an optional length filter.
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/graphwalks_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/graphwalks_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+## Prepare benchmark data
+
+```bash
+# Default (o200k_base, no filter)
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config.yaml]"
+
+# N3 1M variant
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config_n3_1m.yaml]"
+```
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/graphwalks/prepare.py \
+ --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \
+ --max_context_tokens 131072 \
+ --output_fpath benchmarks/graphwalks/data/graphwalks_llama_128k_benchmark.jsonl
+```
+
+## Start environment
+
+```bash
+ng_run "+config_paths=[benchmarks/graphwalks/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"
+```
+
+## Collect rollouts
+
+```bash
+# Default variant
+ng_collect_rollouts \
+ +agent_name=graphwalks_benchmark_simple_agent \
+ +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_benchmark.jsonl \
+ +output_jsonl_fpath=results/graphwalks_rollouts.jsonl \
+ +num_repeats=4
+
+# N3 1M variant
+ng_collect_rollouts \
+ +agent_name=graphwalks_n3_1m_benchmark_simple_agent \
+ +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl \
+ +output_jsonl_fpath=results/graphwalks_n3_1m_rollouts.jsonl \
+ +num_repeats=4
+```
+
+## Metrics
+
+`compute_metrics()` emits `pass@k/accuracy` and `pass@1[avg-of-k]/accuracy`
+via `compute_pass_majority_metrics`, plus per-`problem_type` subset
+breakdowns via `compute_subset_metrics(subset_key="problem_type")`,
+which yield stratified pass@k keys such as
+`problem_type=parents/pass@4/accuracy` and `problem_type=bfs/pass@4/accuracy`.
+
+For reasoning models, start the vLLM server with a
+`--reasoning-parser` matching the model (e.g. `nano_v3` for Nemotron-3
+or `deepseek_r1`) so that `<think>...</think>` blocks are stripped
+upstream of `Final Answer:` parsing.
diff --git a/benchmarks/graphwalks/__init__.py b/benchmarks/graphwalks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmarks/graphwalks/config.yaml b/benchmarks/graphwalks/config.yaml
new file mode 100644
index 000000000..61746a329
--- /dev/null
+++ b/benchmarks/graphwalks/config.yaml
@@ -0,0 +1,28 @@
+# Chain to the generic graphwalks resources server + agent config.
+config_paths:
+ - resources_servers/graphwalks/configs/graphwalks.yaml
+
+# Benchmark-specific overrides via `_inherit_from` so the base graphwalks config
+# stays isolated from benchmark use.
+
+graphwalks_benchmark_resources_server:
+ _inherit_from: graphwalks_resources_server
+
+graphwalks_benchmark_simple_agent:
+ _inherit_from: graphwalks_simple_agent
+ responses_api_agents:
+ simple_agent:
+ resources_server:
+ name: graphwalks_benchmark_resources_server
+ datasets:
+ - name: graphwalks
+ type: benchmark
+ jsonl_fpath: benchmarks/graphwalks/data/graphwalks_benchmark.jsonl
+ prompt_config: null
+ prepare_script: benchmarks/graphwalks/prepare.py
+ # Rollouts per task for pass@k variance.
+ # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+ # placeholder — it only triggers row duplication for
+ # `type: train`/`validation`. To actually get N rollouts per task,
+ # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+ num_repeats: 1
diff --git a/benchmarks/graphwalks/config_n3_1m.yaml b/benchmarks/graphwalks/config_n3_1m.yaml
new file mode 100644
index 000000000..cd9ca2ba9
--- /dev/null
+++ b/benchmarks/graphwalks/config_n3_1m.yaml
@@ -0,0 +1,28 @@
+# GraphWalks — N3 1M-context variant.
+# Same data + Skills prompt fixes as `config.yaml`, but `prepare_n3_1m.py`
+# counts tokens with the Nemotron-3-Super HF tokenizer and drops samples
+# whose tokenized prompt exceeds 1048576 tokens.
+config_paths:
+ - resources_servers/graphwalks/configs/graphwalks.yaml
+
+graphwalks_n3_1m_benchmark_resources_server:
+ _inherit_from: graphwalks_resources_server
+
+graphwalks_n3_1m_benchmark_simple_agent:
+ _inherit_from: graphwalks_simple_agent
+ responses_api_agents:
+ simple_agent:
+ resources_server:
+ name: graphwalks_n3_1m_benchmark_resources_server
+ datasets:
+ - name: graphwalks_n3_1m
+ type: benchmark
+ jsonl_fpath: benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl
+ prompt_config: null
+ prepare_script: benchmarks/graphwalks/prepare_n3_1m.py
+ # Rollouts per task for pass@k variance.
+ # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+ # placeholder — it only triggers row duplication for
+ # `type: train`/`validation`. To actually get N rollouts per task,
+ # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+ num_repeats: 1
diff --git a/benchmarks/graphwalks/data/.gitignore b/benchmarks/graphwalks/data/.gitignore
new file mode 100644
index 000000000..b06d45fe6
--- /dev/null
+++ b/benchmarks/graphwalks/data/.gitignore
@@ -0,0 +1 @@
+*benchmark.jsonl
diff --git a/benchmarks/graphwalks/prepare.py b/benchmarks/graphwalks/prepare.py
new file mode 100644
index 000000000..cbbd963d8
--- /dev/null
+++ b/benchmarks/graphwalks/prepare.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the GraphWalks benchmark data.
+
+Source: https://huggingface.co/datasets/openai/graphwalks
+
+Ported from:
+ https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/graphwalks/prepare.py
+
+Two upstream-prompt corrections from Skills are preserved here verbatim:
+
+ 1. The BFS prompt is rewritten to disambiguate "depth N" — without
+ this rewrite, models often return nodes at intermediate depths.
+ 2. The expected answer for a parents query sometimes includes the
+ queried node itself; we strip it from the answer set.
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+ python benchmarks/graphwalks/prepare.py \\
+ --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \\
+ --max_context_tokens 131072
+"""
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Callable, Optional
+
+import tiktoken
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+BENCHMARK_DIR = Path(__file__).parent
+DATA_DIR = BENCHMARK_DIR / "data"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "graphwalks_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default
+
+_BFS_PATTERN = re.compile(r"Perform a BFS from node (\S+) with depth (\d+)")
+_BFS_REPLACEMENT = (
+ r"Perform a BFS from node \1 and return only the nodes at exactly depth \2 "
+ r"(not nodes at intermediate depths)"
+)
+_PARENTS_PATTERN = re.compile(r"Find the parents of node ([^\s.]+)\.")
+
+
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+ """Return a ``text -> token_count`` function.
+
+ Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+ encoding, falls back to ``transformers.AutoTokenizer``.
+ """
+ try:
+ enc = tiktoken.get_encoding(tokenizer_name)
+ return lambda text: len(enc.encode(text, disallowed_special=()))
+ except ValueError:
+ from transformers import AutoTokenizer
+
+ hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+ return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+ tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+ max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+ output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
+ output_fpath = Path(output_fpath)
+ output_fpath.parent.mkdir(parents=True, exist_ok=True)
+
+ dataset = load_dataset("openai/graphwalks", split="train")
+ count_tokens = _build_token_counter(tokenizer_name)
+
+ kept = 0
+ skipped_tokens = 0
+ skipped_self_parent = 0
+ with output_fpath.open("w", encoding="utf-8") as fout:
+ for entry in tqdm(dataset, desc="Preparing GraphWalks"):
+ prompt_text = entry["prompt"]
+ answer_nodes = list(entry["answer_nodes"])
+
+ # Skills fix #1: disambiguate BFS depth.
+ prompt_text = _BFS_PATTERN.sub(_BFS_REPLACEMENT, prompt_text)
+
+ # Skills fix #2: strip the queried node from its own parents answer.
+ m = _PARENTS_PATTERN.search(prompt_text)
+ target = m.group(1) if m else None
+ if target is not None and target in answer_nodes:
+ answer_nodes.remove(target)
+ skipped_self_parent += 1
+
+ n_tokens = count_tokens(prompt_text)
+ if max_context_tokens is not None and n_tokens > max_context_tokens:
+ skipped_tokens += 1
+ continue
+
+ sample = {
+ "responses_create_params": {"input": [{"role": "user", "content": prompt_text}]},
+ "expected_answer": json.dumps(sorted(answer_nodes)),
+ "problem_type": entry["problem_type"],
+ "n_tokens": n_tokens,
+ "prompt_chars": entry["prompt_chars"],
+ }
+ fout.write(json.dumps(sample, ensure_ascii=False) + "\n")
+ kept += 1
+
+ cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+ print(
+ f"Wrote {kept} samples to {output_fpath} "
+ f"(tokenizer={tokenizer_name}, cap={cap_str}; "
+ f"dropped {skipped_tokens} over cap; cleaned {skipped_self_parent} self-parent answers)"
+ )
+ return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--tokenizer_name",
+ default=DEFAULT_TOKENIZER_NAME,
+ help=(
+ "Tokenizer used for token counting. Accepts a tiktoken encoding name "
+ "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id "
+ "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+ f"Default: {DEFAULT_TOKENIZER_NAME}"
+ ),
+ )
+ parser.add_argument(
+ "--max_context_tokens",
+ type=int,
+ default=DEFAULT_MAX_CONTEXT_TOKENS,
+ help=(
+ "Drop samples whose tokenized prompt exceeds this many tokens. "
+ "Omit (or pass a negative number) for no filter. "
+ f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+ ),
+ )
+ parser.add_argument(
+ "--output_fpath",
+ type=Path,
+ default=DEFAULT_OUTPUT_FPATH,
+ help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+ )
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = _parse_args()
+ cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+ prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/graphwalks/prepare_n3_1m.py b/benchmarks/graphwalks/prepare_n3_1m.py
new file mode 100644
index 000000000..36d84c456
--- /dev/null
+++ b/benchmarks/graphwalks/prepare_n3_1m.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraphWalks variant: Nemotron-3-Super tokenizer with a 1M token cap.
+
+Same data + Skills fixes as ``prepare.py``, but counts tokens with the
+``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace tokenizer
+and drops samples whose tokenized prompt exceeds 1048576 tokens
+(Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+
+The output JSONL lives alongside the default at
+``data/graphwalks_n3_1m_benchmark.jsonl`` so both variants can
+coexist.
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "graphwalks_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+ return _prepare(
+ tokenizer_name=TOKENIZER_NAME,
+ max_context_tokens=MAX_CONTEXT_TOKENS,
+ output_fpath=OUTPUT_FPATH,
+ )
+
+
+if __name__ == "__main__":
+ prepare()
diff --git a/benchmarks/longbench_v2/README.md b/benchmarks/longbench_v2/README.md
index 41da06adf..61fbd756b 100644
--- a/benchmarks/longbench_v2/README.md
+++ b/benchmarks/longbench_v2/README.md
@@ -13,26 +13,58 @@ grading; this directory adds only the dataset and prompt.
Data source: HuggingFace `THUDM/LongBench-v2` (single "train" split,
which is the full eval set). `prepare.py` preserves every Skills
field (`index`, `context`, `question`, `choice_A..D`, `expected_answer`,
-`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens` via
-tiktoken `cl100k_base`) and additionally emits `options` and
-`grading_mode` for the mcqa server.
+`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens`) and
+additionally emits `options` and `grading_mode` for the mcqa server.
+
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longbench_v2_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longbench_v2_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`). LongBench-v2 contexts
+span 8k-2M words, so the long-bucket rows above 1M tokens are filtered
+out under the N3 1M cap.
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/longbench_v2/prepare.py \
+ --tokenizer_name cl100k_base \
+ --max_context_tokens 131072 \
+ --output_fpath benchmarks/longbench_v2/data/longbench_v2_cl100k_128k_benchmark.jsonl
+```
## Example usage
```bash
-# Prepare benchmark data
+# Prepare benchmark data (default)
ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config.yaml]"
+# Prepare benchmark data (N3 1M variant)
+ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config_n3_1m.yaml]"
+
# Running servers
config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
benchmarks/longbench_v2/config.yaml"
ng_run "+config_paths=[$config_paths]"
-# Collecting rollouts
+# Collecting rollouts — default
ng_collect_rollouts \
+agent_name=longbench_v2_mcqa_simple_agent \
+input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_benchmark.jsonl \
+output_jsonl_fpath=results/longbench_v2_rollouts.jsonl \
+prompt_config=benchmarks/longbench_v2/prompts/default.yaml \
+num_repeats=4
+
+# Collecting rollouts — N3 1M
+ng_collect_rollouts \
+ +agent_name=longbench_v2_n3_1m_mcqa_simple_agent \
+ +input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl \
+ +output_jsonl_fpath=results/longbench_v2_n3_1m_rollouts.jsonl \
+ +prompt_config=benchmarks/longbench_v2/prompts/default.yaml \
+ +num_repeats=4
```
diff --git a/benchmarks/longbench_v2/config_n3_1m.yaml b/benchmarks/longbench_v2/config_n3_1m.yaml
new file mode 100644
index 000000000..d532cc078
--- /dev/null
+++ b/benchmarks/longbench_v2/config_n3_1m.yaml
@@ -0,0 +1,18 @@
+# LongBench-v2 — N3 1M-context variant.
+# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts
+# `context_tokens` with the Nemotron-3-Super HF tokenizer and drops
+# samples whose tokenized context exceeds 1048576 tokens.
+config_paths:
+ - resources_servers/mcqa/configs/mcqa.yaml
+
+longbench_v2_n3_1m_mcqa_simple_agent:
+ _inherit_from: mcqa_simple_agent
+ responses_api_agents:
+ simple_agent:
+ datasets:
+ - name: longbench_v2_n3_1m
+ type: benchmark
+ jsonl_fpath: benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl
+ prompt_config: benchmarks/longbench_v2/prompts/default.yaml
+ prepare_script: benchmarks/longbench_v2/prepare_n3_1m.py
+ license: Apache 2.0
diff --git a/benchmarks/longbench_v2/prepare.py b/benchmarks/longbench_v2/prepare.py
index 020721f8b..702b73878 100644
--- a/benchmarks/longbench_v2/prepare.py
+++ b/benchmarks/longbench_v2/prepare.py
@@ -28,10 +28,28 @@
Dataset: https://huggingface.co/datasets/THUDM/LongBench-v2
Paper: https://arxiv.org/abs/2412.15204
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the
+``context_tokens`` field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+ python benchmarks/longbench_v2/prepare.py \\
+ --tokenizer_name cl100k_base \\
+ --max_context_tokens 131072
"""
+import argparse
import json
from pathlib import Path
+from typing import Callable, Optional
import tiktoken
from datasets import load_dataset
@@ -40,23 +58,53 @@
BENCHMARK_DIR = Path(__file__).parent
DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default
+
-# tiktoken encoding name used by Skills' prepare.py for `context_tokens`.
-TOKENIZER_NAME = "cl100k_base"
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+ """Return a ``text -> token_count`` function.
+ Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+ encoding, falls back to ``transformers.AutoTokenizer``. The tiktoken
+ path uses ``disallowed_special=()`` because LongBench-v2 contexts
+ sometimes contain raw ``<|endoftext|>`` strings that tiktoken would
+ otherwise refuse to encode.
+ """
+ try:
+ enc = tiktoken.get_encoding(tokenizer_name)
+ return lambda text: len(enc.encode(text, disallowed_special=()))
+ except ValueError:
+ from transformers import AutoTokenizer
-def prepare() -> Path:
+ hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+ return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+ tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+ max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+ output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
"""Download LongBench-v2, convert to Gym JSONL, return the output file path."""
- DATA_DIR.mkdir(parents=True, exist_ok=True)
+ output_fpath = Path(output_fpath)
+ output_fpath.parent.mkdir(parents=True, exist_ok=True)
- print(f"Loading THUDM/LongBench-v2 (split='train', {TOKENIZER_NAME} for context tokens) ...")
+ print(f"Loading THUDM/LongBench-v2 (split='train', tokenizer='{tokenizer_name}') ...")
dataset = load_dataset("THUDM/LongBench-v2", split="train")
- encoder = tiktoken.get_encoding(TOKENIZER_NAME)
+ count_tokens = _build_token_counter(tokenizer_name)
+
+ kept = 0
+ skipped = 0
+ with open(output_fpath, "w", encoding="utf-8") as out:
+ for entry in tqdm(dataset, desc=f"Writing {output_fpath.name}"):
+ context_tokens = count_tokens(entry["context"])
+ if max_context_tokens is not None and context_tokens > max_context_tokens:
+ skipped += 1
+ continue
- count = 0
- with open(OUTPUT_FPATH, "w", encoding="utf-8") as out:
- for entry in tqdm(dataset, desc="Writing longbench_v2_benchmark.jsonl"):
record = {
# Fields preserved verbatim from Skills' prepare.py
"index": entry["_id"],
@@ -71,11 +119,7 @@ def prepare() -> Path:
"sub_domain": entry["sub_domain"],
"difficulty": entry["difficulty"],
"length": entry["length"],
- # disallowed_special=() — some LongBench-v2 contexts contain
- # raw `<|endoftext|>` strings that tiktoken would otherwise
- # refuse to encode. We only need the count, so encode them
- # as plain text.
- "context_tokens": len(encoder.encode(entry["context"], disallowed_special=())),
+ "context_tokens": context_tokens,
# Gym-side additions consumed by the `mcqa` resource server.
# mcqa's verify() reads `options`, `expected_answer`, `grading_mode`.
"options": [
@@ -87,11 +131,48 @@ def prepare() -> Path:
"grading_mode": "strict_single_letter_boxed",
}
out.write(json.dumps(record, ensure_ascii=False) + "\n")
- count += 1
+ kept += 1
+
+ cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+ print(
+ f"Wrote {kept} problems to {output_fpath} "
+ f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped} over cap)"
+ )
+ return output_fpath
+
- print(f"Wrote {count} problems to {OUTPUT_FPATH}")
- return OUTPUT_FPATH
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--tokenizer_name",
+ default=DEFAULT_TOKENIZER_NAME,
+ help=(
+ "Tokenizer used for the context_tokens count and length filter. "
+ "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') "
+ "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+ f"Default: {DEFAULT_TOKENIZER_NAME}"
+ ),
+ )
+ parser.add_argument(
+ "--max_context_tokens",
+ type=int,
+ default=DEFAULT_MAX_CONTEXT_TOKENS,
+ help=(
+ "Drop samples whose tokenized context exceeds this many tokens. "
+ "Omit (or pass a negative number) for no filter. "
+ f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+ ),
+ )
+ parser.add_argument(
+ "--output_fpath",
+ type=Path,
+ default=DEFAULT_OUTPUT_FPATH,
+ help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+ )
+ return parser.parse_args()
if __name__ == "__main__":
- prepare()
+ args = _parse_args()
+ cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+ prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/longbench_v2/prepare_n3_1m.py b/benchmarks/longbench_v2/prepare_n3_1m.py
new file mode 100644
index 000000000..4959653ab
--- /dev/null
+++ b/benchmarks/longbench_v2/prepare_n3_1m.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LongBench-v2 variant: Nemotron-3-Super tokenizer with a 1M context cap.
+
+Same data + fields as ``prepare.py``, but counts ``context_tokens``
+with the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized context exceeds 1048576
+tokens (Nemotron-3-Super's native 1M context window). LongBench-v2
+contexts span 8k-2M words, so the long-bucket rows above 1M tokens
+are filtered out.
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "longbench_v2_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+ return _prepare(
+ tokenizer_name=TOKENIZER_NAME,
+ max_context_tokens=MAX_CONTEXT_TOKENS,
+ output_fpath=OUTPUT_FPATH,
+ )
+
+
+if __name__ == "__main__":
+ prepare()
diff --git a/benchmarks/longcodebench/README.md b/benchmarks/longcodebench/README.md
index 64ca598d1..60192ab36 100644
--- a/benchmarks/longcodebench/README.md
+++ b/benchmarks/longcodebench/README.md
@@ -12,22 +12,53 @@ the long code prompt plus the postfix; the shared
wraps it as a single user message, mirroring NeMo Skills' `prompt_format=openai`
behaviour.
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longcodebench_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longcodebench_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/longcodebench/prepare.py \
+ --tokenizer_name cl100k_base \
+ --max_context_tokens 131072 \
+ --output_fpath benchmarks/longcodebench/data/longcodebench_cl100k_128k_benchmark.jsonl
+```
+
## Example usage
```bash
-# Prepare benchmark data
+# Prepare benchmark data (default)
ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config.yaml]"
+# Prepare benchmark data (N3 1M variant)
+ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config_n3_1m.yaml]"
+
# Running servers
config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
benchmarks/longcodebench/config.yaml"
ng_run "+config_paths=[$config_paths]"
-# Collecting rollouts
+# Collecting rollouts — default
ng_collect_rollouts \
+agent_name=longcodebench_mcqa_simple_agent \
+input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_benchmark.jsonl \
+output_jsonl_fpath=results/longcodebench_rollouts.jsonl \
+prompt_config=benchmarks/prompts/generic/default.yaml \
+num_repeats=4
+
+# Collecting rollouts — N3 1M
+ng_collect_rollouts \
+ +agent_name=longcodebench_n3_1m_mcqa_simple_agent \
+ +input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl \
+ +output_jsonl_fpath=results/longcodebench_n3_1m_rollouts.jsonl \
+ +prompt_config=benchmarks/prompts/generic/default.yaml \
+ +num_repeats=4
```
diff --git a/benchmarks/longcodebench/config_n3_1m.yaml b/benchmarks/longcodebench/config_n3_1m.yaml
new file mode 100644
index 000000000..bc1ed0807
--- /dev/null
+++ b/benchmarks/longcodebench/config_n3_1m.yaml
@@ -0,0 +1,18 @@
+# LongCodeBench — N3 1M-context variant.
+# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts
+# `n_tokens` with the Nemotron-3-Super HF tokenizer and drops samples
+# whose tokenized prompt exceeds 1048576 tokens.
+config_paths:
+ - resources_servers/mcqa/configs/mcqa.yaml
+
+longcodebench_n3_1m_mcqa_simple_agent:
+ _inherit_from: mcqa_simple_agent
+ responses_api_agents:
+ simple_agent:
+ datasets:
+ - name: longcodebench_n3_1m
+ type: benchmark
+ jsonl_fpath: benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl
+ prompt_config: benchmarks/prompts/generic/default.yaml
+ prepare_script: benchmarks/longcodebench/prepare_n3_1m.py
+ license: Creative Commons Attribution 4.0 International
diff --git a/benchmarks/longcodebench/prepare.py b/benchmarks/longcodebench/prepare.py
index db8a90cb0..a650c2a53 100644
--- a/benchmarks/longcodebench/prepare.py
+++ b/benchmarks/longcodebench/prepare.py
@@ -27,20 +27,35 @@
purely to populate the server's `allowed_letters` set; the option text is not
used for grading because the postfix forces a `\\boxed{X}` answer.
-Skills' prepare also stores a `n_tokens_cl100k_base` field counted with
-tiktoken. The mcqa verifier never reads it; we omit it on the Gym side to
-avoid pulling tiktoken into Gym's main dependency set just for one
-benchmark's metadata column.
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+ python benchmarks/longcodebench/prepare.py \\
+ --tokenizer_name cl100k_base \\
+ --max_context_tokens 131072
"""
+import argparse
import json
import uuid
from pathlib import Path
+from typing import Callable, Optional
+
+import tiktoken
BENCHMARK_DIR = Path(__file__).parent
DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "longcodebench_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "longcodebench_benchmark.jsonl"
OPTION_LETTERS = ("A", "B", "C", "D")
POSTFIX = (
@@ -48,25 +63,58 @@
"'Answer: \\boxed{A/B/C/D}' (e.g. 'Answer: \\boxed{A}')."
)
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default
+
+
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+ """Return a ``text -> token_count`` function.
-def prepare() -> Path:
+ Tries ``tiktoken.get_encoding`` first; falls back to
+ ``transformers.AutoTokenizer`` for HuggingFace model ids.
+ """
+ try:
+ enc = tiktoken.get_encoding(tokenizer_name)
+ return lambda text: len(enc.encode(text, disallowed_special=()))
+ except ValueError:
+ from transformers import AutoTokenizer
+
+ hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+ return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+ tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+ max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+ output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
"""Download LongCodeBench LongCodeQA from HuggingFace and write Gym JSONL."""
from datasets import load_dataset
- print("Downloading LongCodeBench LongCodeQA from HuggingFace...")
+ output_fpath = Path(output_fpath)
+ output_fpath.parent.mkdir(parents=True, exist_ok=True)
+
+ print(f"Downloading LongCodeBench LongCodeQA (tokenizer='{tokenizer_name}') ...")
ds = load_dataset("json", data_files="hf://datasets/Steefano/LCB/LongCodeQA.zip")
data = ds["train"]
- DATA_DIR.mkdir(parents=True, exist_ok=True)
+ count_tokens = _build_token_counter(tokenizer_name)
# Empty-text option dicts: the mcqa server only consumes the option *keys*
# for `strict_single_letter_boxed` grading; option text is irrelevant since
# the prompt postfix forces the model to emit `\boxed{}`.
options = [{letter: ""} for letter in OPTION_LETTERS]
+ kept = 0
+ skipped = 0
rows = []
for entry in data:
question = entry["prompt"].strip() + POSTFIX
+ n_tokens = count_tokens(question)
+ if max_context_tokens is not None and n_tokens > max_context_tokens:
+ skipped += 1
+ continue
+
row = {
"question": question,
"options": options,
@@ -76,15 +124,54 @@ def prepare() -> Path:
"repo": entry["repo"],
"prompt_goal": entry["prompt_goal"],
"is_hard": entry["is_hard"],
+ "n_tokens": n_tokens,
}
rows.append(json.dumps(row) + "\n")
+ kept += 1
- with open(OUTPUT_FPATH, "w", encoding="utf-8") as f:
+ with open(output_fpath, "w", encoding="utf-8") as f:
f.writelines(rows)
- print(f"Wrote {len(rows)} problems to {OUTPUT_FPATH}")
- return OUTPUT_FPATH
+ cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+ print(
+ f"Wrote {kept} problems to {output_fpath} "
+ f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped} over cap)"
+ )
+ return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--tokenizer_name",
+ default=DEFAULT_TOKENIZER_NAME,
+ help=(
+ "Tokenizer used for the n_tokens count and length filter. "
+ "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') "
+ "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+ f"Default: {DEFAULT_TOKENIZER_NAME}"
+ ),
+ )
+ parser.add_argument(
+ "--max_context_tokens",
+ type=int,
+ default=DEFAULT_MAX_CONTEXT_TOKENS,
+ help=(
+ "Drop samples whose tokenized prompt exceeds this many tokens. "
+ "Omit (or pass a negative number) for no filter. "
+ f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+ ),
+ )
+ parser.add_argument(
+ "--output_fpath",
+ type=Path,
+ default=DEFAULT_OUTPUT_FPATH,
+ help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+ )
+ return parser.parse_args()
if __name__ == "__main__":
- prepare()
+ args = _parse_args()
+ cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+ prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/longcodebench/prepare_n3_1m.py b/benchmarks/longcodebench/prepare_n3_1m.py
new file mode 100644
index 000000000..0cb6f9fcb
--- /dev/null
+++ b/benchmarks/longcodebench/prepare_n3_1m.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LongCodeBench variant: Nemotron-3-Super tokenizer with a 1M context cap.
+
+Same data + fields as ``prepare.py``, but counts ``n_tokens`` with the
+``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace tokenizer
+and drops samples whose tokenized prompt exceeds 1048576 tokens
+(Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "longcodebench_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+ return _prepare(
+ tokenizer_name=TOKENIZER_NAME,
+ max_context_tokens=MAX_CONTEXT_TOKENS,
+ output_fpath=OUTPUT_FPATH,
+ )
+
+
+if __name__ == "__main__":
+ prepare()
diff --git a/benchmarks/mrcr/README.md b/benchmarks/mrcr/README.md
index 535bf9002..d31ae8ec0 100644
--- a/benchmarks/mrcr/README.md
+++ b/benchmarks/mrcr/README.md
@@ -8,17 +8,39 @@ to the Nth occurrence and reproduce it exactly" instruction. Scoring:
`SequenceMatcher.ratio()` between stripped response and stripped expected
answer, gated on the response starting with the random prefix.
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/mrcr_benchmark.jsonl` |
+| N3 128k | `config_n3_128k.yaml` | `prepare_n3_128k.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `131072` | `data/mrcr_n3_128k_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/mrcr_n3_1m_benchmark.jsonl` |
+
+The N3 variants require HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/mrcr/prepare.py \
+ --tokenizer_name nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \
+ --max_context_tokens 131072 \
+ --output_fpath benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl
+```
+
## Prepare benchmark data
```bash
+# Default (o200k_base, no filter)
ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config.yaml]"
-```
-Downloads the HF dataset, token-counts each sample with `tiktoken o200k_base`,
-and writes `benchmarks/mrcr/data/mrcr_benchmark.jsonl`. Samples over 200000
-input tokens are dropped to leave headroom for model-side tokenizers (which
-can be 7–10% heavier than tiktoken) to stay under a 262144-token native
-context.
+# N3 128k variant
+ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_128k.yaml]"
+
+# N3 1M variant
+ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_1m.yaml]"
+```
## Start environment
@@ -29,11 +51,26 @@ ng_run "+config_paths=[benchmarks/mrcr/config.yaml,responses_api_models/vllm_mod
## Collect rollouts
```bash
+# Default variant
ng_collect_rollouts \
+agent_name=mrcr_benchmark_simple_agent \
+input_jsonl_fpath=benchmarks/mrcr/data/mrcr_benchmark.jsonl \
+output_jsonl_fpath=results/mrcr_rollouts.jsonl \
+num_repeats=4
+
+# N3 128k variant
+ng_collect_rollouts \
+ +agent_name=mrcr_n3_128k_benchmark_simple_agent \
+ +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl \
+ +output_jsonl_fpath=results/mrcr_n3_128k_rollouts.jsonl \
+ +num_repeats=4
+
+# N3 1M variant
+ng_collect_rollouts \
+ +agent_name=mrcr_n3_1m_benchmark_simple_agent \
+ +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl \
+ +output_jsonl_fpath=results/mrcr_n3_1m_rollouts.jsonl \
+ +num_repeats=4
```
## Metrics
diff --git a/benchmarks/mrcr/config_n3_128k.yaml b/benchmarks/mrcr/config_n3_128k.yaml
new file mode 100644
index 000000000..362aa9a24
--- /dev/null
+++ b/benchmarks/mrcr/config_n3_128k.yaml
@@ -0,0 +1,28 @@
+# MRCR — Nemotron-3-Super 128k-context variant.
+# Same data + grading as `config.yaml`, but `prepare_n3_128k.py` counts
+# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose
+# tokenized conversation exceeds 131072 tokens.
+config_paths:
+ - resources_servers/mrcr/configs/mrcr.yaml
+
+mrcr_n3_128k_benchmark_resources_server:
+ _inherit_from: mrcr_resources_server
+
+mrcr_n3_128k_benchmark_simple_agent:
+ _inherit_from: mrcr_simple_agent
+ responses_api_agents:
+ simple_agent:
+ resources_server:
+ name: mrcr_n3_128k_benchmark_resources_server
+ datasets:
+ - name: mrcr_n3_128k
+ type: benchmark
+ jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl
+ prompt_config: null
+ prepare_script: benchmarks/mrcr/prepare_n3_128k.py
+ # Rollouts per task for pass@k variance.
+ # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+ # placeholder — it only triggers row duplication for
+ # `type: train`/`validation`. To actually get N rollouts per task,
+ # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+ num_repeats: 1
diff --git a/benchmarks/mrcr/config_n3_1m.yaml b/benchmarks/mrcr/config_n3_1m.yaml
new file mode 100644
index 000000000..c1ab257db
--- /dev/null
+++ b/benchmarks/mrcr/config_n3_1m.yaml
@@ -0,0 +1,28 @@
+# MRCR — Nemotron-3-Super 1M-context variant.
+# Same data + grading as `config.yaml`, but `prepare_n3_1m.py` counts
+# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose
+# tokenized conversation exceeds 1048576 tokens.
+config_paths:
+ - resources_servers/mrcr/configs/mrcr.yaml
+
+mrcr_n3_1m_benchmark_resources_server:
+ _inherit_from: mrcr_resources_server
+
+mrcr_n3_1m_benchmark_simple_agent:
+ _inherit_from: mrcr_simple_agent
+ responses_api_agents:
+ simple_agent:
+ resources_server:
+ name: mrcr_n3_1m_benchmark_resources_server
+ datasets:
+ - name: mrcr_n3_1m
+ type: benchmark
+ jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl
+ prompt_config: null
+ prepare_script: benchmarks/mrcr/prepare_n3_1m.py
+ # Rollouts per task for pass@k variance.
+ # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+ # placeholder — it only triggers row duplication for
+ # `type: train`/`validation`. To actually get N rollouts per task,
+ # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+ num_repeats: 1
diff --git a/benchmarks/mrcr/prepare.py b/benchmarks/mrcr/prepare.py
index 0bcfad81a..2ef8e767b 100644
--- a/benchmarks/mrcr/prepare.py
+++ b/benchmarks/mrcr/prepare.py
@@ -20,18 +20,30 @@
https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/mrcr/prepare.py
Each row in the upstream dataset has a `prompt` field that is a JSON-stringified
-list of OpenAI chat messages. We parse it into `responses_create_params.input`,
-count tokens with tiktoken `o200k_base` (same tokenizer used by the official
-MRCR grading setup), and filter to samples that fit in the model context.
-
-The 200000-token cap leaves headroom for tokenizer drift: a model's own
-tokenizer can produce ~7-10% more tokens than tiktoken `o200k_base`, so
-filtering at 200K tiktoken keeps the model-side worst-case near 220K, which
-combined with ~32K generation stays under a 262144-token native context.
+list of OpenAI chat messages. We parse it into `responses_create_params.input`
+and count tokens by summing the per-message tokenized lengths.
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For the 128k- and 1M-context variants using
+the Nemotron-3-Super HF tokenizer, see ``prepare_n3_128k.py`` /
+``prepare_n3_1m.py`` and the matching ``config_n3_*.yaml`` files.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+ python benchmarks/mrcr/prepare.py \\
+ --tokenizer_name nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \\
+ --max_context_tokens 131072
"""
+import argparse
import json
from pathlib import Path
+from typing import Callable, Optional
import tiktoken
from datasets import load_dataset
@@ -40,30 +52,55 @@
BENCHMARK_DIR = Path(__file__).parent
DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "mrcr_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "mrcr_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None # no filter by default
-MAX_CONTEXT_TOKENS = 200000
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+ """Return a ``text -> token_count`` function.
-def _count_tokens(messages: list[dict]) -> int:
- """Token count using the o200k_base tokenizer — same as Skills prepare."""
- enc = tiktoken.get_encoding("o200k_base")
- return sum(len(enc.encode(m["content"])) for m in messages)
+ Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+ encoding, falls back to ``transformers.AutoTokenizer``.
+ """
+ try:
+ enc = tiktoken.get_encoding(tokenizer_name)
+ return lambda text: len(enc.encode(text, disallowed_special=()))
+ except ValueError:
+ from transformers import AutoTokenizer
+ hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+ return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
-def prepare() -> Path:
- DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+def _count_message_tokens(messages: list[dict], count_one: Callable[[str], int]) -> int:
+ """Sum tokens across every message's ``content`` field.
+
+ Matches the per-message summing used by ``nemo_skills/dataset/mrcr/prepare.py``
+ and the official openai/mrcr grading setup.
+ """
+ return sum(count_one(m["content"]) for m in messages)
+
+
+def prepare(
+ tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+ max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+ output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
+ output_fpath = Path(output_fpath)
+ output_fpath.parent.mkdir(parents=True, exist_ok=True)
dataset = load_dataset("openai/mrcr", split="train")
+ count_one = _build_token_counter(tokenizer_name)
kept = 0
skipped_tokens = 0
- with OUTPUT_FPATH.open("w", encoding="utf-8") as fout:
- for idx, entry in tqdm(enumerate(dataset), desc="Preparing MRCR"):
+ with output_fpath.open("w", encoding="utf-8") as fout:
+ for entry in tqdm(dataset, desc="Preparing MRCR"):
messages = json.loads(entry["prompt"])
-
- n_tokens = _count_tokens(messages)
- if n_tokens > MAX_CONTEXT_TOKENS:
+ n_tokens = _count_message_tokens(messages, count_one)
+ if max_context_tokens is not None and n_tokens > max_context_tokens:
skipped_tokens += 1
continue
@@ -77,9 +114,46 @@ def prepare() -> Path:
fout.write(json.dumps(sample) + "\n")
kept += 1
- print(f"Wrote {kept} samples to {OUTPUT_FPATH} (skipped {skipped_tokens} with >{MAX_CONTEXT_TOKENS} tokens)")
- return OUTPUT_FPATH
+ cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+ print(
+ f"Wrote {kept} samples to {output_fpath} "
+ f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped_tokens} over cap)"
+ )
+ return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--tokenizer_name",
+ default=DEFAULT_TOKENIZER_NAME,
+ help=(
+ "Tokenizer used for token counting. Accepts a tiktoken encoding name "
+ "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id "
+ "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+ f"Default: {DEFAULT_TOKENIZER_NAME}"
+ ),
+ )
+ parser.add_argument(
+ "--max_context_tokens",
+ type=int,
+ default=DEFAULT_MAX_CONTEXT_TOKENS,
+ help=(
+ "Drop samples whose tokenized conversation exceeds this many tokens. "
+ "Omit (or pass a negative number) for no filter. "
+ f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+ ),
+ )
+ parser.add_argument(
+ "--output_fpath",
+ type=Path,
+ default=DEFAULT_OUTPUT_FPATH,
+ help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+ )
+ return parser.parse_args()
if __name__ == "__main__":
- prepare()
+ args = _parse_args()
+ cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+ prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/mrcr/prepare_n3_128k.py b/benchmarks/mrcr/prepare_n3_128k.py
new file mode 100644
index 000000000..c75e0faaa
--- /dev/null
+++ b/benchmarks/mrcr/prepare_n3_128k.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MRCR variant: Nemotron-3-Super tokenizer with a 128k token cap.
+
+Same data + grading as ``prepare.py``, but counts ``n_tokens`` with
+the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized conversation exceeds
+131072 tokens.
+
+Paired with ``config_n3_128k.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 131072
+OUTPUT_FPATH = Path(__file__).parent / "data" / "mrcr_n3_128k_benchmark.jsonl"
+
+
+def prepare() -> Path:
+ return _prepare(
+ tokenizer_name=TOKENIZER_NAME,
+ max_context_tokens=MAX_CONTEXT_TOKENS,
+ output_fpath=OUTPUT_FPATH,
+ )
+
+
+if __name__ == "__main__":
+ prepare()
diff --git a/benchmarks/mrcr/prepare_n3_1m.py b/benchmarks/mrcr/prepare_n3_1m.py
new file mode 100644
index 000000000..ffcbcdcb6
--- /dev/null
+++ b/benchmarks/mrcr/prepare_n3_1m.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MRCR variant: Nemotron-3-Super tokenizer with a 1M token cap.
+
+Same data + grading as ``prepare.py``, but counts ``n_tokens`` with
+the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized conversation exceeds
+1048576 tokens (Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "mrcr_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+ return _prepare(
+ tokenizer_name=TOKENIZER_NAME,
+ max_context_tokens=MAX_CONTEXT_TOKENS,
+ output_fpath=OUTPUT_FPATH,
+ )
+
+
+if __name__ == "__main__":
+ prepare()
diff --git a/resources_servers/graphwalks/README.md b/resources_servers/graphwalks/README.md
new file mode 100644
index 000000000..5f11f4de5
--- /dev/null
+++ b/resources_servers/graphwalks/README.md
@@ -0,0 +1,46 @@
+# GraphWalks resources server
+
+OpenAI's [GraphWalks](https://huggingface.co/datasets/openai/graphwalks)
+long-context benchmark. Each task provides an adjacency list (often
+massive) and asks the model either to:
+
+- **parents**: list every parent of a target node, or
+- **bfs**: list every node reachable at exactly depth N via BFS from a
+ source node.
+
+## Scoring
+
+1. The model must end its response with a line of the form
+ `Final Answer: [n1, n2, ...]`. If the format is missing,
+ `parse_failed=True` and reward=0.
+2. Otherwise reward is the **F1 score** between the predicted node
+ set and the expected node set (continuous in [0, 1]):
+ - both empty → 1.0
+ - one empty (the other non-empty) → 0.0
+ - else `2·P·R / (P + R)`
+
+Grader ported from
+https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluator/graphwalks.py.
+
+## Start environment
+
+```bash
+ng_run "+config_paths=[resources_servers/graphwalks/configs/graphwalks.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"
+```
+
+## Collect example rollouts
+
+```bash
+ng_collect_rollouts \
+ +agent_name=graphwalks_simple_agent \
+ +input_jsonl_fpath=resources_servers/graphwalks/data/example.jsonl \
+ +output_jsonl_fpath=resources_servers/graphwalks/data/example_rollouts.jsonl
+```
+
+For the full benchmark run see
+[`benchmarks/graphwalks/README.md`](../../benchmarks/graphwalks/README.md).
+
+## Licensing
+
+- Code: Apache 2.0
+- Data ([openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks)): see upstream license
diff --git a/resources_servers/graphwalks/app.py b/resources_servers/graphwalks/app.py
new file mode 100644
index 000000000..6ab73f2d5
--- /dev/null
+++ b/resources_servers/graphwalks/app.py
@@ -0,0 +1,170 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraphWalks (OpenAI) resources server.
+
+Implements F1-over-node-sets grading from the official
+[openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks)
+benchmark. Each task asks the model either to (a) list the parents of a
+node or (b) return BFS-reachable nodes at exactly a given depth.
+
+Ported from:
+ https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluator/graphwalks.py
+
+Scoring:
+ - Parse the model's final line for ``Final Answer: [n1, n2, ...]``.
+ If absent, ``parse_failed=True`` and reward=0.
+ - Otherwise compute F1 between the predicted node set and the
+ expected node set. Empty-vs-empty matches as F1=1.0; either
+ side empty (with the other non-empty) is F1=0.
+ - Reward is the F1 score in [0, 1] — continuous, like MRCR.
+"""
+
+import json
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+from nemo_gym.reward_profile import (
+ compute_pass_majority_metrics,
+ compute_subset_metrics,
+ highest_k_metrics,
+)
+
+
+class GraphWalksResourcesServerConfig(BaseResourcesServerConfig):
+ pass
+
+
+class GraphWalksVerifyRequest(BaseVerifyRequest):
+ expected_answer: str
+ problem_type: str
+ n_tokens: Optional[int] = None
+ prompt_chars: Optional[int] = None
+
+
+class GraphWalksVerifyResponse(GraphWalksVerifyRequest, BaseVerifyResponse):
+ f1: float
+ parse_failed: bool
+ predicted_nodes: List[str]
+
+
+class GraphWalksResourcesServer(SimpleResourcesServer):
+ config: GraphWalksResourcesServerConfig
+
+ async def verify(self, body: GraphWalksVerifyRequest) -> GraphWalksVerifyResponse:
+ response = body.response.output_text
+ predicted_nodes, parse_failed = _parse_node_list(response)
+ try:
+ expected_nodes = set(json.loads(body.expected_answer))
+ except (json.JSONDecodeError, TypeError):
+ expected_nodes = set()
+ f1 = _f1_score(set(predicted_nodes), expected_nodes, parse_failed)
+ return GraphWalksVerifyResponse(
+ **body.model_dump(),
+ reward=f1,
+ f1=f1,
+ parse_failed=parse_failed,
+ predicted_nodes=predicted_nodes,
+ )
+
+ # ──────────────────────────────────────────────────────────
+ # Aggregate metrics overrides
+ # ──────────────────────────────────────────────────────────
+
+ @staticmethod
+ def _score_fn(r: Dict[str, Any]) -> Dict[str, Union[float, bool]]:
+ return {"accuracy": r["reward"]}
+
+ def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]:
+ """Pass@k plus a per-`problem_type` subset breakdown.
+
+ F1 is a continuous score in [0, 1] so pass@k is max-of-k (not
+ combinatorial). majority@k is not meaningful (no discrete
+ extracted answer) — `answer_key` is left None.
+ """
+ metrics, _, _, _ = compute_pass_majority_metrics(tasks, score_fn=self._score_fn)
+ subset_metrics = compute_subset_metrics(tasks, subset_key="problem_type", score_fn=self._score_fn)
+ # compute_subset_metrics emits keys like "{subset}/pass@k/accuracy" where
+ # {subset} is the raw subset value. Prepend the field name so the key
+ # stays self-describing: "problem_type={subset}/pass@k/accuracy".
+ subset_metrics = {(f"problem_type={k}" if "/" in k else k): v for k, v in subset_metrics.items()}
+ metrics.update(subset_metrics)
+ return metrics
+
+ def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]:
+ key: Dict[str, Any] = {}
+ for name in ("mean/input_tokens", "mean/output_tokens"):
+ if name in agent_metrics:
+ key[name] = agent_metrics[name]
+ key.update(highest_k_metrics(agent_metrics, "pass@1[avg-of-{k}]"))
+ key.update(highest_k_metrics(agent_metrics, "pass@{k}"))
+ return key
+
+
+_FINAL_ANSWER_RE = re.compile(r"Final Answer:\s*\[(.*)\]")
+
+
+def _parse_node_list(response: str) -> Tuple[List[str], bool]:
+ """Parse ``Final Answer: [n1, n2, ...]`` from the last non-empty line.
+
+ Returns ``(nodes, parse_failed)``. ``parse_failed`` is True when the
+ expected format is absent. Empty list with ``parse_failed=False`` means
+ the model explicitly returned no nodes.
+
+ Reference: https://huggingface.co/datasets/openai/graphwalks
+ """
+ lines = [line for line in (response or "").strip().split("\n") if line.strip()]
+ if not lines:
+ return [], True
+
+ match = _FINAL_ANSWER_RE.search(lines[-1])
+ if not match:
+ return [], True
+
+ content = match.group(1)
+ if not content.strip():
+ return [], False
+ return [item.strip() for item in content.split(",") if item.strip()], False
+
+
+def _f1_score(predicted: set, expected: set, parse_failed: bool) -> float:
+ """F1 between two node sets.
+
+ - parse_failed → 0.0 (no answer extracted)
+ - both empty → 1.0 (model correctly returned nothing)
+ - one empty → 0.0
+ - otherwise → 2·P·R / (P + R)
+ """
+ if parse_failed:
+ return 0.0
+ if not expected and not predicted:
+ return 1.0
+ if not predicted or not expected:
+ return 0.0
+ tp = len(predicted & expected)
+ if tp == 0:
+ return 0.0
+ precision = tp / len(predicted)
+ recall = tp / len(expected)
+ return 2 * precision * recall / (precision + recall)
+
+
+if __name__ == "__main__":
+ GraphWalksResourcesServer.run_webserver()
diff --git a/resources_servers/graphwalks/configs/graphwalks.yaml b/resources_servers/graphwalks/configs/graphwalks.yaml
new file mode 100644
index 000000000..6b5f6ed50
--- /dev/null
+++ b/resources_servers/graphwalks/configs/graphwalks.yaml
@@ -0,0 +1,25 @@
+graphwalks_resources_server:
+ resources_servers:
+ graphwalks:
+ entrypoint: app.py
+ domain: other
+ verified: false
+ description: Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks
+ value: Improve long-context multi-step graph reasoning and adjacency-list traversal
+
+graphwalks_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: graphwalks_resources_server
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: example
+ type: example
+ jsonl_fpath: resources_servers/graphwalks/data/example.jsonl
+ num_repeats: 1
+ license: MIT
diff --git a/resources_servers/graphwalks/data/.gitignore b/resources_servers/graphwalks/data/.gitignore
new file mode 100644
index 000000000..8d24a9b19
--- /dev/null
+++ b/resources_servers/graphwalks/data/.gitignore
@@ -0,0 +1,6 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
+*benchmark.jsonl
diff --git a/resources_servers/graphwalks/data/example.jsonl b/resources_servers/graphwalks/data/example.jsonl
new file mode 100644
index 000000000..45f92fd58
--- /dev/null
+++ b/resources_servers/graphwalks/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3\nnode_2 -> node_3, node_4\nnode_3 -> node_5\nnode_4 -> node_5\n\nOperation: Find the parents of node node_3."}]}, "expected_answer": "[\"node_1\", \"node_2\"]", "problem_type": "parents", "n_tokens": 120, "prompt_chars": 480, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nalpha -> beta, gamma\nbeta -> delta\ngamma -> delta, epsilon\ndelta -> zeta\nepsilon -> zeta\n\nOperation: Find the parents of node zeta."}]}, "expected_answer": "[\"delta\", \"epsilon\"]", "problem_type": "parents", "n_tokens": 110, "prompt_chars": 430, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3, node_4\nnode_2 -> node_5\nnode_3 -> node_6\nnode_4 -> node_6\nnode_5 -> node_7\n\nOperation: Perform a BFS from node node_0 and return only the nodes at exactly depth 2 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"node_3\", \"node_4\", \"node_5\"]", "problem_type": "bfs", "n_tokens": 150, "prompt_chars": 560, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nroot -> a, b\na -> c, d\nb -> e\nc -> f\nd -> f, g\ne -> g\nf -> h\ng -> h\n\nOperation: Perform a BFS from node root and return only the nodes at exactly depth 3 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"f\", \"g\"]", "problem_type": "bfs", "n_tokens": 145, "prompt_chars": 540, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nn0 -> n1\nn1 -> n2, n3\nn2 -> n4\nn3 -> n4, n5\nn4 -> n6\nn5 -> n6\nn6 -> n7\n\nOperation: Find the parents of node n4."}]}, "expected_answer": "[\"n2\", \"n3\"]", "problem_type": "parents", "n_tokens": 130, "prompt_chars": 470, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
diff --git a/resources_servers/graphwalks/data/example_metrics.json b/resources_servers/graphwalks/data/example_metrics.json
new file mode 100644
index 000000000..8a0f7bdb5
--- /dev/null
+++ b/resources_servers/graphwalks/data/example_metrics.json
@@ -0,0 +1,60 @@
+{
+ "name": "example",
+ "type": "example",
+ "jsonl_fpath": "resources_servers/graphwalks/data/example.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": null,
+ "huggingface_identifier": null,
+ "license": "MIT",
+ "Number of examples": 5,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 5,
+ "Average": 77.2,
+ "Min": 69.0,
+ "Max": 90.0,
+ "Standard deviation": 9.71
+ },
+ "Number of turns": {
+ "Total # non-null values": 5,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Standard deviation": 0.0
+ },
+ "expected_answer": {
+ "unique_count": 5,
+ "total_count": 5
+ },
+ "problem_type": {
+ "unique_count": 2,
+ "total_count": 5
+ },
+ "n_tokens": {
+ "Total # non-null values": 5,
+ "Average": 131.0,
+ "Min": 110.0,
+ "Max": 150.0,
+ "Standard deviation": 16.73
+ },
+ "prompt_chars": {
+ "Total # non-null values": 5,
+ "Average": 496.0,
+ "Min": 430.0,
+ "Max": 560.0,
+ "Standard deviation": 53.2
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/graphwalks/requirements.txt b/resources_servers/graphwalks/requirements.txt
new file mode 100644
index 000000000..151b4ab7b
--- /dev/null
+++ b/resources_servers/graphwalks/requirements.txt
@@ -0,0 +1,2 @@
+-e nemo-gym[dev] @ ../../
+tiktoken
diff --git a/resources_servers/graphwalks/tests/__init__.py b/resources_servers/graphwalks/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/resources_servers/graphwalks/tests/test_app.py b/resources_servers/graphwalks/tests/test_app.py
new file mode 100644
index 000000000..6c4cd9a86
--- /dev/null
+++ b/resources_servers/graphwalks/tests/test_app.py
@@ -0,0 +1,200 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from unittest.mock import MagicMock
+
+import pytest
+
+from nemo_gym.server_utils import ServerClient
+from resources_servers.graphwalks.app import (
+ GraphWalksResourcesServer,
+ GraphWalksResourcesServerConfig,
+ _f1_score,
+ _parse_node_list,
+)
+
+
+class TestSanity:  # Smoke test: the server can be built from a minimal config and a mocked client.
+    def test_sanity(self) -> None:  # no assertion; passes as long as construction does not raise
+        config = GraphWalksResourcesServerConfig(
+            host="0.0.0.0",
+            port=8080,
+            entrypoint="",
+            name="",
+        )
+        GraphWalksResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+
+class TestParseNodeList:
+    """Tests for `_parse_node_list`; each case checks the `(nodes, failed)` pair it returns.
+
+    Reference: https://huggingface.co/datasets/openai/graphwalks
+    """
+
+    def test_parses_well_formed_list(self) -> None:
+        nodes, failed = _parse_node_list("...\nFinal Answer: [node_1, node_2, node_3]")
+        assert nodes == ["node_1", "node_2", "node_3"]
+        assert failed is False
+
+    def test_empty_list_is_valid(self) -> None:
+        """`Final Answer: []` is a valid no-nodes answer, not a parse failure."""
+        nodes, failed = _parse_node_list("Final Answer: []")
+        assert nodes == []
+        assert failed is False
+
+    def test_only_uses_last_line(self) -> None:  # an earlier "Final Answer" line is a decoy
+        text = "Final Answer: [decoy]\nbecause I said so\nFinal Answer: [real]"
+        nodes, failed = _parse_node_list(text)
+        assert nodes == ["real"]
+        assert failed is False
+
+    def test_skips_trailing_blank_lines(self) -> None:  # whitespace-only lines after the answer are ignored
+        nodes, failed = _parse_node_list("Final Answer: [a, b]\n\n \n")
+        assert nodes == ["a", "b"]
+        assert failed is False
+
+    def test_missing_format_fails(self) -> None:  # no "Final Answer: [...]" line at all
+        nodes, failed = _parse_node_list("The answer is node_42.")
+        assert nodes == []
+        assert failed is True
+
+    def test_blank_response_fails(self) -> None:
+        nodes, failed = _parse_node_list("")
+        assert nodes == []
+        assert failed is True
+
+    def test_strips_whitespace_inside_list(self) -> None:
+        nodes, failed = _parse_node_list("Final Answer: [ a ,b , c]")
+        assert nodes == ["a", "b", "c"]
+        assert failed is False
+
+    def test_drops_empty_items(self) -> None:
+        """Trailing commas / double commas should not produce empty entries."""
+        nodes, failed = _parse_node_list("Final Answer: [a,, b,]")
+        assert nodes == ["a", "b"]
+        assert failed is False
+
+
+class TestF1Score:  # _f1_score(predicted, expected, parse_failed=...): edge cases, then arithmetic
+    def test_parse_failed_is_zero(self) -> None:
+        assert _f1_score({"a"}, {"a"}, parse_failed=True) == 0.0  # parse failure wins over a perfect match
+
+    def test_both_empty_is_one(self) -> None:
+        assert _f1_score(set(), set(), parse_failed=False) == 1.0
+
+    def test_predicted_empty_expected_nonempty(self) -> None:
+        assert _f1_score(set(), {"a"}, parse_failed=False) == 0.0
+
+    def test_predicted_nonempty_expected_empty(self) -> None:
+        assert _f1_score({"a"}, set(), parse_failed=False) == 0.0
+
+    def test_exact_match_is_one(self) -> None:
+        assert _f1_score({"a", "b"}, {"a", "b"}, parse_failed=False) == 1.0
+
+    def test_no_overlap_is_zero(self) -> None:
+        assert _f1_score({"a"}, {"b"}, parse_failed=False) == 0.0
+
+    def test_partial_overlap(self) -> None:
+        # predicted={a,b}, expected={a,c}; P=1/2, R=1/2 → F1=0.5
+        assert math.isclose(_f1_score({"a", "b"}, {"a", "c"}, parse_failed=False), 0.5)
+
+    def test_unequal_sizes(self) -> None:
+        # predicted={a,b,c}, expected={a}; P=1/3, R=1 → F1=0.5
+        assert math.isclose(_f1_score({"a", "b", "c"}, {"a"}, parse_failed=False), 0.5)
+
+
+class TestScoreFn:  # _score_fn is expected to expose a result's "reward" unchanged as "accuracy"
+    def test_score_fn_returns_accuracy_equals_reward(self) -> None:
+        assert GraphWalksResourcesServer._score_fn({"reward": 0.73}) == {"accuracy": 0.73}
+
+    def test_score_fn_handles_zero(self) -> None:
+        assert GraphWalksResourcesServer._score_fn({"reward": 0.0}) == {"accuracy": 0.0}
+
+    def test_score_fn_handles_one(self) -> None:
+        assert GraphWalksResourcesServer._score_fn({"reward": 1.0}) == {"accuracy": 1.0}
+
+
+class TestComputeMetrics:  # Each task is a list of rollout dicts carrying "reward" and "problem_type".
+    @pytest.fixture
+    def server(self) -> GraphWalksResourcesServer:
+        config = GraphWalksResourcesServerConfig(
+            host="0.0.0.0",
+            port=8080,
+            entrypoint="",
+            name="",
+        )
+        return GraphWalksResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+    def test_compute_metrics_empty(self, server: GraphWalksResourcesServer) -> None:
+        assert server.compute_metrics([]) == {}
+
+    def test_compute_metrics_includes_pass_at_k(self, server: GraphWalksResourcesServer) -> None:
+        tasks = [  # two tasks, two rollouts each, so k ranges over 1..2
+            [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}],
+            [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}],
+        ]
+        metrics = server.compute_metrics(tasks)
+        assert "pass@1/accuracy" in metrics
+        assert "pass@2/accuracy" in metrics
+        assert "pass@1[avg-of-2]/accuracy" in metrics
+
+    def test_compute_metrics_includes_subset_breakdown(self, server: GraphWalksResourcesServer) -> None:
+        """Per-problem-type subset metrics should appear under `problem_type={type}/...` keys."""
+        tasks = [
+            [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}],
+            [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}],
+        ]
+        metrics = server.compute_metrics(tasks)
+        assert any(k.startswith("problem_type=parents/pass@") for k in metrics)
+        assert any(k.startswith("problem_type=bfs/pass@") for k in metrics)
+        # Bare "{type}/..." keys (no "problem_type=" prefix) must NOT leak through from compute_subset_metrics.
+        assert not any(k.startswith(("parents/", "bfs/")) for k in metrics)
+
+    def test_compute_metrics_no_majority(self, server: GraphWalksResourcesServer) -> None:
+        """majority@k is skipped because F1 has no discrete answer_key."""
+        tasks = [[{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}]]
+        metrics = server.compute_metrics(tasks)
+        assert not any(k.startswith("majority@") for k in metrics)
+
+
+class TestGetKeyMetrics:  # get_key_metrics should keep only the headline entries of the full metrics dict
+    @pytest.fixture
+    def server(self) -> GraphWalksResourcesServer:
+        config = GraphWalksResourcesServerConfig(
+            host="0.0.0.0",
+            port=8080,
+            entrypoint="",
+            name="",
+        )
+        return GraphWalksResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+    def test_get_key_metrics_picks_highest_k(self, server: GraphWalksResourcesServer) -> None:
+        agent_metrics = {  # k = 4 is the largest k present in this input
+            "pass@1/accuracy": 50.0,
+            "pass@2/accuracy": 70.0,
+            "pass@4/accuracy": 80.0,
+            "pass@1[avg-of-4]/accuracy": 60.0,
+            "mean/input_tokens": 1000,
+            "mean/output_tokens": 200,
+        }
+        key = server.get_key_metrics(agent_metrics)
+        assert key["pass@4/accuracy"] == 80.0
+        assert key["pass@1[avg-of-4]/accuracy"] == 60.0
+        assert key["mean/input_tokens"] == 1000
+        assert key["mean/output_tokens"] == 200
+        # Lower-k pass@k entries must be dropped from the key set.
+        assert "pass@1/accuracy" not in key
+        assert "pass@2/accuracy" not in key