NVIDIA-NeMo · hsiehjackson · May 27, 2026 · May 27, 2026 · May 28, 2026 · May 29, 2026
diff --git a/README.md b/README.md
@@ -208,6 +208,7 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace).
 | Genrm Compare                                 | rlhf                  | GenRM pairwise comparison for RLHF training                                                                                                                                                                                  | Compare multiple candidate responses using GenRM model                                                                                | -     | -          | -                                                         | <a href='resources_servers/genrm_compare/configs/genrm_compare.yaml'>genrm_compare.yaml</a>                                                                                                                                 | -                                                                                                                                                              |
 | Google Search                                 | agent                 | Multi-choice question answering problems with search tools integrated                                                                                                                                                        | Improve knowledge-related benchmarks with search tools                                                                                | ✓     | -          | Apache 2.0                                                | <a href='resources_servers/google_search/configs/google_search.yaml'>google_search.yaml</a>                                                                                                                                 | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-web_search-mcqa'>Nemotron-RL-knowledge-web_search-mcqa</a>                               |
 | Gpqa Diamond                                  | knowledge             | GPQA Diamond multiple-choice question answering problems                                                                                                                                                                     | Evaluate graduate-level scientific reasoning via MCQ verification                                                                     | ✓     | -          | MIT                                                       | <a href='resources_servers/gpqa_diamond/configs/gpqa_diamond.yaml'>gpqa_diamond.yaml</a>                                                                                                                                    | -                                                                                                                                                              |
+| Graphwalks                                    | other                 | Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks                                                                                                                               | Improve long-context multi-step graph reasoning and adjacency-list traversal                                                          | -     | -          | -                                                         | <a href='resources_servers/graphwalks/configs/graphwalks.yaml'>graphwalks.yaml</a>                                                                                                                                          | -                                                                                                                                                              |
 | Grl Sokoban                                   | games                 | Single-box Sokoban in Gymnasium API style.                                                                                                                                                                                   | Model emits one move per turn until the puzzle is solved.                                                                             | -     | -          | -                                                         | <a href='resources_servers/grl_sokoban/configs/grl_sokoban.yaml'>grl_sokoban.yaml</a>                                                                                                                                       | -                                                                                                                                                              |
 | Grl Tetris                                    | games                 | Tetris in Gymnasium API style. Model emits one or more moves per turn.                                                                                                                                                       | Multi-step Tetris environment                                                                                                         | -     | -          | -                                                         | <a href='resources_servers/grl_tetris/configs/grl_tetris.yaml'>grl_tetris.yaml</a>                                                                                                                                          | -                                                                                                                                                              |
 | Gymnasium                                     | other                 | Base class for Gymnasium-style servers. Not a standalone server.                                                                                                                                                             | Reusable base class for step/reset style environments                                                                                 | -     | -          | -                                                         | <a href='resources_servers/gymnasium/configs/gymnasium.yaml'>gymnasium.yaml</a>                                                                                                                                             | -                                                                                                                                                              |

diff --git a/benchmarks/graphwalks/README.md b/benchmarks/graphwalks/README.md
@@ -0,0 +1,82 @@
+# GraphWalks benchmark
+
+Benchmark wrapper over the [`graphwalks` resources server](../../resources_servers/graphwalks/README.md)
+for the [openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks) dataset.
+
+Each task supplies an adjacency list and asks the model to either list
+the parents of a node (`problem_type: parents`) or return the BFS
+frontier at exactly depth N (`problem_type: bfs`). Scoring is F1 over
+the predicted node set vs. the expected node set, gated on the model
+producing a `Final Answer: [...]` line.
+
+## Variants
+
+Two preset configs ship alongside this benchmark. Both apply the same
+fixes ported from NeMo Skills (BFS depth disambiguation in the prompt,
+self-parent removal from the expected answer); they differ only in the
+tokenizer used for the `n_tokens` field and in whether a length filter
+is applied.
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/graphwalks_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/graphwalks_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+## Prepare benchmark data
+
+```bash
+# Default (o200k_base, no filter)
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config.yaml]"
+
+# N3 1M variant
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config_n3_1m.yaml]"
+```
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/graphwalks/prepare.py \
+    --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \
+    --max_context_tokens 131072 \
+    --output_fpath benchmarks/graphwalks/data/graphwalks_llama_128k_benchmark.jsonl
+```
+
+## Start environment
+
+```bash
+ng_run "+config_paths=[benchmarks/graphwalks/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"
+```
+
+## Collect rollouts
+
+```bash
+# Default variant
+ng_collect_rollouts \
+    +agent_name=graphwalks_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_benchmark.jsonl \
+    +output_jsonl_fpath=results/graphwalks_rollouts.jsonl \
+    +num_repeats=4
+
+# N3 1M variant
+ng_collect_rollouts \
+    +agent_name=graphwalks_n3_1m_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl \
+    +output_jsonl_fpath=results/graphwalks_n3_1m_rollouts.jsonl \
+    +num_repeats=4
+```
+
+## Metrics
+
+`compute_metrics()` emits `pass@k/accuracy` and `pass@1[avg-of-k]/accuracy`
+via `compute_pass_majority_metrics`, plus per-`problem_type` subset
+breakdowns via `compute_subset_metrics(subset_key="problem_type")`, which
+yield stratified pass@k keys such as `problem_type=parents/pass@4/accuracy`
+and `problem_type=bfs/pass@4/accuracy`.
+
+For reasoning models the vLLM server should be started with a
+`--reasoning-parser` matching the model (e.g. `nano_v3` for Nemotron-3
+or `deepseek_r1`) so that `<think>...</think>` blocks are stripped
+upstream of `Final Answer:` parsing.
diff --git a/benchmarks/graphwalks/__init__.py b/benchmarks/graphwalks/__init__.py
diff --git a/benchmarks/graphwalks/config.yaml b/benchmarks/graphwalks/config.yaml
@@ -0,0 +1,28 @@
+# Chain to the generic graphwalks resources server + agent config.
+config_paths:
+  - resources_servers/graphwalks/configs/graphwalks.yaml
+
+# Benchmark-specific overrides via `_inherit_from` so the base graphwalks config
+# stays isolated from benchmark use.
+
+graphwalks_benchmark_resources_server:
+  _inherit_from: graphwalks_resources_server
+
+graphwalks_benchmark_simple_agent:
+  _inherit_from: graphwalks_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: graphwalks_benchmark_resources_server
+      datasets:
+      - name: graphwalks
+        type: benchmark
+        jsonl_fpath: benchmarks/graphwalks/data/graphwalks_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/graphwalks/prepare.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/graphwalks/config_n3_1m.yaml b/benchmarks/graphwalks/config_n3_1m.yaml
@@ -0,0 +1,28 @@
+# GraphWalks — N3 1M-context variant.
+# Same data + Skills prompt fixes as `config.yaml`, but `prepare_n3_1m.py`
+# counts tokens with the Nemotron-3-Super HF tokenizer and drops samples
+# whose tokenized prompt exceeds 1048576 tokens.
+config_paths:
+  - resources_servers/graphwalks/configs/graphwalks.yaml
+
+graphwalks_n3_1m_benchmark_resources_server:
+  _inherit_from: graphwalks_resources_server
+
+graphwalks_n3_1m_benchmark_simple_agent:
+  _inherit_from: graphwalks_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: graphwalks_n3_1m_benchmark_resources_server
+      datasets:
+      - name: graphwalks_n3_1m
+        type: benchmark
+        jsonl_fpath: benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/graphwalks/prepare_n3_1m.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/graphwalks/data/.gitignore b/benchmarks/graphwalks/data/.gitignore
@@ -0,0 +1 @@
+*benchmark.jsonl
diff --git a/benchmarks/graphwalks/prepare.py b/benchmarks/graphwalks/prepare.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the GraphWalks benchmark data.
+
+Source: https://huggingface.co/datasets/openai/graphwalks
+
+Ported from:
+    https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/graphwalks/prepare.py
+
+Two upstream corrections from Skills are preserved here verbatim:
+
+  1. The BFS prompt is rewritten to disambiguate "depth N" — without
+     this rewrite, models often return nodes at intermediate depths.
+  2. The expected answer for a parents problem sometimes contains the
+     queried node itself; we strip it from the answer set.
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+    python benchmarks/graphwalks/prepare.py \\
+        --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \\
+        --max_context_tokens 131072
+"""
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Callable, Optional
+
+import tiktoken
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+BENCHMARK_DIR = Path(__file__).parent  # directory that contains this script
+DATA_DIR = BENCHMARK_DIR / "data"  # holds the default output file
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "graphwalks_benchmark.jsonl"  # default for ``prepare(output_fpath=...)``
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"  # default for ``prepare(tokenizer_name=...)``
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None  # no filter by default
+
+_BFS_PATTERN = re.compile(r"Perform a BFS from node (\S+) with depth (\d+)")  # group 1: start node, group 2: depth
+_BFS_REPLACEMENT = (  # substitution for _BFS_PATTERN; reuses both captured groups via \1 and \2
+    r"Perform a BFS from node \1 and return only the nodes at exactly depth \2 "
+    r"(not nodes at intermediate depths)"
+)
+_PARENTS_PATTERN = re.compile(r"Find the parents of node ([^\s.]+)\.")  # group 1: queried node id (no whitespace or period)
+
+
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+    """Return a ``text -> token_count`` function.
+
+    Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+    encoding, falls back to ``transformers.AutoTokenizer``.
+    """
+    try:
+        enc = tiktoken.get_encoding(tokenizer_name)
+        return lambda text: len(enc.encode(text, disallowed_special=()))
+    except ValueError:
+        from transformers import AutoTokenizer
+
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+        return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+    tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+    max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+    output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
+    output_fpath = Path(output_fpath)
+    output_fpath.parent.mkdir(parents=True, exist_ok=True)
+
+    dataset = load_dataset("openai/graphwalks", split="train")
+    count_tokens = _build_token_counter(tokenizer_name)
+
+    kept = 0
+    skipped_tokens = 0
+    skipped_self_parent = 0
+    with output_fpath.open("w", encoding="utf-8") as fout:
+        for entry in tqdm(dataset, desc="Preparing GraphWalks"):
+            prompt_text = entry["prompt"]
+            answer_nodes = list(entry["answer_nodes"])
+
+            # Skills fix #1: disambiguate BFS depth.
+            prompt_text = _BFS_PATTERN.sub(_BFS_REPLACEMENT, prompt_text)
+
+            # Skills fix #2: strip the queried node from its own parents answer.
+            m = _PARENTS_PATTERN.search(prompt_text)
+            target = m.group(1) if m else None
+            if target is not None and target in answer_nodes:
+                answer_nodes.remove(target)
+                skipped_self_parent += 1
+
+            n_tokens = count_tokens(prompt_text)
+            if max_context_tokens is not None and n_tokens > max_context_tokens:
+                skipped_tokens += 1
+                continue
+
+            sample = {
+                "responses_create_params": {"input": [{"role": "user", "content": prompt_text}]},
+                "expected_answer": json.dumps(sorted(answer_nodes)),
+                "problem_type": entry["problem_type"],
+                "n_tokens": n_tokens,
+                "prompt_chars": entry["prompt_chars"],
+            }
+            fout.write(json.dumps(sample, ensure_ascii=False) + "\n")
+            kept += 1
+
+    cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+    print(
+        f"Wrote {kept} samples to {output_fpath} "
+        f"(tokenizer={tokenizer_name}, cap={cap_str}; "
+        f"dropped {skipped_tokens} over cap; cleaned {skipped_self_parent} self-parent answers)"
+    )
+    return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--tokenizer_name",
+        default=DEFAULT_TOKENIZER_NAME,
+        help=(
+            "Tokenizer used for token counting. Accepts a tiktoken encoding name "
+            "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id "
+            "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+            f"Default: {DEFAULT_TOKENIZER_NAME}"
+        ),
+    )
+    parser.add_argument(
+        "--max_context_tokens",
+        type=int,
+        default=DEFAULT_MAX_CONTEXT_TOKENS,
+        help=(
+            "Drop samples whose tokenized prompt exceeds this many tokens. "
+            "Omit (or pass a negative number) for no filter. "
+            f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+        ),
+    )
+    parser.add_argument(
+        "--output_fpath",
+        type=Path,
+        default=DEFAULT_OUTPUT_FPATH,
+        help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = _parse_args()
+    cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+    prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)