diff --git a/README.md b/README.md
index c8a379d73..51252e48e 100644
--- a/README.md
+++ b/README.md
@@ -208,6 +208,7 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace).
 | Genrm Compare                                 | rlhf                  | GenRM pairwise comparison for RLHF training                                                                                                                                                                                  | Compare multiple candidate responses using GenRM model                                                                                | -     | -          | -                                                         | <a href='resources_servers/genrm_compare/configs/genrm_compare.yaml'>genrm_compare.yaml</a>                                                                                                                                 | -                                                                                                                                                              |
 | Google Search                                 | agent                 | Multi-choice question answering problems with search tools integrated                                                                                                                                                        | Improve knowledge-related benchmarks with search tools                                                                                | ✓     | -          | Apache 2.0                                                | <a href='resources_servers/google_search/configs/google_search.yaml'>google_search.yaml</a>                                                                                                                                 | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-knowledge-web_search-mcqa'>Nemotron-RL-knowledge-web_search-mcqa</a>                               |
 | Gpqa Diamond                                  | knowledge             | GPQA Diamond multiple-choice question answering problems                                                                                                                                                                     | Evaluate graduate-level scientific reasoning via MCQ verification                                                                     | ✓     | -          | MIT                                                       | <a href='resources_servers/gpqa_diamond/configs/gpqa_diamond.yaml'>gpqa_diamond.yaml</a>                                                                                                                                    | -                                                                                                                                                              |
+| Graphwalks                                    | other                 | Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks                                                                                                                               | Improve long-context multi-step graph reasoning and adjacency-list traversal                                                          | -     | -          | -                                                         | <a href='resources_servers/graphwalks/configs/graphwalks.yaml'>graphwalks.yaml</a>                                                                                                                                          | -                                                                                                                                                              |
 | Grl Sokoban                                   | games                 | Single-box Sokoban in Gymnasium API style.                                                                                                                                                                                   | Model emits one move per turn until the puzzle is solved.                                                                             | -     | -          | -                                                         | <a href='resources_servers/grl_sokoban/configs/grl_sokoban.yaml'>grl_sokoban.yaml</a>                                                                                                                                       | -                                                                                                                                                              |
 | Grl Tetris                                    | games                 | Tetris in Gymnasium API style. Model emits one or more moves per turn.                                                                                                                                                       | Multi-step Tetris environment                                                                                                         | -     | -          | -                                                         | <a href='resources_servers/grl_tetris/configs/grl_tetris.yaml'>grl_tetris.yaml</a>                                                                                                                                          | -                                                                                                                                                              |
 | Gymnasium                                     | other                 | Base class for Gymnasium-style servers. Not a standalone server.                                                                                                                                                             | Reusable base class for step/reset style environments                                                                                 | -     | -          | -                                                         | <a href='resources_servers/gymnasium/configs/gymnasium.yaml'>gymnasium.yaml</a>                                                                                                                                             | -                                                                                                                                                              |
diff --git a/benchmarks/graphwalks/README.md b/benchmarks/graphwalks/README.md
new file mode 100644
index 000000000..526a6e915
--- /dev/null
+++ b/benchmarks/graphwalks/README.md
@@ -0,0 +1,82 @@
+# GraphWalks benchmark
+
+Benchmark wrapper over the [`graphwalks` resources server](../../resources_servers/graphwalks/README.md)
+for the [openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks) dataset.
+
+Each task supplies an adjacency list and asks the model to either list
+the parents of a node (`problem_type: parents`) or return the BFS
+frontier at exactly depth N (`problem_type: bfs`). Scoring is F1 over
+the predicted node set vs. the expected node set, gated on the model
+producing a `Final Answer: [...]` line.
+
+## Variants
+
+Two preset configs ship alongside this benchmark. Both apply the same
+data + Skills prompt fixes (BFS depth disambiguation, self-parent
+removal); they differ only in the tokenizer used for the `n_tokens`
+column and an optional length filter.
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/graphwalks_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/graphwalks_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+## Prepare benchmark data
+
+```bash
+# Default (o200k_base, no filter)
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config.yaml]"
+
+# N3 1M variant
+ng_prepare_benchmark "+config_paths=[benchmarks/graphwalks/config_n3_1m.yaml]"
+```
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/graphwalks/prepare.py \
+    --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \
+    --max_context_tokens 131072 \
+    --output_fpath benchmarks/graphwalks/data/graphwalks_llama_128k_benchmark.jsonl
+```
+
+## Start environment
+
+```bash
+ng_run "+config_paths=[benchmarks/graphwalks/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"
+```
+
+## Collect rollouts
+
+```bash
+# Default variant
+ng_collect_rollouts \
+    +agent_name=graphwalks_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_benchmark.jsonl \
+    +output_jsonl_fpath=results/graphwalks_rollouts.jsonl \
+    +num_repeats=4
+
+# N3 1M variant (start the servers with benchmarks/graphwalks/config_n3_1m.yaml instead of config.yaml)
+ng_collect_rollouts \
+    +agent_name=graphwalks_n3_1m_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl \
+    +output_jsonl_fpath=results/graphwalks_n3_1m_rollouts.jsonl \
+    +num_repeats=4
+```
+
+## Metrics
+
+`compute_metrics()` emits `pass@k/accuracy` and `pass@1[avg-of-k]/accuracy`
+via `compute_pass_majority_metrics`, plus per-`problem_type` subset
+breakdowns via `compute_subset_metrics(subset_key="problem_type")` —
+stratified pass@k keys like `problem_type=parents/pass@4/accuracy` and
+`problem_type=bfs/pass@4/accuracy`.
+
+For reasoning models the vLLM server should be started with a
+`--reasoning-parser` matching the model (e.g. `nano_v3` for Nemotron-3
+or `deepseek_r1`) so that `<think>...</think>` blocks are stripped
+upstream of `Final Answer:` parsing.
diff --git a/benchmarks/graphwalks/__init__.py b/benchmarks/graphwalks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmarks/graphwalks/config.yaml b/benchmarks/graphwalks/config.yaml
new file mode 100644
index 000000000..61746a329
--- /dev/null
+++ b/benchmarks/graphwalks/config.yaml
@@ -0,0 +1,28 @@
+# Chain to the generic graphwalks resources server + agent config.
+config_paths:
+  - resources_servers/graphwalks/configs/graphwalks.yaml
+
+# Benchmark-specific overrides via `_inherit_from` so the base graphwalks config
+# stays isolated from benchmark use.
+
+graphwalks_benchmark_resources_server:
+  _inherit_from: graphwalks_resources_server
+
+graphwalks_benchmark_simple_agent:
+  _inherit_from: graphwalks_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: graphwalks_benchmark_resources_server
+      datasets:
+      - name: graphwalks
+        type: benchmark
+        jsonl_fpath: benchmarks/graphwalks/data/graphwalks_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/graphwalks/prepare.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/graphwalks/config_n3_1m.yaml b/benchmarks/graphwalks/config_n3_1m.yaml
new file mode 100644
index 000000000..cd9ca2ba9
--- /dev/null
+++ b/benchmarks/graphwalks/config_n3_1m.yaml
@@ -0,0 +1,28 @@
+# GraphWalks — N3 1M-context variant.
+# Same data + Skills prompt fixes as `config.yaml`, but `prepare_n3_1m.py`
+# counts tokens with the Nemotron-3-Super HF tokenizer and drops samples
+# whose tokenized prompt exceeds 1048576 tokens.
+config_paths:
+  - resources_servers/graphwalks/configs/graphwalks.yaml
+
+graphwalks_n3_1m_benchmark_resources_server:
+  _inherit_from: graphwalks_resources_server
+
+graphwalks_n3_1m_benchmark_simple_agent:
+  _inherit_from: graphwalks_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: graphwalks_n3_1m_benchmark_resources_server
+      datasets:
+      - name: graphwalks_n3_1m
+        type: benchmark
+        jsonl_fpath: benchmarks/graphwalks/data/graphwalks_n3_1m_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/graphwalks/prepare_n3_1m.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/graphwalks/data/.gitignore b/benchmarks/graphwalks/data/.gitignore
new file mode 100644
index 000000000..b06d45fe6
--- /dev/null
+++ b/benchmarks/graphwalks/data/.gitignore
@@ -0,0 +1 @@
+*benchmark.jsonl
diff --git a/benchmarks/graphwalks/prepare.py b/benchmarks/graphwalks/prepare.py
new file mode 100644
index 000000000..cbbd963d8
--- /dev/null
+++ b/benchmarks/graphwalks/prepare.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the GraphWalks benchmark data.
+
+Source: https://huggingface.co/datasets/openai/graphwalks
+
+Ported from:
+    https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/graphwalks/prepare.py
+
+Two upstream-prompt corrections from Skills are preserved here verbatim:
+
+  1. The BFS prompt is rewritten to disambiguate "depth N" — without
+     this rewrite, models often return nodes at intermediate depths.
+  2. The answer set of a parents task sometimes contains the queried
+     node itself; we strip it.
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+    python benchmarks/graphwalks/prepare.py \\
+        --tokenizer_name meta-llama/Llama-3.1-8B-Instruct \\
+        --max_context_tokens 131072
+"""
+
+import argparse
+import json
+import re
+from pathlib import Path
+from typing import Callable, Optional
+
+import tiktoken
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+BENCHMARK_DIR = Path(__file__).parent
+DATA_DIR = BENCHMARK_DIR / "data"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "graphwalks_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"  # tiktoken encoding name
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None  # no filter by default
+
+_BFS_PATTERN = re.compile(r"Perform a BFS from node (\S+) with depth (\d+)")  # groups: start node, depth
+_BFS_REPLACEMENT = (  # same operation, reworded to ask only for the nodes at exactly depth \2
+    r"Perform a BFS from node \1 and return only the nodes at exactly depth \2 "
+    r"(not nodes at intermediate depths)"
+)
+_PARENTS_PATTERN = re.compile(r"Find the parents of node ([^\s.]+)\.")  # group 1: the queried node
+
+
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+    """Return a ``text -> token_count`` function for ``tokenizer_name``.
+
+    A name tiktoken rejects with ``ValueError`` is loaded through ``transformers.AutoTokenizer`` instead.
+    """
+    try:
+        enc = tiktoken.get_encoding(tokenizer_name)
+    except ValueError:
+        from transformers import AutoTokenizer
+
+        # NOTE(review): trust_remote_code=True lets the named HF repo run its own code; pass trusted ids only.
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+        return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+    return lambda text: len(enc.encode(text, disallowed_special=()))
+
+
+def prepare(
+    tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+    max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+    output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
+    """Write the ``openai/graphwalks`` ``train`` split as benchmark JSONL and return the output path.
+
+    Both Skills prompt/answer fixes are applied to every row; rows whose
+    prompt has more than ``max_context_tokens`` tokens (as counted by
+    ``tokenizer_name``) are dropped, and ``None`` disables that filter.
+    """
+    output_fpath = Path(output_fpath)
+    output_fpath.parent.mkdir(parents=True, exist_ok=True)
+
+    dataset = load_dataset("openai/graphwalks", split="train")
+    count_tokens = _build_token_counter(tokenizer_name)
+
+    written = over_cap = self_parent_fixes = 0
+    with output_fpath.open("w", encoding="utf-8") as sink:
+        for row in tqdm(dataset, desc="Preparing GraphWalks"):
+            # Skills fix #1: make the BFS operation ask for exactly depth N.
+            prompt_text = _BFS_PATTERN.sub(_BFS_REPLACEMENT, row["prompt"])
+
+            # Skills fix #2: a parents answer must not contain the queried node itself.
+            answer_nodes = list(row["answer_nodes"])
+            match = _PARENTS_PATTERN.search(prompt_text)
+            if match is not None and match.group(1) in answer_nodes:
+                answer_nodes.remove(match.group(1))
+                self_parent_fixes += 1
+
+            n_tokens = count_tokens(prompt_text)
+            if max_context_tokens is not None and n_tokens > max_context_tokens:
+                over_cap += 1
+                continue
+
+            record = {
+                "responses_create_params": {"input": [{"role": "user", "content": prompt_text}]},
+                "expected_answer": json.dumps(sorted(answer_nodes)),
+                "problem_type": row["problem_type"],
+                "n_tokens": n_tokens,
+                "prompt_chars": row["prompt_chars"],
+            }
+            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
+            written += 1
+
+    cap_str = str(max_context_tokens) if max_context_tokens is not None else "none"
+    print(
+        f"Wrote {written} samples to {output_fpath} (tokenizer={tokenizer_name}, cap={cap_str}; "
+        f"dropped {over_cap} over cap; cleaned {self_parent_fixes} self-parent answers)"
+    )
+    return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the options for a direct script run; the module docstring is shown verbatim as the description."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--tokenizer_name",
+        default=DEFAULT_TOKENIZER_NAME,
+        help=(
+            "Tokenizer used for token counting. Accepts a tiktoken encoding name "
+            "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id "
+            "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+            f"Default: {DEFAULT_TOKENIZER_NAME}"
+        ),
+    )
+    parser.add_argument(
+        "--max_context_tokens",
+        type=int,
+        default=DEFAULT_MAX_CONTEXT_TOKENS,
+        help=(
+            "Drop samples whose tokenized prompt exceeds this many tokens. "
+            f"Omit (or pass a negative number) for no filter. Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+        ),
+    )
+    parser.add_argument(
+        "--output_fpath",
+        type=Path,
+        default=DEFAULT_OUTPUT_FPATH,
+        help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    cli = _parse_args()  # a negative --max_context_tokens means "no filter", same as omitting the flag
+    limit = None if cli.max_context_tokens is not None and cli.max_context_tokens < 0 else cli.max_context_tokens
+    prepare(tokenizer_name=cli.tokenizer_name, max_context_tokens=limit, output_fpath=cli.output_fpath)
diff --git a/benchmarks/graphwalks/prepare_n3_1m.py b/benchmarks/graphwalks/prepare_n3_1m.py
new file mode 100644
index 000000000..36d84c456
--- /dev/null
+++ b/benchmarks/graphwalks/prepare_n3_1m.py
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraphWalks variant: Nemotron-3-Super tokenizer with a 1M token cap.
+
+Same data + Skills fixes as ``prepare.py``, but counts tokens with the
+``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace tokenizer
+and drops samples whose tokenized prompt exceeds 1048576 tokens
+(Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+
+The output JSONL lives alongside the default at
+``data/graphwalks_n3_1m_benchmark.jsonl`` so both variants can
+coexist.
+"""
+
+from pathlib import Path
+
+if __package__:
+    from .prepare import prepare as _prepare
+else:  # run directly as a script: no parent package, so import the sibling module via sys.path[0]
+    from prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"  # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "graphwalks_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+    """Build the N3 1M variant via the shared ``prepare`` using this module's tokenizer, cap and output path."""
+    return _prepare(
+        tokenizer_name=TOKENIZER_NAME,
+        max_context_tokens=MAX_CONTEXT_TOKENS,
+        output_fpath=OUTPUT_FPATH,
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/benchmarks/longbench_v2/README.md b/benchmarks/longbench_v2/README.md
index 41da06adf..61fbd756b 100644
--- a/benchmarks/longbench_v2/README.md
+++ b/benchmarks/longbench_v2/README.md
@@ -13,26 +13,58 @@ grading; this directory adds only the dataset and prompt.
 Data source: HuggingFace `THUDM/LongBench-v2` (single "train" split,
 which is the full eval set). `prepare.py` preserves every Skills
 field (`index`, `context`, `question`, `choice_A..D`, `expected_answer`,
-`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens` via
-tiktoken `cl100k_base`) and additionally emits `options` and
-`grading_mode` for the mcqa server.
+`domain`, `sub_domain`, `difficulty`, `length`, `context_tokens`) and
+additionally emits `options` and `grading_mode` for the mcqa server.
+
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longbench_v2_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longbench_v2_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`). LongBench-v2 contexts
+span 8k-2M words, so the long-bucket rows above 1M tokens are filtered
+out under the N3 1M cap.
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/longbench_v2/prepare.py \
+    --tokenizer_name cl100k_base \
+    --max_context_tokens 131072 \
+    --output_fpath benchmarks/longbench_v2/data/longbench_v2_cl100k_128k_benchmark.jsonl
+```
 
 ## Example usage
 
 ```bash
-# Prepare benchmark data
+# Prepare benchmark data (default)
 ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config.yaml]"
 
+# Prepare benchmark data (N3 1M variant)
+ng_prepare_benchmark "+config_paths=[benchmarks/longbench_v2/config_n3_1m.yaml]"
+
 # Running servers
 config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
 benchmarks/longbench_v2/config.yaml"
 ng_run "+config_paths=[$config_paths]"
 
-# Collecting rollouts
+# Collecting rollouts — default
 ng_collect_rollouts \
     +agent_name=longbench_v2_mcqa_simple_agent \
     +input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_benchmark.jsonl \
     +output_jsonl_fpath=results/longbench_v2_rollouts.jsonl \
     +prompt_config=benchmarks/longbench_v2/prompts/default.yaml \
     +num_repeats=4
+
+# Collecting rollouts — N3 1M (run ng_run with config_n3_1m.yaml in place of config.yaml)
+ng_collect_rollouts \
+    +agent_name=longbench_v2_n3_1m_mcqa_simple_agent \
+    +input_jsonl_fpath=benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl \
+    +output_jsonl_fpath=results/longbench_v2_n3_1m_rollouts.jsonl \
+    +prompt_config=benchmarks/longbench_v2/prompts/default.yaml \
+    +num_repeats=4
 ```
diff --git a/benchmarks/longbench_v2/config_n3_1m.yaml b/benchmarks/longbench_v2/config_n3_1m.yaml
new file mode 100644
index 000000000..d532cc078
--- /dev/null
+++ b/benchmarks/longbench_v2/config_n3_1m.yaml
@@ -0,0 +1,18 @@
+# LongBench-v2 — N3 1M-context variant.
+# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts
+# `context_tokens` with the Nemotron-3-Super HF tokenizer and drops
+# samples whose tokenized context exceeds 1048576 tokens.
+config_paths:
+  - resources_servers/mcqa/configs/mcqa.yaml
+
+longbench_v2_n3_1m_mcqa_simple_agent:
+  _inherit_from: mcqa_simple_agent
+  responses_api_agents:
+    simple_agent:
+      datasets:
+      - name: longbench_v2_n3_1m
+        type: benchmark
+        jsonl_fpath: benchmarks/longbench_v2/data/longbench_v2_n3_1m_benchmark.jsonl
+        prompt_config: benchmarks/longbench_v2/prompts/default.yaml
+        prepare_script: benchmarks/longbench_v2/prepare_n3_1m.py
+        license: Apache 2.0
diff --git a/benchmarks/longbench_v2/prepare.py b/benchmarks/longbench_v2/prepare.py
index 020721f8b..702b73878 100644
--- a/benchmarks/longbench_v2/prepare.py
+++ b/benchmarks/longbench_v2/prepare.py
@@ -28,10 +28,28 @@
 
 Dataset: https://huggingface.co/datasets/THUDM/LongBench-v2
 Paper:   https://arxiv.org/abs/2412.15204
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the
+``context_tokens`` field, with no length filter. For an N3 1M-context
+variant that filters to fit, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+    python benchmarks/longbench_v2/prepare.py \\
+        --tokenizer_name cl100k_base \\
+        --max_context_tokens 131072
 """
 
+import argparse
 import json
 from pathlib import Path
+from typing import Callable, Optional
 
 import tiktoken
 from datasets import load_dataset
@@ -40,23 +58,53 @@
 
 BENCHMARK_DIR = Path(__file__).parent
 DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "longbench_v2_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None  # no filter by default
+
 
-# tiktoken encoding name used by Skills' prepare.py for `context_tokens`.
-TOKENIZER_NAME = "cl100k_base"
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+    """Return a ``text -> token_count`` function.
 
+    Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+    encoding, falls back to ``transformers.AutoTokenizer``. The tiktoken
+    path uses ``disallowed_special=()`` because LongBench-v2 contexts
+    sometimes contain raw ``<|endoftext|>`` strings that tiktoken would
+    otherwise refuse to encode.
+    """
+    try:
+        enc = tiktoken.get_encoding(tokenizer_name)
+        return lambda text: len(enc.encode(text, disallowed_special=()))
+    except ValueError:
+        from transformers import AutoTokenizer
 
-def prepare() -> Path:
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+        return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+    tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+    max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+    output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
     """Download LongBench-v2, convert to Gym JSONL, return the output file path."""
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
+    output_fpath = Path(output_fpath)
+    output_fpath.parent.mkdir(parents=True, exist_ok=True)
 
-    print(f"Loading THUDM/LongBench-v2 (split='train', {TOKENIZER_NAME} for context tokens) ...")
+    print(f"Loading THUDM/LongBench-v2 (split='train', tokenizer='{tokenizer_name}') ...")
     dataset = load_dataset("THUDM/LongBench-v2", split="train")
-    encoder = tiktoken.get_encoding(TOKENIZER_NAME)
+    count_tokens = _build_token_counter(tokenizer_name)
+
+    kept = 0
+    skipped = 0
+    with open(output_fpath, "w", encoding="utf-8") as out:
+        for entry in tqdm(dataset, desc=f"Writing {output_fpath.name}"):
+            context_tokens = count_tokens(entry["context"])
+            if max_context_tokens is not None and context_tokens > max_context_tokens:
+                skipped += 1
+                continue
 
-    count = 0
-    with open(OUTPUT_FPATH, "w", encoding="utf-8") as out:
-        for entry in tqdm(dataset, desc="Writing longbench_v2_benchmark.jsonl"):
             record = {
                 # Fields preserved verbatim from Skills' prepare.py
                 "index": entry["_id"],
@@ -71,11 +119,7 @@ def prepare() -> Path:
                 "sub_domain": entry["sub_domain"],
                 "difficulty": entry["difficulty"],
                 "length": entry["length"],
-                # disallowed_special=() — some LongBench-v2 contexts contain
-                # raw `<|endoftext|>` strings that tiktoken would otherwise
-                # refuse to encode. We only need the count, so encode them
-                # as plain text.
-                "context_tokens": len(encoder.encode(entry["context"], disallowed_special=())),
+                "context_tokens": context_tokens,
                 # Gym-side additions consumed by the `mcqa` resource server.
                 # mcqa's verify() reads `options`, `expected_answer`, `grading_mode`.
                 "options": [
@@ -87,11 +131,48 @@ def prepare() -> Path:
                 "grading_mode": "strict_single_letter_boxed",
             }
             out.write(json.dumps(record, ensure_ascii=False) + "\n")
-            count += 1
+            kept += 1
+
+    cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+    print(
+        f"Wrote {kept} problems to {output_fpath} "
+        f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped} over cap)"
+    )
+    return output_fpath
+
 
-    print(f"Wrote {count} problems to {OUTPUT_FPATH}")
-    return OUTPUT_FPATH
+def _parse_args() -> argparse.Namespace:
+    """Parse the options for a direct script run; the module docstring is shown verbatim as the description."""
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--tokenizer_name",
+        default=DEFAULT_TOKENIZER_NAME,
+        help=(
+            "Tokenizer used for the context_tokens count and length filter. "
+            "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') "
+            "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+            f"Default: {DEFAULT_TOKENIZER_NAME}"
+        ),
+    )
+    parser.add_argument(
+        "--max_context_tokens",
+        type=int,
+        default=DEFAULT_MAX_CONTEXT_TOKENS,
+        help=(
+            "Drop samples whose tokenized context exceeds this many tokens. "
+            f"Omit (or pass a negative number) for no filter. Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+        ),
+    )
+    parser.add_argument(
+        "--output_fpath",
+        type=Path,
+        default=DEFAULT_OUTPUT_FPATH,
+        help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+    )
+    return parser.parse_args()
 
 
 if __name__ == "__main__":
-    prepare()
+    args = _parse_args()
+    cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+    prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/longbench_v2/prepare_n3_1m.py b/benchmarks/longbench_v2/prepare_n3_1m.py
new file mode 100644
index 000000000..4959653ab
--- /dev/null
+++ b/benchmarks/longbench_v2/prepare_n3_1m.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LongBench-v2 variant: Nemotron-3-Super tokenizer with a 1M context cap.
+
+Same data + fields as ``prepare.py``, but counts ``context_tokens``
+with the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized context exceeds 1048576
+tokens (Nemotron-3-Super's native 1M context window). LongBench-v2
+contexts span 8k-2M words, so the long-bucket rows above 1M tokens
+are filtered out.
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"  # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "longbench_v2_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+    return _prepare(
+        tokenizer_name=TOKENIZER_NAME,
+        max_context_tokens=MAX_CONTEXT_TOKENS,
+        output_fpath=OUTPUT_FPATH,
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/benchmarks/longcodebench/README.md b/benchmarks/longcodebench/README.md
index 64ca598d1..60192ab36 100644
--- a/benchmarks/longcodebench/README.md
+++ b/benchmarks/longcodebench/README.md
@@ -12,22 +12,53 @@ the long code prompt plus the postfix; the shared
 wraps it as a single user message, mirroring NeMo Skills' `prompt_format=openai`
 behaviour.
 
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/longcodebench_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/longcodebench_n3_1m_benchmark.jsonl` |
+
+The N3 1M variant requires HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/longcodebench/prepare.py \
+    --tokenizer_name cl100k_base \
+    --max_context_tokens 131072 \
+    --output_fpath benchmarks/longcodebench/data/longcodebench_cl100k_128k_benchmark.jsonl
+```
+
 ## Example usage
 
 ```bash
-# Prepare benchmark data
+# Prepare benchmark data (default)
 ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config.yaml]"
 
+# Prepare benchmark data (N3 1M variant)
+ng_prepare_benchmark "+config_paths=[benchmarks/longcodebench/config_n3_1m.yaml]"
+
 # Running servers
 config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
 benchmarks/longcodebench/config.yaml"
 ng_run "+config_paths=[$config_paths]"
 
-# Collecting rollouts
+# Collecting rollouts — default
 ng_collect_rollouts \
     +agent_name=longcodebench_mcqa_simple_agent \
     +input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_benchmark.jsonl \
     +output_jsonl_fpath=results/longcodebench_rollouts.jsonl \
     +prompt_config=benchmarks/prompts/generic/default.yaml \
     +num_repeats=4
+
+# Collecting rollouts — N3 1M (run servers with config_n3_1m.yaml, which defines this agent)
+ng_collect_rollouts \
+    +agent_name=longcodebench_n3_1m_mcqa_simple_agent \
+    +input_jsonl_fpath=benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl \
+    +output_jsonl_fpath=results/longcodebench_n3_1m_rollouts.jsonl \
+    +prompt_config=benchmarks/prompts/generic/default.yaml \
+    +num_repeats=4
 ```
diff --git a/benchmarks/longcodebench/config_n3_1m.yaml b/benchmarks/longcodebench/config_n3_1m.yaml
new file mode 100644
index 000000000..bc1ed0807
--- /dev/null
+++ b/benchmarks/longcodebench/config_n3_1m.yaml
@@ -0,0 +1,18 @@
+# LongCodeBench — N3 1M-context variant.
+# Same data + fields as `config.yaml`, but `prepare_n3_1m.py` counts
+# `n_tokens` with the Nemotron-3-Super HF tokenizer and drops samples
+# whose tokenized prompt exceeds 1048576 tokens.
+config_paths:
+  - resources_servers/mcqa/configs/mcqa.yaml
+
+longcodebench_n3_1m_mcqa_simple_agent:
+  _inherit_from: mcqa_simple_agent
+  responses_api_agents:
+    simple_agent:
+      datasets:
+      - name: longcodebench_n3_1m
+        type: benchmark
+        jsonl_fpath: benchmarks/longcodebench/data/longcodebench_n3_1m_benchmark.jsonl
+        prompt_config: benchmarks/prompts/generic/default.yaml
+        prepare_script: benchmarks/longcodebench/prepare_n3_1m.py
+        license: Creative Commons Attribution 4.0 International
diff --git a/benchmarks/longcodebench/prepare.py b/benchmarks/longcodebench/prepare.py
index db8a90cb0..a650c2a53 100644
--- a/benchmarks/longcodebench/prepare.py
+++ b/benchmarks/longcodebench/prepare.py
@@ -27,20 +27,35 @@
 purely to populate the server's `allowed_letters` set; the option text is not
 used for grading because the postfix forces a `\\boxed{X}` answer.
 
-Skills' prepare also stores a `n_tokens_cl100k_base` field counted with
-tiktoken. The mcqa verifier never reads it; we omit it on the Gym side to
-avoid pulling tiktoken into Gym's main dependency set just for one
-benchmark's metadata column.
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For the Nemotron-3-Super variant that
+drops prompts over 1048576 tokens, see ``prepare_n3_1m.py`` and
+``config_n3_1m.yaml``.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+    python benchmarks/longcodebench/prepare.py \\
+        --tokenizer_name cl100k_base \\
+        --max_context_tokens 131072
 """
 
+import argparse
 import json
 import uuid
 from pathlib import Path
+from typing import Callable, Optional
+
+import tiktoken
 
 
 BENCHMARK_DIR = Path(__file__).parent
 DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "longcodebench_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "longcodebench_benchmark.jsonl"
 OPTION_LETTERS = ("A", "B", "C", "D")
 
 POSTFIX = (
@@ -48,25 +63,58 @@
     "'Answer: \\boxed{A/B/C/D}' (e.g. 'Answer: \\boxed{A}')."
 )
 
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None  # no filter by default
+
+
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+    """Return a ``text -> token_count`` function.
 
-def prepare() -> Path:
+    Tries ``tiktoken.get_encoding`` first; falls back to
+    ``transformers.AutoTokenizer`` for HuggingFace model ids.
+    """
+    try:
+        enc = tiktoken.get_encoding(tokenizer_name)
+        return lambda text: len(enc.encode(text, disallowed_special=()))
+    except ValueError:
+        from transformers import AutoTokenizer
+
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+        return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
+
+
+def prepare(
+    tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+    max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+    output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
     """Download LongCodeBench LongCodeQA from HuggingFace and write Gym JSONL."""
     from datasets import load_dataset
 
-    print("Downloading LongCodeBench LongCodeQA from HuggingFace...")
+    output_fpath = Path(output_fpath)
+    output_fpath.parent.mkdir(parents=True, exist_ok=True)
+
+    print(f"Downloading LongCodeBench LongCodeQA (tokenizer='{tokenizer_name}') ...")
     ds = load_dataset("json", data_files="hf://datasets/Steefano/LCB/LongCodeQA.zip")
     data = ds["train"]
 
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
+    count_tokens = _build_token_counter(tokenizer_name)
 
     # Empty-text option dicts: the mcqa server only consumes the option *keys*
     # for `strict_single_letter_boxed` grading; option text is irrelevant since
     # the prompt postfix forces the model to emit `\boxed{<letter>}`.
     options = [{letter: ""} for letter in OPTION_LETTERS]
 
+    kept = 0
+    skipped = 0
     rows = []
     for entry in data:
         question = entry["prompt"].strip() + POSTFIX
+        n_tokens = count_tokens(question)
+        if max_context_tokens is not None and n_tokens > max_context_tokens:
+            skipped += 1
+            continue
+
         row = {
             "question": question,
             "options": options,
@@ -76,15 +124,54 @@ def prepare() -> Path:
             "repo": entry["repo"],
             "prompt_goal": entry["prompt_goal"],
             "is_hard": entry["is_hard"],
+            "n_tokens": n_tokens,
         }
         rows.append(json.dumps(row) + "\n")
+        kept += 1
 
-    with open(OUTPUT_FPATH, "w", encoding="utf-8") as f:
+    with open(output_fpath, "w", encoding="utf-8") as f:
         f.writelines(rows)
 
-    print(f"Wrote {len(rows)} problems to {OUTPUT_FPATH}")
-    return OUTPUT_FPATH
+    cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+    print(
+        f"Wrote {kept} problems to {output_fpath} "
+        f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped} over cap)"
+    )
+    return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--tokenizer_name",
+        default=DEFAULT_TOKENIZER_NAME,
+        help=(
+            "Tokenizer used for the n_tokens count and length filter. "
+            "Accepts a tiktoken encoding name (e.g. 'cl100k_base', 'o200k_base') "
+            "or a HuggingFace model id (e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+            f"Default: {DEFAULT_TOKENIZER_NAME}"
+        ),
+    )
+    parser.add_argument(
+        "--max_context_tokens",
+        type=int,
+        default=DEFAULT_MAX_CONTEXT_TOKENS,
+        help=(
+            "Drop samples whose tokenized prompt exceeds this many tokens. "
+            "Omit (or pass a negative number) for no filter. "
+            f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+        ),
+    )
+    parser.add_argument(
+        "--output_fpath",
+        type=Path,
+        default=DEFAULT_OUTPUT_FPATH,
+        help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+    )
+    return parser.parse_args()
 
 
 if __name__ == "__main__":
-    prepare()
+    args = _parse_args()
+    cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+    prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/longcodebench/prepare_n3_1m.py b/benchmarks/longcodebench/prepare_n3_1m.py
new file mode 100644
index 000000000..0cb6f9fcb
--- /dev/null
+++ b/benchmarks/longcodebench/prepare_n3_1m.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LongCodeBench variant: Nemotron-3-Super tokenizer with a 1M context cap.
+
+Same data + fields as ``prepare.py``, but counts ``n_tokens`` with the
+``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace tokenizer
+and drops samples whose tokenized prompt exceeds 1048576 tokens
+(Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"  # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "longcodebench_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+    return _prepare(
+        tokenizer_name=TOKENIZER_NAME,
+        max_context_tokens=MAX_CONTEXT_TOKENS,
+        output_fpath=OUTPUT_FPATH,
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/benchmarks/mrcr/README.md b/benchmarks/mrcr/README.md
index 535bf9002..d31ae8ec0 100644
--- a/benchmarks/mrcr/README.md
+++ b/benchmarks/mrcr/README.md
@@ -8,17 +8,39 @@ to the Nth occurrence and reproduce it exactly" instruction. Scoring:
 `SequenceMatcher.ratio()` between stripped response and stripped expected
 answer, gated on the response starting with the random prefix.
 
+## Variants
+
+| Variant | Config | Prepare script | Tokenizer | Max tokens | Output |
+|---|---|---|---|---|---|
+| Default | `config.yaml` | `prepare.py` | `o200k_base` (tiktoken) | none (no filter) | `data/mrcr_benchmark.jsonl` |
+| N3 128k | `config_n3_128k.yaml` | `prepare_n3_128k.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `131072` | `data/mrcr_n3_128k_benchmark.jsonl` |
+| N3 1M | `config_n3_1m.yaml` | `prepare_n3_1m.py` | `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (HF) | `1048576` | `data/mrcr_n3_1m_benchmark.jsonl` |
+
+The N3 variants require HF auth for the gated NVIDIA repo
+(`HF_TOKEN` env or `huggingface-cli login`).
+
+For one-off custom builds (different tokenizer / cap / output path),
+invoke `prepare.py` directly:
+
+```bash
+python benchmarks/mrcr/prepare.py \
+    --tokenizer_name nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \
+    --max_context_tokens 131072 \
+    --output_fpath benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl
+```
+
 ## Prepare benchmark data
 
 ```bash
+# Default (o200k_base, no filter)
 ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config.yaml]"
-```
 
-Downloads the HF dataset, token-counts each sample with `tiktoken o200k_base`,
-and writes `benchmarks/mrcr/data/mrcr_benchmark.jsonl`. Samples over 200000
-input tokens are dropped to leave headroom for model-side tokenizers (which
-can be 7–10% heavier than tiktoken) to stay under a 262144-token native
-context.
+# N3 128k variant
+ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_128k.yaml]"
+
+# N3 1M variant
+ng_prepare_benchmark "+config_paths=[benchmarks/mrcr/config_n3_1m.yaml]"
+```
 
 ## Start environment
 
@@ -29,11 +51,26 @@ ng_run "+config_paths=[benchmarks/mrcr/config.yaml,responses_api_models/vllm_mod
 ## Collect rollouts
 
 ```bash
+# Default variant
 ng_collect_rollouts \
     +agent_name=mrcr_benchmark_simple_agent \
     +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_benchmark.jsonl \
     +output_jsonl_fpath=results/mrcr_rollouts.jsonl \
     +num_repeats=4
+
+# N3 128k variant (run servers with config_n3_128k.yaml, which defines this agent)
+ng_collect_rollouts \
+    +agent_name=mrcr_n3_128k_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl \
+    +output_jsonl_fpath=results/mrcr_n3_128k_rollouts.jsonl \
+    +num_repeats=4
+
+# N3 1M variant (run servers with config_n3_1m.yaml, which defines this agent)
+ng_collect_rollouts \
+    +agent_name=mrcr_n3_1m_benchmark_simple_agent \
+    +input_jsonl_fpath=benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl \
+    +output_jsonl_fpath=results/mrcr_n3_1m_rollouts.jsonl \
+    +num_repeats=4
 ```
 
 ## Metrics
diff --git a/benchmarks/mrcr/config_n3_128k.yaml b/benchmarks/mrcr/config_n3_128k.yaml
new file mode 100644
index 000000000..362aa9a24
--- /dev/null
+++ b/benchmarks/mrcr/config_n3_128k.yaml
@@ -0,0 +1,28 @@
+# MRCR — Nemotron-3-Super 128k-context variant.
+# Same data + grading as `config.yaml`, but `prepare_n3_128k.py` counts
+# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose
+# tokenized conversation exceeds 131072 tokens.
+config_paths:
+  - resources_servers/mrcr/configs/mrcr.yaml
+
+mrcr_n3_128k_benchmark_resources_server:
+  _inherit_from: mrcr_resources_server
+
+mrcr_n3_128k_benchmark_simple_agent:
+  _inherit_from: mrcr_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: mrcr_n3_128k_benchmark_resources_server
+      datasets:
+      - name: mrcr_n3_128k
+        type: benchmark
+        jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_128k_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/mrcr/prepare_n3_128k.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/mrcr/config_n3_1m.yaml b/benchmarks/mrcr/config_n3_1m.yaml
new file mode 100644
index 000000000..c1ab257db
--- /dev/null
+++ b/benchmarks/mrcr/config_n3_1m.yaml
@@ -0,0 +1,28 @@
+# MRCR — Nemotron-3-Super 1M-context variant.
+# Same data + grading as `config.yaml`, but `prepare_n3_1m.py` counts
+# tokens with the Nemotron-3-Super HF tokenizer and drops samples whose
+# tokenized conversation exceeds 1048576 tokens.
+config_paths:
+  - resources_servers/mrcr/configs/mrcr.yaml
+
+mrcr_n3_1m_benchmark_resources_server:
+  _inherit_from: mrcr_resources_server
+
+mrcr_n3_1m_benchmark_simple_agent:
+  _inherit_from: mrcr_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: mrcr_n3_1m_benchmark_resources_server
+      datasets:
+      - name: mrcr_n3_1m
+        type: benchmark
+        jsonl_fpath: benchmarks/mrcr/data/mrcr_n3_1m_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/mrcr/prepare_n3_1m.py
+        # Rollouts per task for pass@k variance.
+        # NOTE: for `type: benchmark` datasets, `num_repeats` here is a
+        # placeholder — it only triggers row duplication for
+        # `type: train`/`validation`. To actually get N rollouts per task,
+        # pass `+num_repeats=N` on the `ng_collect_rollouts` CLI.
+        num_repeats: 1
diff --git a/benchmarks/mrcr/prepare.py b/benchmarks/mrcr/prepare.py
index 0bcfad81a..2ef8e767b 100644
--- a/benchmarks/mrcr/prepare.py
+++ b/benchmarks/mrcr/prepare.py
@@ -20,18 +20,30 @@
     https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/mrcr/prepare.py
 
 Each row in the upstream dataset has a `prompt` field that is a JSON-stringified
-list of OpenAI chat messages. We parse it into `responses_create_params.input`,
-count tokens with tiktoken `o200k_base` (same tokenizer used by the official
-MRCR grading setup), and filter to samples that fit in the model context.
-
-The 200000-token cap leaves headroom for tokenizer drift: a model's own
-tokenizer can produce ~7-10% more tokens than tiktoken `o200k_base`, so
-filtering at 200K tiktoken keeps the model-side worst-case near 220K, which
-combined with ~32K generation stays under a 262144-token native context.
+list of OpenAI chat messages. We parse it into `responses_create_params.input`
+and count tokens by summing the per-message tokenized lengths.
+
+Defaults: tokenizer ``o200k_base`` (tiktoken) for the ``n_tokens``
+field, with no length filter. For the capped variants that use the
+Nemotron-3-Super HF tokenizer, see ``prepare_n3_128k.py`` /
+``prepare_n3_1m.py`` and their ``config_n3_*.yaml`` files.
+
+Invocation
+----------
+
+``ng_prepare_benchmark`` calls ``prepare()`` with no arguments, using
+the defaults below. To build a custom variant, run this script
+directly::
+
+    python benchmarks/mrcr/prepare.py \\
+        --tokenizer_name nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 \\
+        --max_context_tokens 131072
 """
 
+import argparse
 import json
 from pathlib import Path
+from typing import Callable, Optional
 
 import tiktoken
 from datasets import load_dataset
@@ -40,30 +52,55 @@
 
 BENCHMARK_DIR = Path(__file__).parent
 DATA_DIR = BENCHMARK_DIR / "data"
-OUTPUT_FPATH = DATA_DIR / "mrcr_benchmark.jsonl"
+DEFAULT_OUTPUT_FPATH = DATA_DIR / "mrcr_benchmark.jsonl"
+
+DEFAULT_TOKENIZER_NAME = "o200k_base"
+DEFAULT_MAX_CONTEXT_TOKENS: Optional[int] = None  # no filter by default
 
-MAX_CONTEXT_TOKENS = 200000
 
+def _build_token_counter(tokenizer_name: str) -> Callable[[str], int]:
+    """Return a ``text -> token_count`` function.
 
-def _count_tokens(messages: list[dict]) -> int:
-    """Token count using the o200k_base tokenizer — same as Skills prepare."""
-    enc = tiktoken.get_encoding("o200k_base")
-    return sum(len(enc.encode(m["content"])) for m in messages)
+    Tries ``tiktoken.get_encoding`` first; if the name isn't a tiktoken
+    encoding, falls back to ``transformers.AutoTokenizer``.
+    """
+    try:
+        enc = tiktoken.get_encoding(tokenizer_name)
+        return lambda text: len(enc.encode(text, disallowed_special=()))
+    except ValueError:
+        from transformers import AutoTokenizer
 
+        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True)
+        return lambda text: len(hf_tokenizer.encode(text, add_special_tokens=False))
 
-def prepare() -> Path:
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+def _count_message_tokens(messages: list[dict], count_one: Callable[[str], int]) -> int:
+    """Sum tokens across every message's ``content`` field.
+
+    Matches the per-message summing used by ``nemo_skills/dataset/mrcr/prepare.py``
+    and the official openai/mrcr grading setup.
+    """
+    return sum(count_one(m["content"]) for m in messages)
+
+
+def prepare(
+    tokenizer_name: str = DEFAULT_TOKENIZER_NAME,
+    max_context_tokens: Optional[int] = DEFAULT_MAX_CONTEXT_TOKENS,
+    output_fpath: Path = DEFAULT_OUTPUT_FPATH,
+) -> Path:
+    output_fpath = Path(output_fpath)
+    output_fpath.parent.mkdir(parents=True, exist_ok=True)
 
     dataset = load_dataset("openai/mrcr", split="train")
+    count_one = _build_token_counter(tokenizer_name)
 
     kept = 0
     skipped_tokens = 0
-    with OUTPUT_FPATH.open("w", encoding="utf-8") as fout:
-        for idx, entry in tqdm(enumerate(dataset), desc="Preparing MRCR"):
+    with output_fpath.open("w", encoding="utf-8") as fout:
+        for entry in tqdm(dataset, desc="Preparing MRCR"):
             messages = json.loads(entry["prompt"])
-
-            n_tokens = _count_tokens(messages)
-            if n_tokens > MAX_CONTEXT_TOKENS:
+            n_tokens = _count_message_tokens(messages, count_one)
+            if max_context_tokens is not None and n_tokens > max_context_tokens:
                 skipped_tokens += 1
                 continue
 
@@ -77,9 +114,46 @@ def prepare() -> Path:
             fout.write(json.dumps(sample) + "\n")
             kept += 1
 
-    print(f"Wrote {kept} samples to {OUTPUT_FPATH} (skipped {skipped_tokens} with >{MAX_CONTEXT_TOKENS} tokens)")
-    return OUTPUT_FPATH
+    cap_str = "none" if max_context_tokens is None else str(max_context_tokens)
+    print(
+        f"Wrote {kept} samples to {output_fpath} "
+        f"(tokenizer={tokenizer_name}, cap={cap_str}; dropped {skipped_tokens} over cap)"
+    )
+    return output_fpath
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--tokenizer_name",
+        default=DEFAULT_TOKENIZER_NAME,
+        help=(
+            "Tokenizer used for token counting. Accepts a tiktoken encoding name "
+            "(e.g. 'cl100k_base', 'o200k_base') or a HuggingFace model id "
+            "(e.g. 'meta-llama/Llama-3.1-8B-Instruct'). "
+            f"Default: {DEFAULT_TOKENIZER_NAME}"
+        ),
+    )
+    parser.add_argument(
+        "--max_context_tokens",
+        type=int,
+        default=DEFAULT_MAX_CONTEXT_TOKENS,
+        help=(
+            "Drop samples whose tokenized conversation exceeds this many tokens. "
+            "Omit (or pass a negative number) for no filter. "
+            f"Default: {DEFAULT_MAX_CONTEXT_TOKENS}"
+        ),
+    )
+    parser.add_argument(
+        "--output_fpath",
+        type=Path,
+        default=DEFAULT_OUTPUT_FPATH,
+        help=f"Output JSONL path. Default: {DEFAULT_OUTPUT_FPATH}",
+    )
+    return parser.parse_args()
 
 
 if __name__ == "__main__":
-    prepare()
+    args = _parse_args()
+    cap = args.max_context_tokens if (args.max_context_tokens is None or args.max_context_tokens >= 0) else None
+    prepare(tokenizer_name=args.tokenizer_name, max_context_tokens=cap, output_fpath=args.output_fpath)
diff --git a/benchmarks/mrcr/prepare_n3_128k.py b/benchmarks/mrcr/prepare_n3_128k.py
new file mode 100644
index 000000000..c75e0faaa
--- /dev/null
+++ b/benchmarks/mrcr/prepare_n3_128k.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MRCR variant: Nemotron-3-Super tokenizer with a 128k token cap.
+
+Same data + grading as ``prepare.py``, but counts ``n_tokens`` with
+the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized conversation exceeds
+131072 tokens.
+
+Paired with ``config_n3_128k.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"  # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 131072
+OUTPUT_FPATH = Path(__file__).parent / "data" / "mrcr_n3_128k_benchmark.jsonl"
+
+
+def prepare() -> Path:
+    return _prepare(
+        tokenizer_name=TOKENIZER_NAME,
+        max_context_tokens=MAX_CONTEXT_TOKENS,
+        output_fpath=OUTPUT_FPATH,
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/benchmarks/mrcr/prepare_n3_1m.py b/benchmarks/mrcr/prepare_n3_1m.py
new file mode 100644
index 000000000..ffcbcdcb6
--- /dev/null
+++ b/benchmarks/mrcr/prepare_n3_1m.py
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MRCR variant: Nemotron-3-Super tokenizer with a 1M token cap.
+
+Same data + grading as ``prepare.py``, but counts ``n_tokens`` with
+the ``nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16`` HuggingFace
+tokenizer and drops samples whose tokenized conversation exceeds
+1048576 tokens (Nemotron-3-Super's native 1M context window).
+
+Paired with ``config_n3_1m.yaml``. Requires HF auth for the gated
+NVIDIA repo (``HF_TOKEN`` env or ``huggingface-cli login``).
+"""
+
+from pathlib import Path
+
+from .prepare import prepare as _prepare
+
+
+TOKENIZER_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"  # pragma: allowlist secret
+MAX_CONTEXT_TOKENS = 1048576
+OUTPUT_FPATH = Path(__file__).parent / "data" / "mrcr_n3_1m_benchmark.jsonl"
+
+
+def prepare() -> Path:
+    return _prepare(
+        tokenizer_name=TOKENIZER_NAME,
+        max_context_tokens=MAX_CONTEXT_TOKENS,
+        output_fpath=OUTPUT_FPATH,
+    )
+
+
+if __name__ == "__main__":
+    prepare()
diff --git a/resources_servers/graphwalks/README.md b/resources_servers/graphwalks/README.md
new file mode 100644
index 000000000..5f11f4de5
--- /dev/null
+++ b/resources_servers/graphwalks/README.md
@@ -0,0 +1,46 @@
+# GraphWalks resources server
+
+OpenAI's [GraphWalks](https://huggingface.co/datasets/openai/graphwalks)
+long-context benchmark. Each task provides an adjacency list (often
+massive) and asks the model either to:
+
+- **parents**: list every parent of a target node, or
+- **bfs**: list every node reachable at exactly depth N via BFS from a
+  source node.
+
+## Scoring
+
+1. The model must end its response with a line of the form
+   `Final Answer: [n1, n2, ...]`. If the format is missing,
+   `parse_failed=True` and reward=0.
+2. Otherwise reward is the **F1 score** between the predicted node
+   set and the expected node set (continuous in [0, 1]):
+   - both empty → 1.0
+   - one empty (the other non-empty) → 0.0
+   - else `2·P·R / (P + R)`
+
+Grader ported from
+https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluator/graphwalks.py.
+
+## Start environment
+
+```bash
+ng_run "+config_paths=[resources_servers/graphwalks/configs/graphwalks.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"
+```
+
+## Collect example rollouts
+
+```bash
+ng_collect_rollouts \
+    +agent_name=graphwalks_simple_agent \
+    +input_jsonl_fpath=resources_servers/graphwalks/data/example.jsonl \
+    +output_jsonl_fpath=resources_servers/graphwalks/data/example_rollouts.jsonl
+```
+
+For the full benchmark run see
+[`benchmarks/graphwalks/README.md`](../../benchmarks/graphwalks/README.md).
+
+## Licensing
+
+- Code: Apache 2.0
+- Data ([openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks)): see upstream license
diff --git a/resources_servers/graphwalks/app.py b/resources_servers/graphwalks/app.py
new file mode 100644
index 000000000..6ab73f2d5
--- /dev/null
+++ b/resources_servers/graphwalks/app.py
@@ -0,0 +1,170 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraphWalks (OpenAI) resources server.
+
+Implements F1-over-node-sets grading from the official
+[openai/graphwalks](https://huggingface.co/datasets/openai/graphwalks)
+benchmark. Each task asks the model either to (a) list the parents of a
+node or (b) return BFS-reachable nodes at exactly a given depth.
+
+Ported from:
+    https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/evaluation/evaluator/graphwalks.py
+
+Scoring:
+  - Parse the model's final line for ``Final Answer: [n1, n2, ...]``.
+    If absent, ``parse_failed=True`` and reward=0.
+  - Otherwise compute F1 between the predicted node set and the
+    expected node set. Empty-vs-empty matches as F1=1.0; either
+    side empty (with the other non-empty) is F1=0.
+  - Reward is the F1 score in [0, 1] — continuous, like MRCR.
+"""
+
+import json
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from nemo_gym.base_resources_server import (
+    BaseResourcesServerConfig,
+    BaseVerifyRequest,
+    BaseVerifyResponse,
+    SimpleResourcesServer,
+)
+from nemo_gym.reward_profile import (
+    compute_pass_majority_metrics,
+    compute_subset_metrics,
+    highest_k_metrics,
+)
+
+
+class GraphWalksResourcesServerConfig(BaseResourcesServerConfig):  # adds no fields of its own
+    pass
+
+
+class GraphWalksVerifyRequest(BaseVerifyRequest):  # request body consumed by verify()
+    expected_answer: str  # JSON text; verify() decodes it with json.loads into the expected node set
+    problem_type: str  # subset key used by compute_metrics (the example data uses "parents" and "bfs")
+    n_tokens: Optional[int] = None  # optional, not read in this module; presumably prompt tokens (TODO confirm)
+    prompt_chars: Optional[int] = None  # optional, not read in this module; presumably prompt chars (TODO confirm)
+
+
+class GraphWalksVerifyResponse(GraphWalksVerifyRequest, BaseVerifyResponse):  # request fields + grading output
+    f1: float  # value computed by _f1_score; verify() returns the same number as `reward`
+    parse_failed: bool  # True when the last non-empty line has no "Final Answer: [...]"
+    predicted_nodes: List[str]  # node names parsed from the response, in the order they were listed
+
+
+class GraphWalksResourcesServer(SimpleResourcesServer):
+    """Grade GraphWalks responses by F1 over node sets and aggregate pass@k metrics."""
+
+    config: GraphWalksResourcesServerConfig
+
+    async def verify(self, body: GraphWalksVerifyRequest) -> GraphWalksVerifyResponse:
+        """Parse the response's final-answer line and reward it with its F1 against ``expected_answer``."""
+        predicted_nodes, parse_failed = _parse_node_list(body.response.output_text)
+        try:
+            decoded = json.loads(body.expected_answer)
+        except (json.JSONDecodeError, TypeError):
+            decoded = None
+        # A JSON array is the only valid node list; stringify items since parsed predictions are strings.
+        expected_nodes = {str(node) for node in decoded} if isinstance(decoded, list) else set()
+        f1 = _f1_score(set(predicted_nodes), expected_nodes, parse_failed)
+        return GraphWalksVerifyResponse(
+            **body.model_dump(),
+            reward=f1,
+            f1=f1,
+            parse_failed=parse_failed,
+            predicted_nodes=predicted_nodes,
+        )
+
+    # ── Aggregate metrics overrides ──
+
+    @staticmethod
+    def _score_fn(r: Dict[str, Any]) -> Dict[str, Union[float, bool]]:
+        """Expose a rollout's reward (the F1 score) under the ``accuracy`` metric name."""
+        return {"accuracy": r["reward"]}
+
+    def compute_metrics(self, tasks: List[List[Dict[str, Any]]]) -> Dict[str, Any]:
+        """Pass@k plus a per-`problem_type` subset breakdown.
+
+        F1 is a continuous score in [0, 1] so pass@k is max-of-k (not
+        combinatorial). majority@k is not meaningful (no discrete
+        extracted answer) — `answer_key` is left None.
+        """
+        metrics, _, _, _ = compute_pass_majority_metrics(tasks, score_fn=self._score_fn)
+        subset_metrics = compute_subset_metrics(tasks, subset_key="problem_type", score_fn=self._score_fn)
+        # compute_subset_metrics emits keys like "<value>/pass@k/accuracy"; prefix the field name
+        # so each key stays self-describing: "problem_type=<value>/pass@k/accuracy".
+        metrics.update({(f"problem_type={k}" if "/" in k else k): v for k, v in subset_metrics.items()})
+        return metrics
+
+    def get_key_metrics(self, agent_metrics: Dict[str, Any]) -> Dict[str, Any]:
+        """Return the token-usage means plus the highest-k pass@k / pass@1[avg-of-k] entries."""
+        token_keys = ("mean/input_tokens", "mean/output_tokens")
+        key: Dict[str, Any] = {name: agent_metrics[name] for name in token_keys if name in agent_metrics}
+        key.update(highest_k_metrics(agent_metrics, "pass@1[avg-of-{k}]"))
+        key.update(highest_k_metrics(agent_metrics, "pass@{k}"))
+        return key
+
+
+_FINAL_ANSWER_RE = re.compile(r"Final Answer:\s*\[(.*)\]")
+
+
+def _parse_node_list(response: str) -> Tuple[List[str], bool]:
+    """Extract the node list from a trailing ``Final Answer: [n1, n2, ...]`` line.
+
+    Only the last non-blank line of ``response`` is inspected. The result is
+    ``(nodes, parse_failed)``: ``parse_failed`` is True when the response is
+    empty/None or that line lacks the pattern, while ``([], False)`` means
+    the model explicitly answered with an empty list.
+
+    Reference: https://huggingface.co/datasets/openai/graphwalks
+    """
+    last_line = next(
+        (ln for ln in reversed((response or "").strip().split("\n")) if ln.strip()),
+        None,
+    )
+    found = _FINAL_ANSWER_RE.search(last_line) if last_line is not None else None
+    if found is None:
+        return [], True
+
+    # Blank entries (",," or a trailing comma) are dropped, so "[]" yields [].
+    stripped = (piece.strip() for piece in found.group(1).split(","))
+    return [piece for piece in stripped if piece], False
+
+
+def _f1_score(predicted: set, expected: set, parse_failed: bool) -> float:
+    """Return the F1 score of ``predicted`` against ``expected``.
+
+    A parse failure always scores 0.0. Two empty sets score 1.0 (the model
+    correctly returned nothing); exactly one empty set, or no overlap,
+    scores 0.0. Otherwise the result is ``2·P·R / (P + R)``.
+    """
+    if parse_failed:
+        return 0.0
+    if not (predicted or expected):
+        return 1.0
+    if not (predicted and expected):
+        return 0.0
+
+    hits = len(predicted & expected)
+    if not hits:
+        return 0.0
+    precision, recall = hits / len(predicted), hits / len(expected)
+    # Harmonic mean of precision and recall.
+    return 2 * precision * recall / (precision + recall)
+
+
+if __name__ == "__main__":
+    GraphWalksResourcesServer.run_webserver()
diff --git a/resources_servers/graphwalks/configs/graphwalks.yaml b/resources_servers/graphwalks/configs/graphwalks.yaml
new file mode 100644
index 000000000..6b5f6ed50
--- /dev/null
+++ b/resources_servers/graphwalks/configs/graphwalks.yaml
@@ -0,0 +1,25 @@
+graphwalks_resources_server:
+  resources_servers:
+    graphwalks:
+      entrypoint: app.py
+      domain: other
+      verified: false
+      description: Long-context graph-walks (BFS / parents) with F1-over-node-sets grading from openai/graphwalks
+      value: Improve long-context multi-step graph reasoning and adjacency-list traversal
+
+graphwalks_simple_agent:
+  responses_api_agents:
+    simple_agent:
+      entrypoint: app.py
+      resources_server:
+        type: resources_servers
+        name: graphwalks_resources_server
+      model_server:
+        type: responses_api_models
+        name: policy_model
+      datasets:
+      - name: example
+        type: example
+        jsonl_fpath: resources_servers/graphwalks/data/example.jsonl
+        num_repeats: 1
+        license: MIT
diff --git a/resources_servers/graphwalks/data/.gitignore b/resources_servers/graphwalks/data/.gitignore
new file mode 100644
index 000000000..8d24a9b19
--- /dev/null
+++ b/resources_servers/graphwalks/data/.gitignore
@@ -0,0 +1,6 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
+*benchmark.jsonl
diff --git a/resources_servers/graphwalks/data/example.jsonl b/resources_servers/graphwalks/data/example.jsonl
new file mode 100644
index 000000000..45f92fd58
--- /dev/null
+++ b/resources_servers/graphwalks/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3\nnode_2 -> node_3, node_4\nnode_3 -> node_5\nnode_4 -> node_5\n\nOperation: Find the parents of node node_3."}]}, "expected_answer": "[\"node_1\", \"node_2\"]", "problem_type": "parents", "n_tokens": 120, "prompt_chars": 480, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nalpha -> beta, gamma\nbeta -> delta\ngamma -> delta, epsilon\ndelta -> zeta\nepsilon -> zeta\n\nOperation: Find the parents of node zeta."}]}, "expected_answer": "[\"delta\", \"epsilon\"]", "problem_type": "parents", "n_tokens": 110, "prompt_chars": 430, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nnode_0 -> node_1, node_2\nnode_1 -> node_3, node_4\nnode_2 -> node_5\nnode_3 -> node_6\nnode_4 -> node_6\nnode_5 -> node_7\n\nOperation: Perform a BFS from node node_0 and return only the nodes at exactly depth 2 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"node_3\", \"node_4\", \"node_5\"]", "problem_type": "bfs", "n_tokens": 150, "prompt_chars": 560, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nroot -> a, b\na -> c, d\nb -> e\nc -> f\nd -> f, g\ne -> g\nf -> h\ng -> h\n\nOperation: Perform a BFS from node root and return only the nodes at exactly depth 3 (not nodes at intermediate depths)."}]}, "expected_answer": "[\"f\", \"g\"]", "problem_type": "bfs", "n_tokens": 145, "prompt_chars": 540, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
+{"responses_create_params": {"input": [{"role": "user", "content": "You will be given a graph as an adjacency list, an operation, and a node. Your answer must be the set of all reachable nodes for the operation. Return the answer in the last line of your response in the form 'Final Answer: [node1, node2, ...]'.\n\nAdjacency list:\nn0 -> n1\nn1 -> n2, n3\nn2 -> n4\nn3 -> n4, n5\nn4 -> n6\nn5 -> n6\nn6 -> n7\n\nOperation: Find the parents of node n4."}]}, "expected_answer": "[\"n2\", \"n3\"]", "problem_type": "parents", "n_tokens": 130, "prompt_chars": 470, "agent_ref": {"type": "responses_api_agents", "name": "graphwalks_simple_agent"}}
diff --git a/resources_servers/graphwalks/data/example_metrics.json b/resources_servers/graphwalks/data/example_metrics.json
new file mode 100644
index 000000000..8a0f7bdb5
--- /dev/null
+++ b/resources_servers/graphwalks/data/example_metrics.json
@@ -0,0 +1,60 @@
+{
+    "name": "example",
+    "type": "example",
+    "jsonl_fpath": "resources_servers/graphwalks/data/example.jsonl",
+    "num_repeats": 1,
+    "gitlab_identifier": null,
+    "huggingface_identifier": null,
+    "license": "MIT",
+    "Number of examples": 5,
+    "Number of tools": {
+        "Total # non-null values": 0,
+        "Average": 0.0,
+        "Min": 0.0,
+        "Max": 0.0,
+        "Standard deviation": 0.0
+    },
+    "Json-dumped number of words (proxy for token count)": {
+        "Total # non-null values": 5,
+        "Average": 77.2,
+        "Min": 69.0,
+        "Max": 90.0,
+        "Standard deviation": 9.71
+    },
+    "Number of turns": {
+        "Total # non-null values": 5,
+        "Average": 1.0,
+        "Min": 1.0,
+        "Max": 1.0,
+        "Standard deviation": 0.0
+    },
+    "Temperature": {
+        "Total # non-null values": 0,
+        "Average": 0.0,
+        "Min": 0.0,
+        "Max": 0.0,
+        "Standard deviation": 0.0
+    },
+    "expected_answer": {
+        "unique_count": 5,
+        "total_count": 5
+    },
+    "problem_type": {
+        "unique_count": 2,
+        "total_count": 5
+    },
+    "n_tokens": {
+        "Total # non-null values": 5,
+        "Average": 131.0,
+        "Min": 110.0,
+        "Max": 150.0,
+        "Standard deviation": 16.73
+    },
+    "prompt_chars": {
+        "Total # non-null values": 5,
+        "Average": 496.0,
+        "Min": 430.0,
+        "Max": 560.0,
+        "Standard deviation": 53.2
+    }
+}
\ No newline at end of file
diff --git a/resources_servers/graphwalks/requirements.txt b/resources_servers/graphwalks/requirements.txt
new file mode 100644
index 000000000..151b4ab7b
--- /dev/null
+++ b/resources_servers/graphwalks/requirements.txt
@@ -0,0 +1,2 @@
+-e nemo-gym[dev] @ ../../
+tiktoken
diff --git a/resources_servers/graphwalks/tests/__init__.py b/resources_servers/graphwalks/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/resources_servers/graphwalks/tests/test_app.py b/resources_servers/graphwalks/tests/test_app.py
new file mode 100644
index 000000000..6c4cd9a86
--- /dev/null
+++ b/resources_servers/graphwalks/tests/test_app.py
@@ -0,0 +1,200 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from unittest.mock import MagicMock
+
+import pytest
+
+from nemo_gym.server_utils import ServerClient
+from resources_servers.graphwalks.app import (
+    GraphWalksResourcesServer,
+    GraphWalksResourcesServerConfig,
+    _f1_score,
+    _parse_node_list,
+)
+
+
+class TestSanity:
+    """Smoke test: the server can be constructed from a minimal config."""
+
+    def test_sanity(self) -> None:
+        # Construction alone is the check; a MagicMock stands in for the ServerClient.
+        GraphWalksResourcesServer(
+            config=GraphWalksResourcesServerConfig(host="0.0.0.0", port=8080, entrypoint="", name=""),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+
+class TestParseNodeList:
+    """Behaviour of ``_parse_node_list`` on the trailing ``Final Answer:`` line.
+
+    Reference: https://huggingface.co/datasets/openai/graphwalks
+    """
+
+    def test_parses_well_formed_list(self) -> None:
+        parsed, parse_failed = _parse_node_list("...\nFinal Answer: [node_1, node_2, node_3]")
+        assert parse_failed is False
+        assert parsed == ["node_1", "node_2", "node_3"]
+
+    def test_empty_list_is_valid(self) -> None:
+        # `Final Answer: []` is a legitimate "no nodes" answer rather than a parse failure.
+        parsed, parse_failed = _parse_node_list("Final Answer: []")
+        assert parse_failed is False
+        assert parsed == []
+
+    def test_only_uses_last_line(self) -> None:
+        # An earlier, well-formed answer line must be ignored in favour of the final one.
+        parsed, parse_failed = _parse_node_list("Final Answer: [decoy]\nbecause I said so\nFinal Answer: [real]")
+        assert parse_failed is False
+        assert parsed == ["real"]
+
+    def test_skips_trailing_blank_lines(self) -> None:
+        parsed, parse_failed = _parse_node_list("Final Answer: [a, b]\n\n   \n")
+        assert parse_failed is False
+        assert parsed == ["a", "b"]
+
+    def test_missing_format_fails(self) -> None:
+        parsed, parse_failed = _parse_node_list("The answer is node_42.")
+        assert parse_failed is True
+        assert parsed == []
+
+    def test_blank_response_fails(self) -> None:
+        parsed, parse_failed = _parse_node_list("")
+        assert parse_failed is True
+        assert parsed == []
+
+    def test_strips_whitespace_inside_list(self) -> None:
+        parsed, parse_failed = _parse_node_list("Final Answer: [  a  ,b ,   c]")
+        assert parse_failed is False
+        assert parsed == ["a", "b", "c"]
+
+    def test_drops_empty_items(self) -> None:
+        # Trailing commas / double commas should not produce empty entries.
+        parsed, parse_failed = _parse_node_list("Final Answer: [a,, b,]")
+        assert parse_failed is False
+        assert parsed == ["a", "b"]
+
+
+class TestF1Score:  # edge cases and arithmetic of _f1_score
+    def test_parse_failed_is_zero(self) -> None:
+        assert _f1_score(predicted={"a"}, expected={"a"}, parse_failed=True) == 0.0
+
+    def test_both_empty_is_one(self) -> None:
+        assert _f1_score(predicted=set(), expected=set(), parse_failed=False) == 1.0
+
+    def test_predicted_empty_expected_nonempty(self) -> None:
+        assert _f1_score(predicted=set(), expected={"a"}, parse_failed=False) == 0.0
+
+    def test_predicted_nonempty_expected_empty(self) -> None:
+        assert _f1_score(predicted={"a"}, expected=set(), parse_failed=False) == 0.0
+
+    def test_exact_match_is_one(self) -> None:
+        assert _f1_score(predicted={"a", "b"}, expected={"a", "b"}, parse_failed=False) == 1.0
+
+    def test_no_overlap_is_zero(self) -> None:
+        assert _f1_score(predicted={"a"}, expected={"b"}, parse_failed=False) == 0.0
+
+    def test_partial_overlap(self) -> None:
+        score = _f1_score(predicted={"a", "b"}, expected={"a", "c"}, parse_failed=False)
+        assert math.isclose(score, 0.5)  # P=1/2, R=1/2 → F1=0.5
+
+    def test_unequal_sizes(self) -> None:
+        score = _f1_score(predicted={"a", "b", "c"}, expected={"a"}, parse_failed=False)
+        assert math.isclose(score, 0.5)  # P=1/3, R=1 → F1=0.5
+
+
+class TestScoreFn:  # _score_fn must report a rollout's reward under the "accuracy" key
+    def test_score_fn_returns_accuracy_equals_reward(self) -> None:
+        assert GraphWalksResourcesServer._score_fn(dict(reward=0.73)) == dict(accuracy=0.73)
+
+    def test_score_fn_handles_zero(self) -> None:
+        assert GraphWalksResourcesServer._score_fn(dict(reward=0.0)) == dict(accuracy=0.0)
+
+    def test_score_fn_handles_one(self) -> None:
+        assert GraphWalksResourcesServer._score_fn(dict(reward=1.0)) == dict(accuracy=1.0)
+
+
+class TestComputeMetrics:
+    """Aggregate-metric keys produced by ``compute_metrics``."""
+
+    @pytest.fixture
+    def server(self) -> GraphWalksResourcesServer:
+        return GraphWalksResourcesServer(
+            config=GraphWalksResourcesServerConfig(host="0.0.0.0", port=8080, entrypoint="", name=""),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+    def test_compute_metrics_empty(self, server: GraphWalksResourcesServer) -> None:
+        # With no tasks the result is an empty dict.
+        assert server.compute_metrics([]) == {}
+
+    def test_compute_metrics_includes_pass_at_k(self, server: GraphWalksResourcesServer) -> None:
+        # Two tasks with two rollouts each, so keys for k=1 and k=2 are expected.
+        tasks = [
+            [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}],
+            [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}],
+        ]
+        metrics = server.compute_metrics(tasks)
+        for expected_key in ("pass@1/accuracy", "pass@2/accuracy", "pass@1[avg-of-2]/accuracy"):
+            assert expected_key in metrics
+
+    def test_compute_metrics_includes_subset_breakdown(self, server: GraphWalksResourcesServer) -> None:
+        """Per-problem-type subsets must surface as `problem_type=<value>/...` keys."""
+        tasks = [
+            [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}],
+            [{"reward": 0.8, "problem_type": "bfs"}, {"reward": 0.6, "problem_type": "bfs"}],
+        ]
+        keys = list(server.compute_metrics(tasks))
+        for subset in ("parents", "bfs"):
+            assert any(k.startswith(f"problem_type={subset}/pass@") for k in keys)
+        # Bare "<value>/..." keys must NOT leak through from compute_subset_metrics.
+        assert not any(k.startswith(("parents/", "bfs/")) for k in keys)
+
+    def test_compute_metrics_no_majority(self, server: GraphWalksResourcesServer) -> None:
+        """No majority@k keys are expected, since F1 provides no discrete answer_key."""
+        rollouts = [{"reward": 1.0, "problem_type": "parents"}, {"reward": 0.5, "problem_type": "parents"}]
+        metrics = server.compute_metrics([rollouts])
+        assert not any(k.startswith("majority@") for k in metrics)
+
+
+class TestGetKeyMetrics:
+    """``get_key_metrics`` keeps token means and only the highest-k pass metrics."""
+
+    @pytest.fixture
+    def server(self) -> GraphWalksResourcesServer:
+        return GraphWalksResourcesServer(
+            config=GraphWalksResourcesServerConfig(host="0.0.0.0", port=8080, entrypoint="", name=""),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+    def test_get_key_metrics_picks_highest_k(self, server: GraphWalksResourcesServer) -> None:
+        agent_metrics = {
+            "pass@1/accuracy": 50.0,
+            "pass@2/accuracy": 70.0,
+            "pass@4/accuracy": 80.0,
+            "pass@1[avg-of-4]/accuracy": 60.0,
+            "mean/input_tokens": 1000,
+            "mean/output_tokens": 200,
+        }
+        key = server.get_key_metrics(agent_metrics)
+        # Entries that must be carried over with their original values.
+        kept = ("pass@4/accuracy", "pass@1[avg-of-4]/accuracy", "mean/input_tokens", "mean/output_tokens")
+        for name in kept:
+            assert key[name] == agent_metrics[name]
+
+        # Lower-k entries should not be in the key set
+        for name in ("pass@1/accuracy", "pass@2/accuracy"):
+            assert name not in key