diff --git a/nemo_retriever/src/nemo_retriever/llm/clients/judge.py b/nemo_retriever/src/nemo_retriever/llm/clients/judge.py
index 97b01c5dc..61198a0e2 100644
--- a/nemo_retriever/src/nemo_retriever/llm/clients/judge.py
+++ b/nemo_retriever/src/nemo_retriever/llm/clients/judge.py
@@ -153,6 +153,8 @@ def judge(self, query: str, reference: str, candidate: str) -> JudgeResult:
 def _parse_judge_response(raw: str) -> JudgeResult:
     """Parse the judge's JSON response into a JudgeResult."""
     text = raw.strip()
+    # Reasoning models can emit a <think>...</think> block before the final JSON.
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
 
     text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
     text = re.sub(r"\s*```$", "", text, flags=re.MULTILINE)
diff --git a/nemo_retriever/src/nemo_retriever/retriever_graph_utils.py b/nemo_retriever/src/nemo_retriever/retriever_graph_utils.py
index a5b6857d7..89ae103fc 100644
--- a/nemo_retriever/src/nemo_retriever/retriever_graph_utils.py
+++ b/nemo_retriever/src/nemo_retriever/retriever_graph_utils.py
@@ -22,11 +22,20 @@ def hits_lists_to_rerank_dataframe(
     query_texts: list[str],
     hits_per_query: list[list[dict[str, Any]]],
 ) -> pd.DataFrame:
-    """One row per (query, hit) with payload to rebuild hits after reranking."""
+    """One row per (query, hit) with payload to rebuild hits after reranking.
+
+    Returns a DataFrame with the columns ``query``, ``text``, ``_hit`` even when
+    there are no hits — ``pd.DataFrame([])`` yields a column-less DataFrame,
+    which crashes the downstream rerank actor with ``KeyError: 'query'`` on a
+    legitimate empty-results path (empty/unmatched corpus, freshly-ingested
+    table, etc.).
+    """
     rows: list[dict[str, Any]] = []
     for q, hits in zip(query_texts, hits_per_query):
         for h in hits:
             rows.append({"query": q, "text": str(h.get("text", "")), "_hit": dict(h)})
+    if not rows:
+        return pd.DataFrame(columns=["query", "text", "_hit"])
     return pd.DataFrame(rows)
 
 
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
index 8db8f323e..e86d3ebd4 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/cli.py
@@ -2,14 +2,16 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""`retriever skill-eval run` benchmark."""
+"""`retriever skill-eval` benchmark."""
 
 from __future__ import annotations
 
+import json
 import logging
 import os
 import shutil
 from collections import defaultdict
+from dataclasses import asdict, fields
 from pathlib import Path
 from typing import Any, Optional
 
@@ -22,14 +24,21 @@
 from nemo_retriever.skill_eval.report import overall_recall, write_summary
 from nemo_retriever.skill_eval.runner import (
     CONDITIONS,
+    DEFAULT_AGENT_MODELS,
+    SUPPORTED_AGENTS,
+    UNSCORABLE_JUDGE_ERRORS,
+    TrialResult,
+    _apply_judge,
+    archive_session_log,
     cleanup_condition_workdir,
+    extract_compact_trace,
     run_condition,
     save_trial,
 )
 
 DEFAULT_ORDER = ("c1_base", "c2_retriever", "c3_retriever_skill")
 
-app = typer.Typer(help="Benchmark Claude with vs. without the /nemo-retriever skill on a folder of PDFs.")
+app = typer.Typer(help="Benchmark coding agents with vs. without the /nemo-retriever skill on a folder of PDFs.")
 logger = logging.getLogger(__name__)
 
 
@@ -85,6 +94,61 @@ def _build_judge(cfg: dict) -> Optional[Any]:
     return judge
 
 
+def _build_trace_summarizer(cfg: dict) -> Optional[Any]:
+    """Build a ``TraceSummarizer`` from ``cfg['summarizer']``; ``None`` if disabled or `claude` is missing."""
+    settings = cfg.get("summarizer") or {}
+    disabled_reason = None
+    if not settings.get("enabled", True):
+        disabled_reason = "Trace summarizer disabled by config (summarizer.enabled=false)."
+    elif shutil.which("claude") is None:
+        disabled_reason = "Trace summarizer disabled: `claude` CLI is not on PATH."
+    if disabled_reason is not None:
+        typer.echo(disabled_reason)
+        return None
+    from nemo_retriever.skill_eval.trace_summarizer import TraceSummarizer  # imported only once enabled
+
+    summarizer = TraceSummarizer.from_kwargs(model=str(settings.get("model", "claude-opus-4-7")))
+    typer.echo(f"Trace summarizer enabled: model={summarizer.model}")
+    return summarizer
+
+
+def _resolve_agent(value: str) -> str:
+    """Normalize an agent name (strip + lower-case); raise ``BadParameter`` if not in SUPPORTED_AGENTS."""
+    if (normalized := value.strip().lower()) in SUPPORTED_AGENTS:
+        return normalized
+    raise typer.BadParameter(f"agent must be one of {', '.join(SUPPORTED_AGENTS)}")
+
+
+def _resolve_agent_model(cfg: dict, agent: str, override: Optional[str]) -> str:
+    """Resolve the model: ``override``, then cfg ``agent_model``, then ``agent_models[agent]``, then default."""
+    if override:
+        return override
+    if cfg.get("agent_model"):  # documented override; also the resolved value `run` saves to config.yaml
+        return str(cfg["agent_model"])
+    models = cfg.get("agent_models")
+    per_agent = models.get(agent) if isinstance(models, dict) else None
+    return str(per_agent) if per_agent else DEFAULT_AGENT_MODELS[agent]
+
+
+def _resolve_conditions(value: Optional[str], cfg: dict) -> list[str]:
+    """Pick the conditions to run: the CLI value wins, then ``cfg['conditions']``, then DEFAULT_ORDER."""
+    raw: Any = value if value is not None else (cfg.get("conditions") or list(DEFAULT_ORDER))
+    if isinstance(raw, str):
+        raw = raw.split(",")
+    elif not isinstance(raw, list):
+        # Only a config value can land here; the CLI value is a string per its annotation.
+        raise typer.BadParameter("config 'conditions' must be a list or comma-separated string")
+    # Stringify and trim every item, dropping the ones that end up empty.
+    selected = [name for name in (str(item).strip() for item in raw) if name]
+    if not selected:
+        raise typer.BadParameter("at least one condition must be selected")
+    # Report the first unrecognized name, in the order the caller gave them.
+    unknown = next((name for name in selected if name not in CONDITIONS), None)
+    if unknown is not None:
+        raise typer.BadParameter(f"unknown condition '{unknown}'. Choose from {CONDITIONS}.")
+    return selected
+
+
 def _resolve_domain_label(entries: list[DatasetEntry], cfg: dict, domain: str) -> str:
     """Pick a human-readable label for the setup prompt.
 
@@ -112,12 +176,12 @@ def run_command(
         "--eval-manifest",
         help="Path to an agent-eval manifest (JSON list). Overrides config.eval_manifest_path.",
     ),
-    conditions: str = typer.Option(
-        ",".join(DEFAULT_ORDER),
+    conditions: Optional[str] = typer.Option(
+        None,
         "--conditions",
         help=(
-            "Comma-separated conditions in execution order. Each (condition, domain) workdir is deleted after it runs, "
-            "so only one LanceDB is on disk at a time."
+            "Comma-separated conditions in execution order. Defaults to config.conditions, then "
+            f"{','.join(DEFAULT_ORDER)}. Each (agent, condition, domain) workdir is deleted after it runs."
         ),
     ),
     domains: Optional[str] = typer.Option(
@@ -128,19 +192,26 @@ def run_command(
     artifacts_root: Optional[Path] = typer.Option(
         None, "--artifacts-root", help="Override the artifact root; defaults to <repo>/nemo_retriever/artifacts/"
     ),
+    agent_name: Optional[str] = typer.Option(
+        None,
+        "--agent",
+        help="Agent CLI to evaluate: claude or codex. Overrides config.agent.",
+    ),
+    model_override: Optional[str] = typer.Option(
+        None,
+        "--model",
+        help="Agent model override for this run.",
+    ),
 ) -> None:
-    """Run the benchmark across the dataset's domains × selected conditions, sequentially."""
+    """Run the benchmark across the dataset's domains x selected conditions."""
     logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
-    if shutil.which("claude") is None:
-        typer.echo("Error: `claude` CLI is not on PATH; install Claude Code first.", err=True)
-        raise typer.Exit(code=2)
 
     cfg = load_config(config)
-    selected = [c.strip() for c in conditions.split(",") if c.strip()]
-    for c in selected:
-        if c not in CONDITIONS:
-            typer.echo(f"Error: unknown condition '{c}'. Choose from {CONDITIONS}.", err=True)
-            raise typer.Exit(code=2)
+    agent = _resolve_agent(str(agent_name or cfg.get("agent") or "claude"))
+    if shutil.which(agent) is None:
+        typer.echo(f"Error: `{agent}` CLI is not on PATH.", err=True)
+        raise typer.Exit(code=2)
+    selected = _resolve_conditions(conditions, cfg)
 
     manifest_path = eval_manifest or cfg.get("eval_manifest_path")
     if not manifest_path:
@@ -170,9 +241,13 @@ def run_command(
     skill_source = Path(
         str(cfg.get("skill_source_dir") or REPO_ROOT / ".claude" / "skills" / "nemo-retriever")
     ).expanduser()
+    if any(c in ("c2_retriever", "c3_retriever_skill") for c in selected) and not (skill_source / "SKILL.md").is_file():
+        typer.echo(f"Error: skill source '{skill_source}' does not contain SKILL.md.", err=True)
+        raise typer.Exit(code=2)
+
     workdir_root = Path(str(cfg.get("per_trial_workdir_root", "/tmp/skill_eval"))).expanduser()
     workdir_root.mkdir(parents=True, exist_ok=True)
-    model = str(cfg.get("agent_model", "claude-opus-4-7"))
+    model = _resolve_agent_model(cfg, agent, model_override)
     budget = float(cfg.get("per_trial_budget_usd", 5.0))
     timeout = int(cfg.get("per_trial_timeout_s", 600))
     testdata_prefixes_raw = cfg.get("testdata_prefixes") or []
@@ -182,15 +257,21 @@ def run_command(
     testdata_prefixes = tuple(str(p) for p in testdata_prefixes_raw)
 
     judge = _build_judge(cfg)
+    summarizer = _build_trace_summarizer(cfg)
 
     base_dir = str(artifacts_root) if artifacts_root else None
     session_dir = create_session_dir("skilleval", base_dir=base_dir)
     typer.echo(f"Session dir: {session_dir}")
+    typer.echo(f"Agent: {agent}  model={model}  conditions={selected}")
 
-    (session_dir / "config.yaml").write_text(yaml.safe_dump(cfg, default_flow_style=False), encoding="utf-8")
+    resolved_cfg = dict(cfg)
+    resolved_cfg["agent"] = agent
+    resolved_cfg["agent_model"] = model
+    resolved_cfg["conditions"] = selected
+    (session_dir / "config.yaml").write_text(yaml.safe_dump(resolved_cfg, default_flow_style=False), encoding="utf-8")
 
-    # Results are keyed (condition, domain) so the report can break out per-domain numbers.
-    results_by_key: dict[tuple[str, str], list] = {}
+    # Results are keyed (agent, condition, domain) so reports can compare agent runs.
+    results_by_key: dict[tuple[str, str, str], list[TrialResult]] = {}
     for cond in selected:
         for domain in domain_order:
             domain_entries = by_domain[domain]
@@ -204,10 +285,11 @@ def run_command(
                 raise typer.Exit(code=2)
             domain_label = _resolve_domain_label(domain_entries, cfg, domain)
             typer.echo(
-                f"Starting session for {cond}/{domain} — setup + {len(domain_entries)} query turns "
+                f"Starting {agent} session for {cond}/{domain} - setup + {len(domain_entries)} query turns "
                 f"(pdfs={pdf_source})"
             )
             workdir, results = run_condition(
+                agent=agent,
                 condition=cond,
                 entries=domain_entries,
                 workdir_root=workdir_root,
@@ -221,28 +303,57 @@ def run_command(
                 judge=judge,
                 testdata_prefixes=testdata_prefixes,
             )
+            if summarizer is not None and results:
+                trace = extract_compact_trace(agent, workdir, results[0].session_id)
+                if trace:
+                    narrative = summarizer.summarize(condition=f"{agent}/{cond}", domain=domain, trace=trace)
+                    if narrative:
+                        for r in results:
+                            if r.is_setup:
+                                r.tool_use_summary = narrative
+                                break
+                        typer.echo(f"  tool-use summary: {len(narrative)} chars")
+                    else:
+                        typer.echo("  tool-use summary: (summarizer returned empty)")
+                else:
+                    typer.echo("  tool-use summary skipped: session JSONL unavailable")
             for r in results:
                 save_trial(r, session_dir)
                 kind = "setup" if r.is_setup else f"entry_id={r.entry_id} query_id={r.query_id}"
                 judge_str = "" if r.is_setup or r.judge_score is None else f" judge={r.judge_score}"
+                cost_str = f"${r.total_cost_usd:.3f}" if r.cost_available else "n/a"
                 typer.echo(
-                    f"  turn {r.num_turns} [{domain}] {kind}: status={r.status} "
+                    f"  turn {r.num_turns} [{agent}/{domain}] {kind}: status={r.status} "
                     f"tokens(in/out/cache_r)={r.input_tokens}/{r.output_tokens}/{r.cache_read_input_tokens} "
-                    f"cost=${r.total_cost_usd:.3f} retrieved={len(r.ranked_retrieved)}{judge_str}"
+                    f"cost={cost_str} retrieved={len(r.ranked_retrieved)}{judge_str}"
                 )
-            results_by_key[(cond, domain)] = results
+            results_by_key[(agent, cond, domain)] = results
 
             entries_by_id = {e.entry_id: e for e in domain_entries}
             scores = overall_recall(results, entries_by_id)
             typer.echo(
-                f"\nRecall for {cond}/{domain}: "
+                f"\nRecall for {agent}/{cond}/{domain}: "
                 f"recall@1={scores['recall_1']:.3f}  "
                 f"recall@5={scores['recall_5']:.3f}  "
                 f"recall@10={scores['recall_10']:.3f}"
             )
 
+            if results:
+                archived = archive_session_log(
+                    session_dir=session_dir,
+                    agent=agent,
+                    condition=cond,
+                    domain=domain,
+                    session_uuid=results[0].session_id,
+                    workdir=workdir,
+                )
+                if archived is not None:
+                    typer.echo(f"  archived session log: {archived.relative_to(session_dir)}")
+                else:
+                    typer.echo(f"  session log not found for archiving ({agent}/{cond}/{domain})")
+
             cleanup_condition_workdir(workdir)
-            typer.echo(f"Cleaned up workdir for {cond}/{domain}\n")
+            typer.echo(f"Cleaned up workdir for {agent}/{cond}/{domain}\n")
 
     if judge is not None:
         typer.echo("\nLLM-as-judge scores (mean over query turns, 0-5 scale):")
@@ -250,7 +361,7 @@ def run_command(
             scored: list[int] = []
             errored = 0
             for domain in domain_order:
-                for r in results_by_key.get((cond, domain), []):
+                for r in results_by_key.get((agent, cond, domain), []):
                     if r.is_setup:
                         continue
                     if r.judge_score is not None:
@@ -259,17 +370,193 @@ def run_command(
                         errored += 1
             if scored:
                 mean_score = sum(scored) / len(scored)
-                typer.echo(f"  {cond}: mean={mean_score:.2f}  n={len(scored)}  errors={errored}")
+                typer.echo(f"  {agent}/{cond}: mean={mean_score:.2f}  n={len(scored)}  errors={errored}")
             else:
-                typer.echo(f"  {cond}: no scores  errors={errored} (check judge config / litellm install)")
+                typer.echo(f"  {agent}/{cond}: no scores  errors={errored} (check judge config / litellm install)")
 
     json_path, md_path = write_summary(
         session_dir=session_dir,
         results_by_key=results_by_key,
         entries=entries,
-        config=cfg,
+        config=resolved_cfg,
+        agent=agent,
+        model=model,
         config_path=str(config) if config else "<packaged default>",
     )
     typer.echo(f"\nWrote {json_path}")
     typer.echo(f"Wrote {md_path}")
     typer.echo("\nDone.")
+
+
+def _needs_rescore(trial: dict[str, Any]) -> bool:
+    """Return whether a query-turn trial needs fresh judge scoring.
+
+    Setup turns and trials whose ``judge_error`` is listed in
+    ``UNSCORABLE_JUDGE_ERRORS`` are never rescored; any other trial is
+    rescored when it has no ``judge_score`` or carries a ``judge_error``.
+    """
+    if trial.get("is_setup"):
+        return False
+    error = trial.get("judge_error") or ""
+    if error in UNSCORABLE_JUDGE_ERRORS:
+        return False
+    return trial.get("judge_score") is None or bool(error)
+
+
+def _load_trial(path: Path) -> tuple[dict[str, Any], TrialResult] | None:
+    """Load a trial JSON and reconstruct a ``TrialResult``.
+
+    Returns ``None`` (after echoing a warning to stderr) when the file is missing, truncated, not a JSON
+    object, or otherwise unparseable, so callers can skip corrupt trials without aborting the run.
+    """
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+        if not isinstance(data, dict):  # valid JSON that is not an object (e.g. [] or null)
+            raise TypeError(f"expected a JSON object, got {type(data).__name__}")
+        known = {f.name for f in fields(TrialResult)}
+        return data, TrialResult(**{k: v for k, v in data.items() if k in known})
+    except (OSError, ValueError, TypeError) as exc:
+        typer.echo(f"  {path.name}: skip (corrupt trial: {exc})", err=True)
+        return None
+
+
+def _iter_trial_files(session_dir: Path) -> list[Path]:
+    return sorted(session_dir.joinpath("trials").rglob("*.json"))  # sorted so every pass sees one order
+
+
+@app.command("rescore")
+def rescore_command(
+    session_dir: Path = typer.Argument(
+        ...,
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Artifact session directory from a previous `retriever skill-eval run`.",
+    ),
+    config: Optional[Path] = typer.Option(
+        None,
+        "--config",
+        help="Judge/manifest config to use. Defaults to the session's own config.yaml.",
+    ),
+    eval_manifest: Optional[Path] = typer.Option(
+        None,
+        "--eval-manifest",
+        help="Manifest path. Overrides eval_manifest_path from --config / session config.",
+    ),
+    force: bool = typer.Option(
+        False,
+        "--force",
+        help="Rescore every query-turn trial, not just the empty/failed ones.",
+    ),
+) -> None:
+    """Re-judge query-turn trials with missing or failed judge scores."""
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+
+    session_dir = session_dir.resolve()
+    trials_dir = session_dir / "trials"
+    if not trials_dir.is_dir():
+        typer.echo(f"Error: {trials_dir} does not exist - is this a skill_eval session dir?", err=True)
+        raise typer.Exit(code=2)
+
+    session_cfg_path = session_dir / "config.yaml"
+    if config is not None:  # an explicit --config wins over the session's saved config.yaml
+        cfg = load_config(config)
+        config_path_str = str(config)
+    elif session_cfg_path.is_file():
+        cfg = load_config(session_cfg_path)
+        config_path_str = str(session_cfg_path)
+    else:
+        typer.echo(
+            f"Error: no --config given and {session_cfg_path} is missing; cannot resolve judge settings.",
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    manifest_path = eval_manifest or cfg.get("eval_manifest_path")  # the CLI flag beats the config value
+    if not manifest_path:
+        typer.echo("Error: config is missing 'eval_manifest_path' and --eval-manifest was not provided.", err=True)
+        raise typer.Exit(code=2)
+    entries = load_eval_manifest(Path(str(manifest_path)).expanduser().resolve())
+    entries_by_id = {e.entry_id: e for e in entries}
+
+    judge = _build_judge(cfg)
+    if judge is None:  # unlike `run`, rescoring has nothing to do without a judge
+        typer.echo("Error: judge is not configured (see messages above). Cannot rescore.", err=True)
+        raise typer.Exit(code=2)
+
+    trial_files = _iter_trial_files(session_dir)
+    candidates = []  # paths of the query-turn trials selected for re-judging
+    for path in trial_files:  # pass 1: decide which files to rescore
+        loaded = _load_trial(path)
+        if loaded is None:
+            continue
+        data, _ = loaded
+        if data.get("is_setup"):  # setup turns are skipped even with --force
+            continue
+        if force or _needs_rescore(data):
+            candidates.append(path)
+
+    typer.echo(
+        f"Rescoring {len(candidates)} trial(s) out of {len(trial_files)} on disk "
+        f"(force={'on' if force else 'off'})."
+    )
+
+    rescored = 0
+    unscorable = 0
+    still_failed = 0
+    for path in candidates:  # pass 2: re-judge and rewrite each selected file in place
+        loaded = _load_trial(path)  # re-read the file; pass 1 kept only the path
+        if loaded is None:
+            continue
+        raw, result = loaded
+        entry = entries_by_id.get(result.entry_id)
+        if entry is None:
+            typer.echo(f"  {path.name}: skip (entry_id={result.entry_id} not in manifest)")
+            continue
+
+        result.judge_score = None  # clear the previous verdict before calling the judge again
+        result.judge_reasoning = ""
+        result.judge_error = ""
+
+        _apply_judge(judge, entry, result)
+
+        raw.update(asdict(result))  # overlay TrialResult fields; other keys stay as loaded
+        path.write_text(json.dumps(raw, indent=2) + "\n", encoding="utf-8")
+
+        if result.judge_score is not None:
+            rescored += 1
+            typer.echo(f"  {path.name}: entry_id={result.entry_id} judge={result.judge_score}")
+        elif result.judge_error in UNSCORABLE_JUDGE_ERRORS:
+            unscorable += 1
+            typer.echo(f"  {path.name}: entry_id={result.entry_id} unscorable ({result.judge_error})")
+        else:
+            still_failed += 1
+            typer.echo(
+                f"  {path.name}: entry_id={result.entry_id} still failed " f"(error={result.judge_error or 'unknown'})"
+            )
+
+    typer.echo(f"\nRescored {rescored}; unscorable {unscorable}; still failed {still_failed}.")
+
+    results_by_key: dict[tuple[str, str, str], list[TrialResult]] = defaultdict(list)
+    for path in trial_files:  # pass 3: reload every trial (setup included) for the summary
+        loaded = _load_trial(path)
+        if loaded is None:
+            continue
+        _, result = loaded
+        results_by_key[(result.agent, result.condition, result.domain)].append(result)
+
+    agent = str(cfg.get("agent") or next((r.agent for rows in results_by_key.values() for r in rows), "claude"))
+    model = _resolve_agent_model(cfg, agent, None)  # no CLI override is available in this command
+
+    json_path, md_path = write_summary(
+        session_dir=session_dir,
+        results_by_key=dict(results_by_key),  # plain dict copy of the defaultdict
+        entries=entries,
+        config=cfg,
+        agent=agent,
+        model=model,
+        config_path=config_path_str,
+    )
+    typer.echo(f"Wrote {json_path}")
+    typer.echo(f"Wrote {md_path}")
+    typer.echo("\nDone.")
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
index 6a17f9820..df5e68364 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/configs/skill_eval.yaml
@@ -54,9 +54,16 @@ pdf_dirs: {}
 testdata_prefixes: []
 
 # ---------------------------------------------------------------------------
-# Agent model and per-trial limits
+# Agent selection, model, and per-trial limits
 # ---------------------------------------------------------------------------
-agent_model: claude-opus-4-7
+# `agent` must be `claude` or `codex`. Override per run with --agent.
+agent: claude
+agent_models:
+  claude: claude-opus-4-7
+  codex: gpt-5.5
+# Back-compat fallback: if set, this overrides agent_models.<agent> unless
+# --model is passed. Leave unset for per-agent defaults.
+# agent_model: claude-opus-4-7
 per_trial_budget_usd: 5.0
 per_trial_timeout_s: 600
 per_trial_workdir_root: /tmp/skill_eval
@@ -92,3 +99,15 @@ judge:
   api_key_env: NVIDIA_API_KEY
   temperature: 0.1
   max_tokens: 4096
+
+# ---------------------------------------------------------------------------
+# Tool-use summarizer
+# ---------------------------------------------------------------------------
+# After each agent/domain/condition session, the harness reads the agent session
+# JSONL and asks Claude to narrate what tools were called and what strategy was
+# used. The result is stamped onto the setup-turn TrialResult and rendered in
+# session_summary.md. Shells out to `claude --print`, so it reuses Claude Code
+# auth and does not require a separate API key. Set enabled: false to skip it.
+summarizer:
+  enabled: true
+  model: claude-opus-4-7
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/report.py b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
index 5efa60d1d..2c6e00023 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/report.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/report.py
@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Aggregate per-trial results into a per-condition / per-domain session summary."""
+"""Aggregate per-trial results into a per-agent / per-condition / per-domain summary."""
 
 from __future__ import annotations
 
@@ -77,16 +77,18 @@ def _aggregate(
         metrics["output_tokens"] = mean(r.output_tokens for r in query_results)
         metrics["cache_read_input_tokens"] = mean(r.cache_read_input_tokens for r in query_results)
         metrics["cache_creation_input_tokens"] = mean(r.cache_creation_input_tokens for r in query_results)
-        metrics["total_cost_usd"] = mean(r.total_cost_usd for r in query_results)
+        costed = [r.total_cost_usd for r in query_results if r.cost_available]
+        metrics["total_cost_usd"] = mean(costed) if costed else None
         metrics["duration_ms"] = mean(r.duration_ms for r in query_results)
     # When aggregating across multiple sessions there may be more than one setup
-    # turn (one per domain); sum them so the "one-time cost" reflects the full run.
+    # turn (one per domain); sum them so the one-time cost reflects the full run.
     if setup_results:
         metrics["setup_input_tokens"] = sum(r.input_tokens for r in setup_results)
         metrics["setup_output_tokens"] = sum(r.output_tokens for r in setup_results)
         metrics["setup_cache_read_input_tokens"] = sum(r.cache_read_input_tokens for r in setup_results)
         metrics["setup_cache_creation_input_tokens"] = sum(r.cache_creation_input_tokens for r in setup_results)
-        metrics["setup_cost_usd"] = sum(r.total_cost_usd for r in setup_results)
+        setup_costed = [r.total_cost_usd for r in setup_results if r.cost_available]
+        metrics["setup_cost_usd"] = sum(setup_costed) if setup_costed else None
         metrics["setup_duration_ms"] = sum(r.duration_ms for r in setup_results)
         metrics["setup_status"] = (
             "ok" if all(r.status == "ok" for r in setup_results) else ",".join(r.status for r in setup_results)
@@ -95,7 +97,8 @@ def _aggregate(
     metrics["session_output_tokens"] = sum(r.output_tokens for r in results)
     metrics["session_cache_read_input_tokens"] = sum(r.cache_read_input_tokens for r in results)
     metrics["session_cache_creation_input_tokens"] = sum(r.cache_creation_input_tokens for r in results)
-    metrics["session_total_cost_usd"] = sum(r.total_cost_usd for r in results)
+    session_costed = [r.total_cost_usd for r in results if r.cost_available]
+    metrics["session_total_cost_usd"] = sum(session_costed) if session_costed else None
     metrics["num_query_turns"] = len(query_results)
     metrics["success_rate"] = sum(1 for r in results if r.status == "ok") / len(results)
     metrics["retriever_used_rate"] = sum(1 for r in results if r.retriever_used_ever) / len(results)
@@ -107,12 +110,15 @@ def _aggregate(
         metrics["judge_score_mean"] = sum(judge_scores) / len(judge_scores)
         metrics["judge_score_n"] = len(judge_scores)
 
+    tool_use_summary = next((r.tool_use_summary for r in setup_results if r.tool_use_summary), "")
+
     return {
         "run_name": run_name,
         "success": all(r.status == "ok" for r in results),
         "metrics": metrics,
-        "tags": [results[0].condition, *extra_tags, f"n_queries={len(query_results)}"],
+        "tags": [results[0].agent, results[0].condition, *extra_tags, f"n_queries={len(query_results)}"],
         "artifact_dir": artifact_dir,
+        "tool_use_summary": tool_use_summary,
     }
 
 
@@ -121,22 +127,28 @@ def aggregate_condition(results: Iterable[TrialResult], entries_by_id: dict[int,
     results_list = list(results)
     if not results_list:
         return {}
+    agent = getattr(results_list[0], "agent", "claude")
+    condition = results_list[0].condition
     return _aggregate(
         results_list,
         entries_by_id,
-        run_name=results_list[0].condition,
-        artifact_dir=str(Path("trials") / results_list[0].condition),
+        run_name=f"{agent}/{condition}",
+        artifact_dir=str(Path("trials") / agent / condition),
     )
 
 
+def _fmt_cost(value: Any) -> str:
+    return f"${float(value):.3f}" if value is not None else "n/a"
+
+
 def _md_row(row: dict[str, Any]) -> str:
     m = row.get("metrics", {})
-    judge_cell = f"{m['judge_score_mean']:.2f} (n={m.get('judge_score_n', 0)})" if "judge_score_mean" in m else "—"
+    judge_cell = f"{m['judge_score_mean']:.2f} (n={m.get('judge_score_n', 0)})" if "judge_score_mean" in m else "-"
     return (
-        "| {cond} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} "
-        "| {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cc:.0f} | ${cost:.3f} |"
+        "| {run} | {sr:.2f} | {retr:.2f} | {r1:.3f} | {r5:.3f} | {r10:.3f} | {judge} "
+        "| {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cc:.0f} | {cost} |"
     ).format(
-        cond=row.get("run_name", "?"),
+        run=row.get("run_name", "?"),
         sr=m.get("success_rate", 0.0),
         retr=m.get("retriever_used_rate", 0.0),
         r1=m.get("recall_1", 0.0),
@@ -147,12 +159,12 @@ def _md_row(row: dict[str, Any]) -> str:
         opt=m.get("output_tokens", 0.0),
         cr=m.get("cache_read_input_tokens", 0.0),
         cc=m.get("cache_creation_input_tokens", 0.0),
-        cost=m.get("total_cost_usd", 0.0),
+        cost=_fmt_cost(m.get("total_cost_usd")),
     )
 
 
 _MAIN_TABLE_HEADER = (
-    "| condition | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | q_input | q_output "
+    "| run | success_rate | retr_used | recall@1 | recall@5 | recall@10 | judge | q_input | q_output "
     "| q_cache_read | q_cache_create | q_cost |"
 )
 _MAIN_TABLE_DIVIDER = "|---|---|---|---|---|---|---|---|---|---|---|---|"
@@ -163,16 +175,19 @@ def write_summary_md(
     rows_by_domain: dict[str, list[dict[str, Any]]],
     overall_rows: list[dict[str, Any]],
     config: dict[str, Any],
+    agent: str,
+    model: str,
 ) -> Path:
     lines = [
-        f"# skill_eval session summary — `{session_dir.name}`",
+        f"# skill_eval session summary - `{session_dir.name}`",
         "",
-        f"- Agent model: `{config.get('agent_model', '?')}`",
+        f"- Agent: `{agent}`",
+        f"- Agent model: `{model}`",
         f"- Per-trial budget: ${config.get('per_trial_budget_usd', '?')}",
         f"- Per-trial timeout: {config.get('per_trial_timeout_s', '?')}s",
         "",
         "_Agent-session tokens only. Pipeline-side LLM calls (embeddings, VLM, etc.) are not instrumented._",
-        "_Each (condition, domain) is one Claude session: turn 1 = setup, turns 2..N = query turns._",
+        "_Each (agent, condition, domain) is one agent session: turn 1 = setup, turns 2..N = query turns._",
         "",
         "## Overall (averaged across all queries in this run)",
         "",
@@ -198,21 +213,21 @@ def write_summary_md(
 
     lines += [
         "",
-        "## Setup turns (one-time cost per condition, summed across domains)",
+        "## Setup turns (one-time cost per run, summed across domains)",
         "",
-        "| condition | status | setup_input | setup_output | setup_cache_read | setup_cost | setup_ms |",
+        "| run | status | setup_input | setup_output | setup_cache_read | setup_cost | setup_ms |",
         "|---|---|---|---|---|---|---|",
     ]
     for row in overall_rows:
         m = row.get("metrics", {})
         lines.append(
-            "| {cond} | {st} | {ipt:.0f} | {opt:.0f} | {cr:.0f} | ${cost:.3f} | {ms:.0f} |".format(
-                cond=row.get("run_name", "?"),
+            "| {run} | {st} | {ipt:.0f} | {opt:.0f} | {cr:.0f} | {cost} | {ms:.0f} |".format(
+                run=row.get("run_name", "?"),
                 st=m.get("setup_status", "?"),
                 ipt=m.get("setup_input_tokens", 0),
                 opt=m.get("setup_output_tokens", 0),
                 cr=m.get("setup_cache_read_input_tokens", 0),
-                cost=m.get("setup_cost_usd", 0.0),
+                cost=_fmt_cost(m.get("setup_cost_usd")),
                 ms=m.get("setup_duration_ms", 0),
             )
         )
@@ -221,20 +236,20 @@ def write_summary_md(
         "",
         "## Session totals (setup + all query turns)",
         "",
-        "| condition | query_turns | total_input | total_output | total_cache_read | total_cache_create | total_cost |",
+        "| run | query_turns | total_input | total_output | total_cache_read | total_cache_create | total_cost |",
         "|---|---|---|---|---|---|---|",
     ]
     for row in overall_rows:
         m = row.get("metrics", {})
         lines.append(
-            "| {cond} | {n} | {ipt} | {opt} | {cr} | {cc} | ${cost:.3f} |".format(
-                cond=row.get("run_name", "?"),
+            "| {run} | {n} | {ipt} | {opt} | {cr} | {cc} | {cost} |".format(
+                run=row.get("run_name", "?"),
                 n=m.get("num_query_turns", 0),
                 ipt=m.get("session_input_tokens", 0),
                 opt=m.get("session_output_tokens", 0),
                 cr=m.get("session_cache_read_input_tokens", 0),
                 cc=m.get("session_cache_creation_input_tokens", 0),
-                cost=m.get("session_total_cost_usd", 0.0),
+                cost=_fmt_cost(m.get("session_total_cost_usd")),
             )
         )
 
@@ -248,53 +263,84 @@ def write_summary_md(
         lines.append("## Diagnostics")
         lines.extend(diag_lines)
 
+    summary_blocks: list[tuple[str, str]] = []
+    for domain in sorted(rows_by_domain):
+        for row in rows_by_domain[domain]:
+            text = row.get("tool_use_summary") or ""
+            if text:
+                summary_blocks.append((str(row.get("run_name", "?")), text))
+    if summary_blocks:
+        lines += ["", "## Tool-use summaries", ""]
+        for run_name, text in summary_blocks:
+            lines.append(f"### {run_name}")
+            lines.append("")
+            lines.append(text)
+            lines.append("")
+
     out = session_dir / "session_summary.md"
     out.write_text("\n".join(lines) + "\n", encoding="utf-8")
     return out
 
 
+def _condition_order(condition: str) -> int:
+    # Known conditions sort by their declared position; anything else sorts last.
+    if condition in CONDITIONS:
+        return CONDITIONS.index(condition)
+    return len(CONDITIONS)
+
+
 def write_summary(
     session_dir: Path,
-    results_by_key: dict[tuple[str, str], list[TrialResult]],
+    results_by_key: dict[tuple[str, str, str], list[TrialResult]],
     entries: list[DatasetEntry],
     config: dict[str, Any],
+    agent: str,
+    model: str,
     config_path: str,
 ) -> tuple[Path, Path]:
     entries_by_id = {e.entry_id: e for e in entries}
 
-    # Per-(condition, domain) rows.
+    # Per-(agent, condition, domain) rows.
     domain_rows: dict[str, list[dict[str, Any]]] = defaultdict(list)
-    # Roll-up per condition across all domains.
-    by_condition: dict[str, list[TrialResult]] = defaultdict(list)
+    # Roll-up per agent/condition across all domains.
+    by_run: dict[tuple[str, str], list[TrialResult]] = defaultdict(list)
 
-    for (cond, domain), results in results_by_key.items():
+    for (agent_name, cond, domain), results in results_by_key.items():
         if not results:
             continue
+        if domain:
+            artifact_dir = str(Path("trials") / agent_name / cond / domain)
+        else:
+            artifact_dir = str(Path("trials") / agent_name / cond)
         domain_rows[domain].append(
             _aggregate(
                 results,
                 entries_by_id,
-                run_name=f"{cond}/{domain}",
-                artifact_dir=str(Path("trials") / cond / domain) if domain else str(Path("trials") / cond),
-                extra_tags=(f"domain={domain}",) if domain else (),
+                run_name=f"{agent_name}/{cond}/{domain}" if domain else f"{agent_name}/{cond}",
+                artifact_dir=artifact_dir,
+                extra_tags=(f"agent={agent_name}", f"domain={domain}") if domain else (f"agent={agent_name}",),
             )
         )
-        by_condition[cond].extend(results)
+        by_run[(agent_name, cond)].extend(results)
 
     overall_rows: list[dict[str, Any]] = []
-    for cond in CONDITIONS:
-        results = by_condition.get(cond, [])
+    for agent_name, cond in sorted(by_run, key=lambda x: (x[0], _condition_order(x[1]), x[1])):
+        results = by_run[(agent_name, cond)]
         if not results:
             continue
         overall_rows.append(
             _aggregate(
                 results,
                 entries_by_id,
-                run_name=cond,
-                artifact_dir=str(Path("trials") / cond),
+                run_name=f"{agent_name}/{cond}",
+                artifact_dir=str(Path("trials") / agent_name / cond),
+                extra_tags=(f"agent={agent_name}",),
             )
         )
 
+    for rows in domain_rows.values():
+        rows.sort(key=lambda row: tuple(str(row.get("run_name", "")).split("/", 2)[:2]))
+
     flat_rows = overall_rows + [r for rows in domain_rows.values() for r in rows]
     json_path = write_session_summary(
         session_dir=session_dir,
@@ -302,5 +348,5 @@ def write_summary(
         session_type="skill_eval",
         config_path=config_path,
     )
-    md_path = write_summary_md(session_dir, dict(domain_rows), overall_rows, config)
+    md_path = write_summary_md(session_dir, dict(domain_rows), overall_rows, config, agent=agent, model=model)
     return json_path, md_path
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
index 8c9a74bcf..49cfa5ef8 100644
--- a/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/runner.py
@@ -2,7 +2,7 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Per-trial runner: build sandboxed workdir, spawn `claude -p`, parse outputs."""
+"""Per-trial runner: build sandboxed workdirs, spawn an agent CLI, parse outputs."""
 
 from __future__ import annotations
 
@@ -25,7 +25,13 @@
 
 logger = logging.getLogger(__name__)
 
+BASE_CONDITION = "c1_base"
 CONDITIONS = ("c1_base", "c2_retriever", "c3_retriever_skill")
+SUPPORTED_AGENTS = ("claude", "codex")
+DEFAULT_AGENT_MODELS = {
+    "claude": "claude-opus-4-7",
+    "codex": "gpt-5.5",
+}
 
 
 @functools.lru_cache(maxsize=8)
@@ -47,6 +53,7 @@ class TrialResult:
     total_cost_usd: float
     model_id: str
     session_id: str
+    agent: str = "claude"
     input_tokens: int = 0
     output_tokens: int = 0
     cache_read_input_tokens: int = 0
@@ -64,18 +71,16 @@ class TrialResult:
     judge_score: int | None = None
     judge_reasoning: str = ""
     judge_error: str = ""
+    tool_use_summary: str = ""
+    cost_available: bool = True
 
 
 def _remap_pdf_paths(text: str, prefixes: tuple[str, ...]) -> str:
     """Rewrite caller-supplied path prefixes in *text* to ``./pdfs/``.
 
-    Some agent-eval manifests' paraphrased prompts hard-code paths from the
-    dataset source tree. Each trial workdir symlinks the domain's PDFs to
-    ``./pdfs/``, so the agent only needs the basename — rewriting the prefix
-    lets the natural-language reference resolve to a real file.
-
-    Prefixes are configured per-run via the ``testdata_prefixes`` config key
-    (no dataset paths are hardcoded in this module).
+    Some agent-eval manifests' paraphrased prompts hard-code dataset-source
+    paths in the user-facing text. Each trial workdir symlinks the domain's
+    PDFs to ``./pdfs/``, so the agent only needs the basename.
     """
     for prefix in prefixes:
         text = text.replace(prefix, "./pdfs")
@@ -147,45 +152,47 @@ def _copy_skill(skill_source: Path, dest: Path) -> None:
 
 
 def _c1_settings_json() -> str:
-    """Project-level settings for the c1_base trial.
+    """Project-level settings for the c1_base Claude trial.
 
-    `--permission-mode bypassPermissions` auto-approves tool calls that aren't
-    explicitly denied; the deny patterns below catch every reasonable path
-    into the nemo_retriever library so the agent has to fall back on CPU-only
-    primitives (Read, Grep, pdftotext, etc.).
+    ``--permission-mode bypassPermissions`` auto-approves tool calls that aren't
+    explicitly denied; these deny patterns catch every reasonable path into the
+    nemo_retriever library so Claude has to fall back on CPU-only primitives.
     """
     return json.dumps({"permissions": {"deny": list(_C1_BASH_DENY_PATTERNS)}}, indent=2) + "\n"
 
 
 def _build_condition_workdir(
+    agent: str,
     condition: str,
     root: Path,
     pdf_source: Path,
     skill_source: Path,
     domain: str = "",
 ) -> Path:
-    """Build one workdir per condition. Shared across all turns in the session.
+    """Build one workdir per agent/condition/domain session.
 
     Workdir contents:
       - pdfs/ symlink farm into the source PDF folder
-      - .claude/ sandbox (settings + per-condition skill copy)
-      - .bin/retriever shim (c1 only) so retriever is unavailable on PATH
-
-    The agent itself creates any retrieval artifacts (e.g., ./lancedb/) inside the
-    workdir on the setup turn.
+      - .claude/ sandbox (settings + per-condition skill copy for Claude)
+      - .codex/ skill copy for Codex skill-aware installations
+      - .bin/retriever shim (c1 only) so the retriever CLI is unavailable on PATH
     """
     domain_seg = f"_{domain}" if domain else ""
-    workdir = root / f"{condition}{domain_seg}_{uuid.uuid4().hex[:8]}"
+    workdir = root / f"{agent}_{condition}{domain_seg}_{uuid.uuid4().hex[:8]}"
     workdir.mkdir(parents=True, exist_ok=True)
     _build_pdf_symlinks(pdf_source, workdir / "pdfs")
-    (workdir / ".claude").mkdir(parents=True, exist_ok=True)
-    # c1 gets explicit Bash deny rules; c2/c3 keep the empty settings.json.
-    settings_text = _c1_settings_json() if condition == "c1_base" else "{}\n"
-    (workdir / ".claude" / "settings.json").write_text(settings_text, encoding="utf-8")
-    # c2 and c3 both have retriever installed AND the nemo-retriever skill loaded.
-    # The c2/c3 distinction is purely the prompt style (NL vs explicit slash command).
+
+    if agent == "claude":
+        (workdir / ".claude").mkdir(parents=True, exist_ok=True)
+        settings_text = _c1_settings_json() if condition == "c1_base" else "{}\n"
+        (workdir / ".claude" / "settings.json").write_text(settings_text, encoding="utf-8")
+
     if condition in ("c2_retriever", "c3_retriever_skill"):
-        _copy_skill(skill_source, workdir / ".claude" / "skills" / "nemo-retriever")
+        if agent == "claude":
+            _copy_skill(skill_source, workdir / ".claude" / "skills" / "nemo-retriever")
+        elif agent == "codex":
+            _copy_skill(skill_source, workdir / ".codex" / "skills" / "nemo-retriever")
+
     if condition == "c1_base":
         _write_shim(workdir / ".bin", "retriever")
         # Empty HuggingFace cache redirect; env vars are wired up in _env_for.
@@ -194,10 +201,7 @@ def _build_condition_workdir(
 
 
 def cleanup_condition_workdir(workdir: Path) -> None:
-    """Remove a condition's scratch workdir (PDFs symlinks, .claude/, agent-built
-    artifacts like .venv/, lancedb/, scratch scripts). Called after a session
-    completes and its results have been persisted to the artifact dir.
-    """
+    """Remove a condition's scratch workdir after results have been persisted."""
     if not workdir.exists():
         return
     shutil.rmtree(workdir, ignore_errors=True)
@@ -210,8 +214,6 @@ def _env_for(condition: str, workdir: Path) -> dict[str, str]:
         env["PATH"] = f"{workdir / '.bin'}{os.pathsep}{env.get('PATH', '')}"
         # Point HuggingFace cache env vars at an empty workdir-local dir so
         # any HF Python tooling the agent invokes sees no cached models.
-        # Direct filesystem reads (e.g. `ls ~/.cache/huggingface/`) are
-        # blocked separately by the Bash deny rules in settings.json.
         hf_empty = str(workdir / ".hf_empty")
         env["HF_HOME"] = hf_empty
         env["HF_HUB_CACHE"] = hf_empty
@@ -219,7 +221,7 @@ def _env_for(condition: str, workdir: Path) -> dict[str, str]:
     return env
 
 
-def _build_command(
+def _build_claude_command(
     condition: str,
     model: str,
     budget_usd: float,
@@ -228,10 +230,11 @@ def _build_command(
     *,
     resume: bool = False,
 ) -> list[str]:
-    """Build the `claude -p` command. First turn uses --session-id; subsequent turns use --resume.
+    """Build the ``claude --print`` command.
 
-    We deliberately do NOT pass --no-session-persistence because multi-turn requires
-    the session to persist between subprocess invocations.
+    First turn uses ``--session-id``; subsequent turns use ``--resume``. We
+    deliberately keep session persistence enabled because this benchmark is
+    multi-turn.
     """
     cmd = [
         "claude",
@@ -249,22 +252,73 @@ def _build_command(
         "--setting-sources",
         "project",
     ]
-    # c2/c3 run fully un-gated. c1 omits --allow-dangerously-skip-permissions
-    # so the project-level settings.json deny rules are actually consulted by
-    # Claude Code instead of being short-circuited.
+    # c2/c3 run fully ungated. c1 omits the dangerous skip flag so the
+    # project-level deny rules are consulted.
     if condition != "c1_base":
         cmd.append("--allow-dangerously-skip-permissions")
     if resume:
         cmd.extend(["--resume", session_uuid])
     else:
         cmd.extend(["--session-id", session_uuid])
-    # Only c1 disables skills entirely. c2 has the skill loaded but uses NL prompt
-    # (relying on description-based auto-discovery); c3 explicitly invokes via slash.
+    # Only c1 disables skills entirely. c2 has the skill loaded but uses an NL
+    # prompt; c3 explicitly invokes via slash.
     if condition == "c1_base":
         cmd.append("--disable-slash-commands")
     return cmd
 
 
+def _build_codex_command(
+    model: str,
+    session_uuid: str,
+    workdir: Path,
+    *,
+    resume: bool = False,
+) -> list[str]:
+    """Build a non-interactive Codex command.
+
+    Codex assigns the first session id itself; subsequent turns resume the id
+    parsed from the setup turn's JSONL events.
+    """
+    common = [  # flags shared by the first-turn and the resume invocation
+        "--json",
+        "--model",
+        model,
+        "--skip-git-repo-check",
+        "--ignore-user-config",
+        "--ignore-rules",
+        "--dangerously-bypass-approvals-and-sandbox",
+    ]
+    if resume:  # later turns: address the existing session by id, no workdir flags
+        return ["codex", "exec", "resume", *common, session_uuid, "-"]
+    return [  # first turn: session_uuid is not used; workdir goes to --cd and --add-dir
+        "codex",
+        "exec",
+        *common,
+        "--cd",
+        str(workdir),
+        "--add-dir",
+        str(workdir),
+        "-",  # NOTE(review): presumably tells codex to read the prompt from stdin - confirm
+    ]
+
+
+def _build_command(
+    *,
+    agent: str,
+    condition: str,
+    model: str,
+    budget_usd: float,
+    session_uuid: str,
+    workdir: Path,
+    resume: bool = False,
+) -> list[str]:
+    if agent == "codex":  # the codex builder takes neither condition nor budget
+        return _build_codex_command(model, session_uuid, workdir, resume=resume)
+    if agent != "claude":
+        raise ValueError(f"unsupported agent: {agent}")
+    return _build_claude_command(condition, model, budget_usd, session_uuid, workdir, resume=resume)
+
+
 def _parse_envelope(raw: str) -> dict[str, Any]:
     raw = raw.strip()
     if not raw:
@@ -284,7 +338,42 @@ def _parse_envelope(raw: str) -> dict[str, Any]:
         return {}
 
 
-def _populate_tokens(result: TrialResult, envelope: dict[str, Any]) -> None:
+def _parse_jsonl_events(raw: str) -> list[dict[str, Any]]:
+    """Decode JSONL text, keeping only the lines that parse to JSON objects."""
+    parsed: list[dict[str, Any]] = []
+    for candidate in (chunk.strip() for chunk in raw.splitlines()):
+        if candidate:
+            try:
+                record = json.loads(candidate)
+            except json.JSONDecodeError:
+                # Tolerate non-JSON noise interleaved with the event stream.
+                continue
+            if isinstance(record, dict):
+                parsed.append(record)
+    return parsed
+
+
+def _codex_session_id(events: list[dict[str, Any]], fallback: str) -> str:
+    """Return the first non-empty ``session_meta`` payload id, else *fallback*."""
+    metas = (ev.get("payload") for ev in events if ev.get("type") == "session_meta")
+    for meta in metas:
+        # Skip metadata events whose payload is missing, malformed or id-less.
+        if isinstance(meta, dict) and meta.get("id"):
+            return str(meta["id"])
+    return fallback
+
+
+def _codex_has_error(events: list[dict[str, Any]]) -> bool:
+    """Report whether any event payload carries one of the failure ``type`` values."""
+    failure_types = {"error", "task_failed", "turn_aborted"}
+    payloads = (ev.get("payload") for ev in events)
+    return any(
+        isinstance(body, dict) and body.get("type") in failure_types
+        for body in payloads
+    )
+
+
+def _populate_claude_tokens(result: TrialResult, envelope: dict[str, Any]) -> None:
     usage = envelope.get("usage") or {}
     result.input_tokens = int(usage.get("input_tokens") or 0)
     result.output_tokens = int(usage.get("output_tokens") or 0)
@@ -295,6 +384,59 @@ def _populate_tokens(result: TrialResult, envelope: dict[str, Any]) -> None:
     result.ephemeral_1h_input_tokens = int(cache_detail.get("ephemeral_1h_input_tokens") or 0)
 
 
+_CODEX_USAGE_FIELDS = (
+    "input_tokens",
+    "output_tokens",
+    "cached_input_tokens",
+    "reasoning_output_tokens",
+)
+
+
+def _extract_codex_total_usage(events: list[dict[str, Any]]) -> dict[str, int]:
+    """Return the most recent cumulative ``total_token_usage`` from codex events.
+
+    Each ``token_count`` event carries running session-wide counters; we want the
+    last populated one (null/empty ``info`` snapshots are skipped), else zeros.
+    """
+    for ev in reversed(events):
+        if ev.get("type") != "event_msg":
+            continue
+        payload = ev.get("payload") or {}
+        if not isinstance(payload, dict) or payload.get("type") != "token_count":
+            continue
+        info = payload.get("info")
+        if not isinstance(info, dict):
+            continue
+        usage = info.get("total_token_usage")
+        if not isinstance(usage, dict) or not usage:
+            continue  # usage-less snapshot: keep looking at older events
+        return {k: int(usage.get(k) or 0) for k in _CODEX_USAGE_FIELDS}
+    return {k: 0 for k in _CODEX_USAGE_FIELDS}
+
+
+def _populate_codex_tokens(
+    result: TrialResult,
+    current_totals: dict[str, int],
+    prior_totals: dict[str, int],
+) -> None:
+    """Set per-turn token fields as the delta of cumulative ``total_token_usage``.
+
+    Codex's resumed-session log is append-only across all turns, and each
+    ``token_count`` event reports cumulative counters, so per-turn cost is the
+    difference between snapshots taken before and after the subprocess call.
+    ``output_tokens`` here folds in ``reasoning_output_tokens`` so the column
+    reflects everything the model emitted, matching Claude's accounting.
+    """
+
+    def d(key: str) -> int:  # per-key delta; a missing key counts as 0
+        return max(0, current_totals.get(key, 0) - prior_totals.get(key, 0))  # never negative
+
+    result.input_tokens = d("input_tokens")
+    result.output_tokens = d("output_tokens") + d("reasoning_output_tokens")
+    result.cache_read_input_tokens = d("cached_input_tokens")
+    result.cache_creation_input_tokens = 0  # always zero here; no cache-creation counter is read
+
+
 def _parse_output_json(workdir: Path) -> tuple[str, list[dict[str, Any]], str, list[str]]:
     out_path = workdir / "output.json"
     errors: list[str] = []
@@ -332,26 +474,43 @@ def _extract_model_id(envelope: dict[str, Any], fallback: str) -> str:
     return str(envelope.get("model") or fallback)
 
 
+def _extract_claude_error_detail(envelope: dict[str, Any]) -> str:
+    """Pull the first human-readable error detail out of a Claude envelope.
+
+    Checks ``error``, ``message`` and ``result`` in that order, then falls back
+    to ``content`` (a string, or a list of dicts with ``text``); "" if none.
+    """
+    direct = next((envelope[k] for k in ("error", "message", "result") if envelope.get(k)), None)
+    if direct is not None:
+        return str(direct)
+
+    content = envelope.get("content")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        texts = [str(item["text"]) for item in content if isinstance(item, dict) and item.get("text")]
+        return " ".join(texts)
+    return ""
+
+
 _PIPELINE_SEP = re.compile(r"(?:;|&&|\|\||\||\n|\$\(|`)")
 _ENV_ASSIGN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
 _WRAPPER_CMDS = {"sudo", "time", "nice", "nohup", "exec", "env", "command", "builtin"}
 
 
 def _retriever_in_command(cmd: str) -> bool:
-    """Does this shell command line invoke the retriever CLI as a command?
+    """Return whether this shell command invokes the retriever CLI as a command.
 
-    Matches when the **executable** in any pipeline segment is the retriever
-    CLI — ``retriever``, ``./retriever``, ``/abs/path/retriever``, ``uv run
-    retriever``, or ``python -m nemo_retriever``. Deliberately does *not*
-    match cases where ``retriever`` appears only as a path argument (e.g.
-    ``cat .bin/retriever``, ``ls /path/retriever/``, ``echo "use retriever"``).
+    Matches when the executable in any pipeline segment is the retriever CLI:
+    ``retriever``, ``./retriever``, ``/abs/path/retriever``, ``uv run
+    retriever``, or ``python -m nemo_retriever``. Deliberately does not match
+    cases where ``retriever`` appears only as a path argument or prose.
     """
     if not cmd:
         return False
 
     for segment in _PIPELINE_SEP.split(cmd):
         seg = segment.strip()
-        # Strip leading env-var assignments and command wrappers (sudo, time, ...).
         while seg:
             first = seg.split(None, 1)
             if not first:
@@ -371,17 +530,11 @@ def _retriever_in_command(cmd: str) -> bool:
         if head == "retriever" or head == "./retriever":
             return True
         if head.endswith("/retriever") and "/" in head[: -len("/retriever") + 1]:
-            # An absolute or relative path whose final component is `retriever`,
-            # e.g. /home/.../venv/bin/retriever. Reject pure ``/retriever`` which
-            # is implausible as a real binary path. Also reject ``.bin/retriever``
-            # paths: c1_base's workdir setup installs a deny-shim with that exact
-            # name (see ``_write_shim``); invoking the shim is the *opposite* of
-            # using the real retriever CLI.
+            # Reject c1_base's deny shim; invoking it is the opposite of using
+            # the real retriever CLI.
             if "/.bin/retriever" in head:
                 continue
             return True
-        # ``uv run retriever ...`` and ``python -m nemo_retriever ...`` —
-        # check the first two tokens of the segment.
         tokens = seg.split()
         if len(tokens) >= 3 and tokens[0] == "uv" and tokens[1] == "run" and tokens[2] == "retriever":
             return True
@@ -396,33 +549,271 @@ def _retriever_in_command(cmd: str) -> bool:
 
 
 def _claude_session_log_path(workdir: Path, session_uuid: str) -> Path:
-    """Claude Code persists per-session transcripts at
-    ``~/.claude/projects/<slug>/<session_id>.jsonl`` where ``<slug>`` is the
-    project dir with ``/`` and ``_`` both replaced by ``-`` (and a leading ``-``
-    preserved for the filesystem root).
-    """
+    """Return Claude Code's per-session JSONL transcript path."""
     slug = str(workdir).replace("/", "-").replace("_", "-")
     if not slug.startswith("-"):
         slug = "-" + slug
     return Path.home() / ".claude" / "projects" / slug / f"{session_uuid}.jsonl"
 
 
-def _scan_transcript_for_signals(
-    envelope: dict[str, Any],
-    workdir: Path | None = None,
-    session_uuid: str | None = None,
-) -> tuple[int | None, bool]:
-    """Detect whether the agent invoked the ``retriever`` CLI.
+def _codex_session_log_path(session_uuid: str) -> Path | None:
+    sessions_root = Path.home() / ".codex" / "sessions"  # searched recursively below
+    if not sessions_root.exists():
+        return None  # no sessions directory under the home dir
+    matches = sorted(
+        sessions_root.glob(f"**/*{session_uuid}.jsonl"),  # any depth; name ends with "<id>.jsonl"
+        key=lambda p: p.stat().st_mtime if p.exists() else 0,  # vanished files sort as oldest
+        reverse=True,  # newest first
+    )
+    return matches[0] if matches else None  # most recently modified match, if any
 
-    Primary signal: scan the Claude Code session jsonl for tool-use entries that
-    spawn a shell command containing ``retriever``. This catches every actual
-    invocation, regardless of whether the agent quoted it in its final reply.
 
-    Fallback signal: if the session log isn't accessible (older runs, missing
-    file), look for ``retriever`` in the envelope's ``result`` text — the legacy
-    proxy. This undercounts but never overcounts.
-    """
-    # Primary: tool-call trace.
+def _codex_session_meta_from_log(path: Path) -> dict[str, Any]:
+    """Return the first ``session_meta`` event's payload from *path* ({} if absent/invalid)."""
+    try:
+        with path.open(encoding="utf-8") as f:
+            for raw_line in f:
+                try:
+                    # Blank lines fail to decode as well, so they are skipped here.
+                    ev = json.loads(raw_line.strip())
+                except json.JSONDecodeError:
+                    continue
+                # Valid JSON that is not an object (list, scalar) cannot be an event.
+                if not isinstance(ev, dict) or ev.get("type") != "session_meta":
+                    continue
+                payload = ev.get("payload") or {}
+                return payload if isinstance(payload, dict) else {}
+    except OSError:
+        return {}
+    return {}
+
+
+def _codex_session_log_for_workdir(workdir: Path) -> Path | None:
+    sessions_root = Path.home() / ".codex" / "sessions"  # searched recursively below
+    if not sessions_root.exists():
+        return None  # no sessions directory under the home dir
+    workdir_str = str(workdir)  # compared verbatim against each log's recorded cwd
+    matches = sorted(
+        sessions_root.glob("**/rollout-*.jsonl"),  # every rollout log at any depth
+        key=lambda p: p.stat().st_mtime if p.exists() else 0,  # vanished files sort as oldest
+        reverse=True,  # newest first, so the first cwd match is the most recent log
+    )
+    for path in matches:
+        meta = _codex_session_meta_from_log(path)  # opens and scans each candidate log
+        if str(meta.get("cwd") or "") == workdir_str:
+            return path
+    return None  # no log recorded this workdir as its cwd
+
+
+def _read_jsonl_events(path: Path | None) -> list[dict[str, Any]]:
+    if path is not None:
+        try:
+            return _parse_jsonl_events(path.read_text(encoding="utf-8"))
+        except OSError:
+            pass  # unreadable log: fall through to the empty result
+    return []  # no path given, or the file could not be read
+
+
+_TRACE_TOOL_INPUT_CAP = 200
+_TRACE_FINAL_TEXT_CAP = 400
+
+
+def _truncate(s: str, cap: int) -> str:
+    s = " ".join(s.split())  # collapse whitespace runs so the result is a single line
+    return s if len(s) <= cap else s[: max(0, cap - 3)] + "..."  # marker counts toward cap
+
+
+def _format_tool_input(name: str, inp: dict[str, Any]) -> str:
+    """Summarize one Claude ``tool_use`` input as a compact single-line label."""
+    if name in ("Edit", "Write"):
+        return f"{name}: {inp.get('file_path', '')}"
+    if name == "Glob":
+        return f"Glob: {inp.get('pattern', '')}"
+    if name == "Grep":
+        needle = _truncate(str(inp.get("pattern", "")), 80)
+        where = str(inp.get("path", ""))
+        return f"Grep: pattern={needle} path={where}"
+    if name == "Read":
+        offset, limit = inp.get("offset"), inp.get("limit")
+        # Show the read window only when the call actually specified one.
+        window = "" if offset is None and limit is None else f" offset={offset} limit={limit}"
+        return f"Read: {str(inp.get('file_path', ''))}{window}"
+    if name == "Bash":
+        shell_cmd = _truncate(str(inp.get("command", "")), _TRACE_TOOL_INPUT_CAP)
+        return f"Bash: {shell_cmd}"
+    # Unknown tools: dump every input as a truncated key=value pair.
+    pairs = " ".join(f"{k}={_truncate(str(v), 80)}" for k, v in inp.items())
+    return f"{name}: {pairs}" if inp else name
+
+
+def _extract_claude_compact_trace(workdir: Path, session_uuid: str) -> str | None:
+    """Emit a compact per-turn trace of a Claude Code JSONL transcript (None if unavailable)."""
+    log_path = _claude_session_log_path(workdir, session_uuid)
+    if not log_path.exists():
+        return None
+
+    turn_idx = 0
+    lines_out: list[str] = []
+    current_assistant_text: list[str] = []  # assistant prose buffered until the next user record
+    try:
+        with log_path.open(encoding="utf-8") as f:
+            for raw_line in f:
+                raw_line = raw_line.strip()
+                if not raw_line:
+                    continue
+                try:
+                    ev = json.loads(raw_line)
+                except json.JSONDecodeError:
+                    continue
+                msg = ev.get("message") or {}
+                role = msg.get("role") or ev.get("type")  # fall back to the record type
+                content = msg.get("content")
+
+                if role == "user":
+                    if current_assistant_text:
+                        joined = " ".join(current_assistant_text).strip()
+                        if joined:
+                            lines_out.append(f"  assistant: {_truncate(joined, _TRACE_FINAL_TEXT_CAP)}")
+                        current_assistant_text = []
+                    user_text = ""
+                    if isinstance(content, str):
+                        user_text = content
+                    elif isinstance(content, list):
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                user_text = str(item.get("text", ""))
+                                break
+                    if not user_text:
+                        continue  # e.g. tool_result-only user record: tool output, not a new prompt
+                    turn_idx += 1
+                    label = "setup" if turn_idx == 1 else f"query {turn_idx - 1}"
+                    lines_out += ["", f"[Turn {turn_idx} - {label}]"]
+                    lines_out.append(f"  user: {_truncate(user_text, _TRACE_FINAL_TEXT_CAP)}")
+                elif role == "assistant" and isinstance(content, list):
+                    for item in content:
+                        if not isinstance(item, dict):
+                            continue
+                        itype = item.get("type")
+                        if itype == "tool_use":
+                            name = str(item.get("name", "?"))
+                            inp = item.get("input") or {}
+                            if isinstance(inp, dict):
+                                lines_out.append(f"  tool_use {_format_tool_input(name, inp)}")
+                            else:
+                                lines_out.append(f"  tool_use {name}")
+                        elif itype == "text":
+                            text = str(item.get("text", "")).strip()
+                            if text:
+                                current_assistant_text.append(text)
+    except OSError:
+        return None  # unreadable transcript: report no trace
+
+    if current_assistant_text:  # flush prose that followed the last user record
+        joined = " ".join(current_assistant_text).strip()
+        if joined:
+            lines_out.append(f"  assistant: {_truncate(joined, _TRACE_FINAL_TEXT_CAP)}")
+
+    trace = "\n".join(lines_out).strip()
+    return trace or None
+
+
+def _string_from_content_items(content: Any, *, input_text: bool = True) -> str:
+    """Flatten a message ``content`` value into one space-joined string.
+
+    Strings pass through; lists contribute the ``text`` of dict items typed
+    ``text`` or ``input_text``/``output_text`` (per *input_text*); else "".
+    """
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    accepted = {"input_text" if input_text else "output_text", "text"}
+    typed = (item for item in content if isinstance(item, dict) and item.get("type") in accepted)
+    texts = (str(item.get("text") or "") for item in typed)
+    return " ".join(t for t in texts if t).strip()
+
+
+def _codex_tool_arguments(payload: dict[str, Any]) -> Any:
+    """Decode a function call's ``arguments``; non-JSON strings pass through."""
+    raw_args = payload.get("arguments") or ""
+    try:
+        # Only string arguments are JSON-encoded; anything else is returned as-is.
+        return json.loads(raw_args) if isinstance(raw_args, str) else raw_args
+    except json.JSONDecodeError:
+        return raw_args
+
+
+def _codex_tool_command(payload: dict[str, Any]) -> str:
+    """Render a codex function call's decoded arguments as command text."""
+    decoded = _codex_tool_arguments(payload)
+    if not isinstance(decoded, dict):
+        return str(decoded)
+    # Prefer a string under "cmd", then "command"; otherwise dump all arguments.
+    candidates = (decoded.get(key) for key in ("cmd", "command"))
+    found = next((c for c in candidates if isinstance(c, str)), None)
+    return found if found is not None else json.dumps(decoded, sort_keys=False)
+
+
+def _format_codex_tool_input(payload: dict[str, Any]) -> str:
+    """Render a codex function call as ``name: <truncated arguments>``."""
+    decoded = _codex_tool_arguments(payload)
+    rendered = decoded if isinstance(decoded, str) else json.dumps(decoded, sort_keys=False)
+    tool = str(payload.get("name") or "?")
+    return f"{tool}: {_truncate(rendered, _TRACE_TOOL_INPUT_CAP)}"
+
+
+def _extract_codex_compact_trace(session_uuid: str) -> str | None:
+    log_path = _codex_session_log_path(session_uuid)  # may be None; the reader then yields []
+    events = _read_jsonl_events(log_path)
+    if not events:
+        return None  # no log found, unreadable, or no JSON-object lines
+
+    turn_idx = 0  # incremented on every user_message event
+    lines_out: list[str] = []
+    for ev in events:
+        etype = ev.get("type")
+        payload = ev.get("payload") or {}
+        if not isinstance(payload, dict):
+            continue
+
+        if etype == "event_msg" and payload.get("type") == "user_message":
+            turn_idx += 1
+            label = "setup" if turn_idx == 1 else f"query {turn_idx - 1}"
+            lines_out.append("")  # blank separator before each turn header
+            lines_out.append(f"[Turn {turn_idx} - {label}]")
+            text = str(payload.get("message") or "")
+            if text:
+                lines_out.append(f"  user: {_truncate(text, _TRACE_FINAL_TEXT_CAP)}")
+        elif etype == "event_msg" and payload.get("type") == "agent_message":
+            text = str(payload.get("message") or "")
+            if text:
+                lines_out.append(f"  assistant: {_truncate(text, _TRACE_FINAL_TEXT_CAP)}")
+        elif etype == "response_item":
+            ptype = payload.get("type")
+            if ptype == "function_call":
+                lines_out.append(f"  tool_use {_format_codex_tool_input(payload)}")
+            elif ptype == "message" and payload.get("role") == "assistant":
+                # NOTE(review): may duplicate an agent_message line if the log records both - confirm
+                text = _string_from_content_items(payload.get("content"), input_text=False)
+                if text:
+                    lines_out.append(f"  assistant: {_truncate(text, _TRACE_FINAL_TEXT_CAP)}")
+
+    trace = "\n".join(lines_out).strip()  # strip drops the leading blank separator
+    return trace or None  # an empty trace is reported as None
+
+
+def extract_compact_trace(agent: str, workdir: Path, session_uuid: str) -> str | None:
+    if agent == "codex":
+        return _extract_codex_compact_trace(session_uuid)  # workdir is not needed
+    if agent != "claude":
+        return None  # unknown agent: no trace
+    return _extract_claude_compact_trace(workdir, session_uuid)
+
+
+def _scan_claude_transcript_for_signals(
+    envelope: dict[str, Any],
+    workdir: Path | None,
+    session_uuid: str | None,
+) -> tuple[int | None, bool]:
     if workdir is not None and session_uuid:
         log_path = _claude_session_log_path(workdir, session_uuid)
         if log_path.exists():
@@ -443,25 +834,68 @@ def _scan_transcript_for_signals(
                         for item in content:
                             if not isinstance(item, dict):
                                 continue
-                            if item.get("type") != "tool_use":
-                                continue
-                            if item.get("name") != "Bash":
+                            if item.get("type") != "tool_use" or item.get("name") != "Bash":
                                 continue
                             cmd = (item.get("input") or {}).get("command") or ""
                             if _retriever_in_command(cmd):
                                 return 1, True
                 return None, False
             except OSError:
-                pass  # fall through to fallback
+                pass
 
-    # Fallback: scan the assistant's final text.
     text = str(envelope.get("result") or "")
     used = "retriever " in text or "\nretriever\n" in text
     return (1 if used else None), used
 
 
+def _scan_codex_transcript_for_signals(
+    session_uuid: str,
+    fallback_events: list[dict[str, Any]],
+) -> tuple[int | None, bool]:
+    """Detect ``retriever`` CLI use in a Codex session: ``(1, True)`` if seen, else ``(None, False)``.
+
+    The session log path may be ``None``; then, or when the log has no events, ``fallback_events`` is scanned.
+    """
+    log_path = _codex_session_log_path(session_uuid)
+    events = (_read_jsonl_events(log_path) if log_path is not None else []) or fallback_events
+    payloads = (ev.get("payload") for ev in events if ev.get("type") == "response_item")
+    calls = (p for p in payloads if isinstance(p, dict) and p.get("type") == "function_call")
+    if any(_retriever_in_command(_codex_tool_command(p)) for p in calls):
+        return 1, True
+
+    text_parts: list[str] = []
+    for ev in events:
+        payload = ev.get("payload") or {}
+        if not isinstance(payload, dict):
+            continue
+        if ev.get("type") == "event_msg" and payload.get("type") == "agent_message":
+            text_parts.append(str(payload.get("message") or ""))
+        elif ev.get("type") == "response_item" and payload.get("type") == "message":
+            text_parts.append(_string_from_content_items(payload.get("content"), input_text=False))
+    text = "\n".join(text_parts)
+    used = "retriever " in text or "\nretriever\n" in text
+    return (1 if used else None), used
+
+
+def _scan_transcript_for_signals(
+    *,
+    agent: str,
+    envelope: dict[str, Any],
+    codex_events: list[dict[str, Any]],
+    workdir: Path | None = None,
+    session_uuid: str | None = None,
+) -> tuple[int | None, bool]:
+    """Route ``retriever`` CLI detection to the scanner for ``agent``; ``(None, False)`` if none applies."""
+    if agent == "codex" and session_uuid:
+        return _scan_codex_transcript_for_signals(session_uuid, codex_events)
+    if agent == "claude":
+        return _scan_claude_transcript_for_signals(envelope, workdir, session_uuid)
+    return None, False
+
+
 def _run_one_turn(
     *,
+    agent: str,
     condition: str,
     prompt: str,
     trial_id: str,
@@ -477,15 +911,21 @@ def _run_one_turn(
     timeout_s: int,
     model: str,
 ) -> TrialResult:
-    """Execute one turn. Query turns (is_setup=False) expect the agent to write
-    ./output.json; the setup turn does not."""
+    """Execute one turn. Query turns expect the agent to write ``./output.json``."""
     out_path = workdir / "output.json"
     if out_path.exists():
         out_path.unlink()
 
     domain_tag = f"[{domain}] " if domain else ""
     label = "setup" if is_setup else f"entry_id={entry_id}, query_id={query_id}"
-    logger.info("turn %d for %s %s(%s)", turn_idx + 1, condition, domain_tag, label)
+    logger.info("turn %d for %s/%s %s(%s)", turn_idx + 1, agent, condition, domain_tag, label)
+
+    prior_codex_usage: dict[str, int] = {k: 0 for k in _CODEX_USAGE_FIELDS}
+    if agent == "codex":
+        prior_log = _codex_session_log_path(session_uuid)
+        if prior_log is not None:
+            prior_codex_usage = _extract_codex_total_usage(_read_jsonl_events(prior_log))
+
     t0 = time.monotonic()
     try:
         proc = subprocess.run(
@@ -512,34 +952,74 @@ def _run_one_turn(
             total_cost_usd=0.0,
             model_id=model,
             session_id=session_uuid,
+            agent=agent,
             errors=[f"turn exceeded {timeout_s}s wall timeout"],
             is_setup=is_setup,
             domain=domain,
+            cost_available=(agent == "claude"),
+        )
+
+    elapsed_ms = int((time.monotonic() - t0) * 1000)
+    envelope: dict[str, Any] = {}
+    codex_events: list[dict[str, Any]] = []
+    token_events: list[dict[str, Any]] = []
+    if agent == "claude":
+        envelope = _parse_envelope(proc.stdout)
+        agent_error = bool(envelope.get("is_error", False))
+        duration_ms = int(envelope.get("duration_ms") or elapsed_ms)
+        duration_api_ms = int(envelope.get("duration_api_ms") or 0)
+        total_cost_usd = float(envelope.get("total_cost_usd") or 0.0)
+        model_id = _extract_model_id(envelope, fallback=model)
+        actual_session_id = str(envelope.get("session_id") or session_uuid)
+    else:
+        codex_events = _parse_jsonl_events(proc.stdout)
+        agent_error = _codex_has_error(codex_events)
+        duration_ms = elapsed_ms
+        duration_api_ms = 0
+        total_cost_usd = 0.0
+        model_id = model
+        log_path = _codex_session_log_path(session_uuid)
+        if log_path is None and is_setup:
+            log_path = _codex_session_log_for_workdir(workdir)
+        token_events = _read_jsonl_events(log_path) if log_path is not None else codex_events
+        actual_session_id = _codex_session_id(
+            token_events,
+            fallback=_codex_session_id(codex_events, fallback=session_uuid),
         )
 
-    envelope = _parse_envelope(proc.stdout)
     stderr = proc.stderr.strip()
     result = TrialResult(
         trial_id=trial_id,
         condition=condition,
         entry_id=entry_id,
         query_id=query_id,
-        status="ok" if proc.returncode == 0 and not envelope.get("is_error", False) else "error",
+        status="ok" if proc.returncode == 0 and not agent_error else "error",
         extraction_method="n/a" if is_setup else "output_json",
-        duration_ms=int(envelope.get("duration_ms") or (time.monotonic() - t0) * 1000),
-        duration_api_ms=int(envelope.get("duration_api_ms") or 0),
+        duration_ms=duration_ms,
+        duration_api_ms=duration_api_ms,
         num_turns=turn_idx + 1,
-        total_cost_usd=float(envelope.get("total_cost_usd") or 0.0),
-        model_id=_extract_model_id(envelope, fallback=model),
-        session_id=str(envelope.get("session_id") or session_uuid),
+        total_cost_usd=total_cost_usd,
+        model_id=model_id,
+        session_id=actual_session_id,
+        agent=agent,
         is_setup=is_setup,
         domain=domain,
+        cost_available=(agent == "claude"),
     )
-    _populate_tokens(result, envelope)
+    if agent == "claude":
+        _populate_claude_tokens(result, envelope)
+    else:
+        current_codex_usage = _extract_codex_total_usage(token_events or codex_events)
+        _populate_codex_tokens(result, current_codex_usage, prior_codex_usage)
     if proc.returncode != 0:
         result.errors.append(f"non-zero exit {proc.returncode}")
-    if envelope.get("is_error"):
+    if agent == "claude" and envelope.get("is_error"):
         result.errors.append(f"envelope is_error: {envelope.get('subtype') or '?'}")
+        detail = _extract_claude_error_detail(envelope)
+        if detail:
+            result.errors.append(f"claude error: {detail[:500]}")
+    if agent == "codex" and agent_error:
+        result.errors.append("codex event stream reported an error")
     if stderr:
         result.errors.append(f"stderr: {stderr[:500]}")
 
@@ -557,24 +1037,39 @@ def _run_one_turn(
         if out_path.exists():
             out_path.rename(workdir / f"output_e{entry_id}.json")
 
-    first_use, used = _scan_transcript_for_signals(envelope, workdir=workdir, session_uuid=session_uuid)
+    first_use, used = _scan_transcript_for_signals(
+        agent=agent,
+        envelope=envelope,
+        codex_events=codex_events,
+        workdir=workdir,
+        session_uuid=actual_session_id,
+    )
     result.retriever_first_use_turn = first_use
     result.retriever_used_ever = used
-    # c1 has the skill unavailable; leave skill_fired=None to distinguish from "loaded but didn't fire".
+    # c1 has the skill unavailable; leave skill_fired=None to distinguish from
+    # "loaded but didn't fire".
     if condition in ("c2_retriever", "c3_retriever_skill"):
         result.skill_fired = used and (first_use is not None) and first_use <= 2
     return result
 
 
+UNSCORABLE_JUDGE_ERRORS: frozenset[str] = frozenset({"no_ground_truth", "empty_candidate"})
+
+
 def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None:
     """Score ``result.final_answer`` against ``entry.ground_truth_answer``.
 
-    Mutates the result in place. Skips silently when the judge is unset, the
-    ground-truth answer is empty, or the trial didn't produce a final answer.
-    Errors are recorded on the result rather than raised so a flaky judge
-    endpoint never breaks an in-flight session.
+    Missing ground truth and empty candidates are recorded as terminal
+    ``judge_error`` values so ``rescore`` can skip intrinsically unscorable
+    trials instead of retrying them forever.
     """
-    if judge is None or not entry.ground_truth_answer or not result.final_answer:
+    if judge is None:
+        return
+    if not entry.ground_truth_answer:
+        result.judge_error = "no_ground_truth"
+        return
+    if not result.final_answer:
+        result.judge_error = "empty_candidate"
         return
     try:
         verdict = judge.judge(
@@ -582,7 +1077,7 @@ def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None:
             reference=entry.ground_truth_answer,
             candidate=result.final_answer,
         )
-    except Exception as exc:  # defensive — LLMJudge already catches, but be safe.
+    except Exception as exc:
         result.judge_error = f"judge_invocation_error: {exc}"
         logger.warning("LLMJudge raised for entry_id=%s: %s", result.entry_id, exc, exc_info=True)
         return
@@ -594,6 +1089,7 @@ def _apply_judge(judge: Any, entry: DatasetEntry, result: TrialResult) -> None:
 
 def run_condition(
     *,
+    agent: str,
     condition: str,
     entries: list[DatasetEntry],
     workdir_root: Path,
@@ -607,20 +1103,17 @@ def run_condition(
     judge: Any = None,
     testdata_prefixes: tuple[str, ...] = (),
 ) -> tuple[Path, list[TrialResult]]:
-    """Run one Claude Code session covering setup + all `entries` for `condition`.
-
-    Turn 1 creates the session via --session-id; subsequent turns resume it. The
-    first TrialResult has is_setup=True; the rest are query results, one per entry.
-    All ``entries`` are expected to share the same ``domain`` (the caller groups
-    by domain so each session sees a single PDF corpus).
-    """
+    """Run one agent session covering setup + all entries for one condition."""
+    if agent not in SUPPORTED_AGENTS:
+        raise ValueError(f"unsupported agent: {agent}")
     if condition not in CONDITIONS:
         raise ValueError(f"unknown condition: {condition}")
-    workdir = _build_condition_workdir(condition, workdir_root, pdf_source, skill_source, domain=domain)
+    workdir = _build_condition_workdir(agent, condition, workdir_root, pdf_source, skill_source, domain=domain)
     session_uuid = str(uuid.uuid4())
     env = _env_for(condition, workdir)
     logger.info(
-        "starting session for %s/%s: workdir=%s session_id=%s",
+        "starting session for %s/%s/%s: workdir=%s session_id=%s",
+        agent,
         condition,
         domain or "default",
         workdir,
@@ -629,9 +1122,18 @@ def run_condition(
 
     results: list[TrialResult] = []
 
-    setup_trial_id = f"{condition}_{domain or 'default'}_setup_t1"
-    setup_cmd = _build_command(condition, model, budget_usd, session_uuid, workdir, resume=False)
+    setup_trial_id = f"{agent}_{condition}_{domain or 'default'}_setup_t1"
+    setup_cmd = _build_command(
+        agent=agent,
+        condition=condition,
+        model=model,
+        budget_usd=budget_usd,
+        session_uuid=session_uuid,
+        workdir=workdir,
+        resume=False,
+    )
     setup_result = _run_one_turn(
+        agent=agent,
         condition=condition,
         prompt=_render_setup_prompt(condition, domain_label),
         trial_id=setup_trial_id,
@@ -649,13 +1151,33 @@ def run_condition(
     )
     results.append(setup_result)
 
-    resume_cmd = _build_command(condition, model, budget_usd, session_uuid, workdir, resume=True)
+    if setup_result.status != "ok":
+        logger.warning(
+            "setup turn failed for %s/%s/%s; skipping %d query turns",
+            agent,
+            condition,
+            domain or "default",
+            len(entries),
+        )
+        return workdir, results
+
+    session_uuid = setup_result.session_id or session_uuid
+    resume_cmd = _build_command(
+        agent=agent,
+        condition=condition,
+        model=model,
+        budget_usd=budget_usd,
+        session_uuid=session_uuid,
+        workdir=workdir,
+        resume=True,
+    )
     for i, entry in enumerate(entries):
         turn_idx = i + 1
         result = _run_one_turn(
+            agent=agent,
             condition=condition,
             prompt=_render_prompt(entry, condition, testdata_prefixes),
-            trial_id=f"{condition}_{domain or 'default'}_e{entry.entry_id}_t{turn_idx + 1}",
+            trial_id=f"{agent}_{condition}_{domain or 'default'}_e{entry.entry_id}_t{turn_idx + 1}",
             entry_id=entry.entry_id,
             query_id=entry.query_id,
             domain=domain,
@@ -674,10 +1196,43 @@ def run_condition(
 
 
 def save_trial(result: TrialResult, session_dir: Path) -> Path:
-    parts = [session_dir, "trials", result.condition]
+    parts = [session_dir, "trials", result.agent, result.condition]
     if result.domain:
         parts.append(result.domain)
     out = Path(*[str(p) for p in parts]) / f"{result.trial_id}.json"
     out.parent.mkdir(parents=True, exist_ok=True)
     out.write_text(json.dumps(asdict(result), indent=2) + "\n", encoding="utf-8")
     return out
+
+
+def archive_session_log(
+    *,
+    session_dir: Path,
+    agent: str,
+    condition: str,
+    domain: str,
+    session_uuid: str,
+    workdir: Path,
+) -> Path | None:
+    """Preserve the agent's rollout log in the artifact dir so it outlives ``cleanup_condition_workdir``.
+
+    The log is copied under ``<session_dir>/trials/<agent>/<condition>[/<domain>]/logs/`` so the raw
+    event stream stays available for recomputing token deltas or tool-use signals later.
+    Returns the copy's path, or ``None`` for an unknown agent or a missing log.
+    """
+    if agent not in ("claude", "codex"):
+        return None
+    if agent == "claude":
+        log_src = _claude_session_log_path(workdir, session_uuid)
+    else:
+        log_src = _codex_session_log_path(session_uuid)
+    if log_src is None or not log_src.exists():
+        return None
+    archive_root = Path(str(session_dir), "trials", str(agent), str(condition))
+    if domain:
+        archive_root = archive_root / str(domain)
+    logs_dir = archive_root / "logs"
+    logs_dir.mkdir(parents=True, exist_ok=True)
+    dest = logs_dir / log_src.name
+    shutil.copy2(log_src, dest)
+    return dest
diff --git a/nemo_retriever/src/nemo_retriever/skill_eval/trace_summarizer.py b/nemo_retriever/src/nemo_retriever/skill_eval/trace_summarizer.py
new file mode 100644
index 000000000..1a6b087eb
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/skill_eval/trace_summarizer.py
@@ -0,0 +1,112 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""LLM-generated tool-use summaries via the ``claude`` CLI.
+
+Reads a compact trace of one agent session (setup turn + N query turns) and
+asks a strong Anthropic model to narrate what the agent did: which tools it
+called, in what order, what strategy it took, and where it improvised.
+
+Shells out to ``claude --print`` so it reuses Claude Code's existing auth. Each
+call runs in a neutral temp cwd with ``--setting-sources user`` so project-level
+skills or settings do not leak into the summarization session.
+"""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+_SUMMARIZER_PROMPT_TEMPLATE = """\
+You are summarizing the tool-use trace of a coding agent that just ran an
+information-retrieval benchmark over a corpus of PDFs.
+
+Produce a concise markdown narrative with these sections:
+
+**Overall strategy** - one or two sentences. What approach did the agent take?
+Did it build an index, fall back to grep/pdftotext, use a skill?
+
+**Tool-use breakdown** - bulleted list of tool names with counts and one or two
+representative invocations each. Keep inputs short.
+
+**Notable patterns** - retries, dead ends, fallback chains, suspicious behavior.
+Skip this section if nothing stands out.
+
+**Per-question variation** - only include if the agent's approach changed
+between query turns. Otherwise omit.
+
+Be terse. Aim for under 250 words total. Do not editorialize about whether the
+strategy was good or bad; just describe what happened.
+
+---
+
+Condition: {condition}
+Domain: {domain}
+
+Trace:
+{trace}
+"""
+
+_DEFAULT_MODEL = "claude-opus-4-7"
+
+
+class TraceSummarizer:
+    """Per-session tool-use narrator backed by the ``claude`` CLI.
+
+    Every failure mode (timeout, CLI missing or not launchable, non-zero exit)
+    is logged and reported as an empty summary rather than raised.
+    """
+
+    def __init__(self, *, model: str = _DEFAULT_MODEL, timeout: float = 120.0) -> None:
+        """Store the ``--model`` value and the per-call timeout in seconds."""
+        self.model = model
+        self._timeout = timeout
+
+    @classmethod
+    def from_kwargs(cls, **kwargs) -> "TraceSummarizer":
+        """Alternate constructor: forward ``kwargs`` unchanged to ``__init__``."""
+        return cls(**kwargs)
+
+    def summarize(self, condition: str, domain: str, trace: str) -> str:
+        """Return a markdown narrative of ``trace``. Empty string on failure.
+
+        The rendered prompt is piped to ``claude --print`` on stdin; a blank
+        ``trace`` short-circuits to ``""`` without invoking the CLI.
+        """
+        if not trace.strip():
+            return ""
+
+        prompt = _SUMMARIZER_PROMPT_TEMPLATE.format(condition=condition, domain=domain, trace=trace)
+        cmd = ["claude", "--print", "--model", self.model, "--setting-sources", "user"]
+        # Neutral temp cwd keeps project-level skills/settings out of the summarization session.
+        with tempfile.TemporaryDirectory(prefix="skill_eval_summarize_") as tmpdir:
+            try:
+                proc = subprocess.run(
+                    cmd,
+                    input=prompt,
+                    capture_output=True,
+                    text=True,
+                    encoding="utf-8",  # traces may hold non-ASCII text; don't depend on the locale
+                    errors="replace",
+                    timeout=self._timeout,
+                    cwd=tmpdir,
+                    check=False,
+                )
+            except subprocess.TimeoutExpired:
+                logger.warning("trace summarizer timed out after %ss", self._timeout)
+                return ""
+            except FileNotFoundError:
+                logger.warning("trace summarizer: `claude` CLI not on PATH")
+                return ""
+            except OSError as exc:  # e.g. PermissionError: found but not executable
+                logger.warning("trace summarizer: could not launch `claude`: %s", exc)
+                return ""
+
+        if proc.returncode != 0:
+            logger.warning("trace summarizer exited %d: %s", proc.returncode, (proc.stderr or "")[:300])
+            return ""
+        return (proc.stdout or "").strip()