From 349236775bc75668e7c2d1622e786439d0c8690f Mon Sep 17 00:00:00 2001
From: Peyton Montei <peyton@entire.io>
Date: Thu, 4 Jun 2026 13:24:26 -0400
Subject: [PATCH 1/4] feat(review): codex review correctness (skills,
 discovery, live tokens)

Agent-layer correctness for `entire review`, independent of the command
surface (no cmd/setup/fix churn):

- Codex invokes its real review skill verbatim (no /review paraphrase);
  codex's skill system loads the matching SKILL.md. Adds per-spawn
  review.<agent>.model / reasoning_effort, threaded through RunConfig into
  codex CLI flags -m / -c model_reasoning_effort=.
- Codex skill discovery in $name form over ~/.codex/{skills,plugins,superpowers};
  lifts the generic SKILL.md scanners + version dedupe + frontmatter parse into
  the shared skilldiscovery package (SlashForm/DollarForm); claude delegates to
  it. Codex curated built-ins dropped (discovered on disk, not bundled).
- Honest live tokens: claude emits input-only Tokens during streaming (output
  snapshot is misleading) with final totals on result; codex tails its rollout
  transcript (located by thread_id) and emits cumulative Tokens per token_count,
  since `codex exec --json` only carries usage at turn.completed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 4cbe537bd34d
---
 cmd/entire/cli/agent/claudecode/discovery.go  | 284 ++----------------
 cmd/entire/cli/agent/claudecode/reviewer.go   |  34 ++-
 .../cli/agent/claudecode/reviewer_test.go     |  77 ++++-
 .../testdata/stream_with_deltas.jsonl         |   7 +
 cmd/entire/cli/agent/codex/AGENT.md           |   8 +
 cmd/entire/cli/agent/codex/discovery.go       |  49 ++-
 cmd/entire/cli/agent/codex/discovery_test.go  | 104 ++++++-
 cmd/entire/cli/agent/codex/review_tokens.go   | 134 +++++++++
 .../cli/agent/codex/review_tokens_test.go     | 128 ++++++++
 cmd/entire/cli/agent/codex/reviewer.go        | 135 ++++++---
 cmd/entire/cli/agent/codex/reviewer_test.go   |  87 +++++-
 .../cli/agent/skilldiscovery/registry.go      |  19 +-
 .../cli/agent/skilldiscovery/registry_test.go |   6 +-
 cmd/entire/cli/agent/skilldiscovery/scan.go   | 257 ++++++++++++++++
 cmd/entire/cli/review/types/reviewer.go       |  12 +
 cmd/entire/cli/settings/settings.go           |  13 +-
 16 files changed, 1028 insertions(+), 326 deletions(-)
 create mode 100644 cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl
 create mode 100644 cmd/entire/cli/agent/codex/review_tokens.go
 create mode 100644 cmd/entire/cli/agent/codex/review_tokens_test.go
 create mode 100644 cmd/entire/cli/agent/skilldiscovery/scan.go
diff --git a/cmd/entire/cli/agent/claudecode/discovery.go b/cmd/entire/cli/agent/claudecode/discovery.go
index 9566901d8..e2d2f8619 100644
--- a/cmd/entire/cli/agent/claudecode/discovery.go
+++ b/cmd/entire/cli/agent/claudecode/discovery.go
@@ -2,14 +2,9 @@ package claudecode
 
 import (
 	"context"
-	"errors"
 	"log/slog"
 	"os"
 	"path/filepath"
-	"sort"
-	"strings"
-
-	"golang.org/x/mod/semver"
 
 	"github.com/entireio/cli/cmd/entire/cli/agent"
 	"github.com/entireio/cli/cmd/entire/cli/agent/skilldiscovery"
@@ -21,16 +16,18 @@ import (
 // (nil, nil) when HOME is unreadable or directories are missing — discovery
 // is best-effort.
 //
-// Claude Code exposes three kinds of invocable content per plugin:
-//   - skills:   <plugin>/skills/<name>/SKILL.md   (YAML frontmatter with name + description)
-//   - commands: <plugin>/commands/<name>.md       (YAML frontmatter with description; name = filename)
-//   - agents:   <plugin>/agents/<name>.md         (YAML frontmatter with description; name = filename)
+// Claude Code exposes three kinds of invocable content per plugin, all invoked
+// via the same slash-prefix syntax (`/name`, `/plugin:name`):
+//   - skills:   <plugin>/skills/<name>/SKILL.md   (frontmatter: name + description)
+//   - commands: <plugin>/commands/<name>.md       (frontmatter: description; name = filename)
+//   - agents:   <plugin>/agents/<name>.md         (frontmatter: description; name = filename)
+//
+// All three are walked because any can be a review tool — the pr-review-toolkit
+// plugin, for example, ships its review skills as commands/agents (not skills/).
 //
-// All three are walked because users invoke them via the same slash-prefix
-// syntax (`/plugin:name`) and any of them can be a review tool. The
-// pr-review-toolkit plugin, for example, ships its review skills as
-// commands/agents (not skills/), and was silently missed by a skills-only
-// walker.
+// The generic SKILL.md / markdown scanning, version dedupe, and frontmatter
+// parsing live in the shared skilldiscovery package; this method supplies the
+// Claude-specific roots and slash invocation form.
 //
 //nolint:unparam // error return is part of SkillDiscoverer contract; future implementations may report hard failures
 func (c *ClaudeCodeAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.DiscoveredSkill, error) {
@@ -40,255 +37,22 @@ func (c *ClaudeCodeAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.Dis
 		return nil, nil
 	}
 
+	form := skilldiscovery.SlashForm
 	var found []agent.DiscoveredSkill
-	found = append(found, scanPluginCache(ctx, filepath.Join(home, ".claude", "plugins", "cache"))...)
-	found = append(found, scanUserSkills(ctx, filepath.Join(home, ".claude", "skills"))...)
-	found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "commands"), "")...)
-	found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "agents"), "")...)
-	found = dedupeByInvocation(found)
+	found = append(found, skilldiscovery.ScanPluginCache(ctx, filepath.Join(home, ".claude", "plugins", "cache"),
+		func(versionRoot, pluginName string) []agent.DiscoveredSkill {
+			var out []agent.DiscoveredSkill
+			out = append(out, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName, form)...)
+			out = append(out, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "commands"), pluginName, form)...)
+			out = append(out, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "agents"), pluginName, form)...)
+			return out
+		})...)
+	found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(home, ".claude", "skills"), "", form)...)
+	found = append(found, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "commands"), "", form)...)
+	found = append(found, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "agents"), "", form)...)
+	found = skilldiscovery.DedupeByInvocation(found)
 	if len(found) == 0 {
 		return nil, nil
 	}
 	return found, nil
 }
-
-// dedupeByInvocation collapses entries sharing an invocation name. Plugins
-// can ship a skill and a same-named command wrapper that forwards to it;
-// scan order keeps the skill over its wrapper.
-func dedupeByInvocation(in []agent.DiscoveredSkill) []agent.DiscoveredSkill {
-	if len(in) < 2 {
-		return in
-	}
-	seen := make(map[string]struct{}, len(in))
-	out := make([]agent.DiscoveredSkill, 0, len(in))
-	for _, s := range in {
-		if _, dup := seen[s.Name]; dup {
-			continue
-		}
-		seen[s.Name] = struct{}{}
-		out = append(out, s)
-	}
-	return out
-}
-
-// scanPluginCache walks <root>/<marketplace>/<plugin>/<version>/{skills,commands,agents}/
-// One plugin can contribute through any or all three directories.
-//
-// Multiple version directories per plugin are common after upgrades. Walking
-// every version produces duplicate skills (same invocation name, same
-// description) — confusing in the picker and wasteful in the prompt. We pick
-// a single version per plugin via pickLatestVersion: prefer valid semver
-// (highest), fall back to lexicographic max.
-func scanPluginCache(ctx context.Context, root string) []agent.DiscoveredSkill {
-	entries, err := os.ReadDir(root)
-	if err != nil {
-		logging.Debug(ctx, "claude-code discovery: plugin cache unreadable",
-			slog.String("root", root), slog.String("error", err.Error()))
-		return nil
-	}
-	var found []agent.DiscoveredSkill
-	for _, marketEntry := range entries {
-		if !marketEntry.IsDir() {
-			continue
-		}
-		marketRoot := filepath.Join(root, marketEntry.Name())
-		pluginEntries, err := os.ReadDir(marketRoot)
-		if err != nil {
-			continue
-		}
-		for _, pluginEntry := range pluginEntries {
-			if !pluginEntry.IsDir() {
-				continue
-			}
-			pluginName := pluginEntry.Name()
-			pluginRoot := filepath.Join(marketRoot, pluginName)
-			versionEntries, err := os.ReadDir(pluginRoot)
-			if err != nil {
-				continue
-			}
-			versionDir, ok := pickLatestVersion(versionEntries)
-			if !ok {
-				continue
-			}
-			versionRoot := filepath.Join(pluginRoot, versionDir)
-			found = append(found, readSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName)...)
-			found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "commands"), pluginName)...)
-			found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "agents"), pluginName)...)
-		}
-	}
-	return found
-}
-
-// pickLatestVersion returns the name of the "newest" version directory among
-// entries. Strategy:
-//
-//   - If any entry name parses as semver (with or without a leading "v"), pick
-//     the highest semver among those that parse. Non-semver entries are
-//     ignored when at least one semver entry exists.
-//   - Otherwise, fall back to the lexicographic max of all directory names.
-//     This handles the "unknown" sentinel some plugins ship and one-off names.
-//
-// Returns ("", false) if no usable directory entry exists.
-func pickLatestVersion(entries []os.DirEntry) (string, bool) {
-	var dirs []string
-	for _, e := range entries {
-		if e.IsDir() {
-			dirs = append(dirs, e.Name())
-		}
-	}
-	if len(dirs) == 0 {
-		return "", false
-	}
-	var semverDirs []string
-	for _, d := range dirs {
-		if semver.IsValid(semverWithV(d)) {
-			semverDirs = append(semverDirs, d)
-		}
-	}
-	if len(semverDirs) > 0 {
-		sort.Slice(semverDirs, func(i, j int) bool {
-			return semver.Compare(semverWithV(semverDirs[i]), semverWithV(semverDirs[j])) > 0
-		})
-		return semverDirs[0], true
-	}
-	sort.Sort(sort.Reverse(sort.StringSlice(dirs)))
-	return dirs[0], true
-}
-
-// semverWithV ensures a version string has the "v" prefix that
-// golang.org/x/mod/semver requires. Plugin version dirs are usually bare
-// (e.g. "0.1.0"), but we tolerate either form.
-func semverWithV(s string) string {
-	if strings.HasPrefix(s, "v") {
-		return s
-	}
-	return "v" + s
-}
-
-// scanUserSkills walks ~/.claude/skills/<skill>/SKILL.md.
-func scanUserSkills(ctx context.Context, root string) []agent.DiscoveredSkill {
-	return readSkillsDir(ctx, root, "" /* no plugin prefix */)
-}
-
-// readSkillsDir reads each skill subdirectory's SKILL.md, parses frontmatter,
-// and emits a DiscoveredSkill if Matches() returns true.
-func readSkillsDir(ctx context.Context, dir, pluginName string) []agent.DiscoveredSkill {
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		return nil
-	}
-	var found []agent.DiscoveredSkill
-	for _, skillEntry := range entries {
-		if !skillEntry.IsDir() {
-			continue
-		}
-		skillDir := filepath.Join(dir, skillEntry.Name())
-		skillFile := filepath.Join(skillDir, "SKILL.md")
-		data, err := os.ReadFile(skillFile) //nolint:gosec // G304: skillFile is constructed from a ReadDir walk under HOME, not user input
-		if err != nil {
-			continue
-		}
-		name, description, parseErr := parseSkillFrontmatter(data)
-		if parseErr != nil {
-			logging.Debug(ctx, "claude-code discovery: skipping malformed SKILL.md",
-				slog.String("path", skillFile), slog.String("error", parseErr.Error()))
-			continue
-		}
-		if name == "" {
-			name = skillEntry.Name()
-		}
-		invocation := invocationName(name, pluginName)
-		if !skilldiscovery.Matches(invocation, description) {
-			continue
-		}
-		found = append(found, agent.DiscoveredSkill{
-			Name:        invocation,
-			Description: description,
-			SourcePath:  skillFile,
-		})
-	}
-	return found
-}
-
-// scanFlatMarkdownDir reads *.md files directly under dir (no nesting), parses
-// their YAML frontmatter for `description:`, and derives the invocation name
-// from the filename (stripping the .md suffix). Used for both plugin
-// commands/agents and user-level ~/.claude/commands and ~/.claude/agents.
-//
-// Frontmatter shape differs from SKILL.md — no `name:` field, so the
-// filename is the source of truth for the invocation name.
-func scanFlatMarkdownDir(ctx context.Context, dir, pluginName string) []agent.DiscoveredSkill {
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		return nil
-	}
-	var found []agent.DiscoveredSkill
-	for _, entry := range entries {
-		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".md") {
-			continue
-		}
-		baseName := strings.TrimSuffix(entry.Name(), ".md")
-		if strings.EqualFold(baseName, "README") {
-			continue
-		}
-		filePath := filepath.Join(dir, entry.Name())
-		data, err := os.ReadFile(filePath) //nolint:gosec // G304: filePath is constructed from a ReadDir walk under HOME, not user input
-		if err != nil {
-			continue
-		}
-		_, description, parseErr := parseSkillFrontmatter(data)
-		if parseErr != nil {
-			logging.Debug(ctx, "claude-code discovery: skipping malformed command/agent",
-				slog.String("path", filePath), slog.String("error", parseErr.Error()))
-			continue
-		}
-		invocation := invocationName(baseName, pluginName)
-		if !skilldiscovery.Matches(invocation, description) {
-			continue
-		}
-		found = append(found, agent.DiscoveredSkill{
-			Name:        invocation,
-			Description: description,
-			SourcePath:  filePath,
-		})
-	}
-	return found
-}
-
-// invocationName builds the slash-prefixed invocation form. Plugin-prefixed
-// names use "/plugin:name"; bare names use "/name".
-func invocationName(name, pluginName string) string {
-	if pluginName == "" {
-		return "/" + name
-	}
-	return "/" + pluginName + ":" + name
-}
-
-// parseSkillFrontmatter extracts `name:` and `description:` from a minimal
-// YAML frontmatter block. Purpose-built for the tiny subset of YAML these
-// SKILL.md / command / agent files actually use.
-//
-// Trims surrounding double-quotes from values so `description: "foo bar"`
-// is returned as `foo bar` — the command/agent frontmatter quotes values;
-// SKILL.md files usually don't.
-func parseSkillFrontmatter(data []byte) (name, description string, err error) {
-	s := string(data)
-	if !strings.HasPrefix(s, "---\n") && !strings.HasPrefix(s, "---\r\n") {
-		return "", "", errors.New("no frontmatter delimiter")
-	}
-	body := strings.TrimPrefix(strings.TrimPrefix(s, "---\r\n"), "---\n")
-	end := strings.Index(body, "\n---")
-	if end < 0 {
-		return "", "", errors.New("no closing frontmatter delimiter")
-	}
-	for _, line := range strings.Split(body[:end], "\n") {
-		line = strings.TrimSpace(line)
-		switch {
-		case strings.HasPrefix(line, "name:"):
-			name = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "name:")), `"`)
-		case strings.HasPrefix(line, "description:"):
-			description = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "description:")), `"`)
-		}
-	}
-	return name, description, nil
-}
diff --git a/cmd/entire/cli/agent/claudecode/reviewer.go b/cmd/entire/cli/agent/claudecode/reviewer.go
index a30e01049..816625f3d 100644
--- a/cmd/entire/cli/agent/claudecode/reviewer.go
+++ b/cmd/entire/cli/agent/claudecode/reviewer.go
@@ -53,9 +53,17 @@ func buildReviewCmd(ctx context.Context, cfg reviewtypes.RunConfig) *exec.Cmd {
 // Emits Started first, Finished{Success:...} last (success follows result.is_error).
 // On a scanner error (torn stream), emits RunError then Finished{Success:false}.
 //
-// Tokens are emitted only at the terminal `result` envelope, not
-// incrementally — claude's per-assistant `usage` fields aren't cumulative
-// and summing them across messages would double-count.
+// Live-token semantics: Claude's assistant envelopes carry a usage snapshot
+// taken at the START of each turn — input_tokens/cache_* are populated but
+// output_tokens is essentially zero (a 1–8 token "initial decision" count
+// that does not update as text streams). The true per-turn output is only
+// surfaced on `result` (aggregate across all turns in the run) or on the
+// late `message_delta` event of --include-partial-messages mode.
+//
+// We emit `Tokens{In: <sum>, Out: 0}` on each assistant envelope so the TUI
+// can show context size growing across multi-turn runs, and `Tokens{In, Out}`
+// on `result` with the final aggregate. Sending the misleading early
+// "Out: 1" snapshot was removed after capturing real claude stream-json.
 //
 // Package-private; called directly from this package's tests so they can
 // drive raw stdout fixtures through the parser without going through the
@@ -110,6 +118,20 @@ func parseClaudeOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 						out <- reviewtypes.ToolCall{Name: block.Name, Args: string(block.Input)}
 					}
 				}
+				// The usage snapshot here is captured at the start of the
+				// turn, so output_tokens is essentially zero (1–8 tokens of
+				// initial decision, not the true output count). Surface input
+				// only — TUI consumers render input-only Tokens differently
+				// from finalized {In, Out} pairs. The final tally comes from
+				// the `result` envelope below.
+				if env.Message.Usage.InputTokens > 0 ||
+					env.Message.Usage.CacheReadInputTokens > 0 ||
+					env.Message.Usage.CacheCreationInputTokens > 0 {
+					in := env.Message.Usage.InputTokens +
+						env.Message.Usage.CacheReadInputTokens +
+						env.Message.Usage.CacheCreationInputTokens
+					out <- reviewtypes.Tokens{In: in, Out: 0}
+				}
 			case "result":
 				sawResult = true
 				resultErr = env.IsError
@@ -145,6 +167,12 @@ type claudeEnvelope struct {
 
 type claudeMessage struct {
 	Content []claudeBlock `json:"content"`
+	// Usage on assistant envelopes is the per-turn-START snapshot — input
+	// counts are populated but output_tokens reflects only the model's
+	// initial decision, not the streamed text. Final aggregate usage
+	// arrives on the `result` envelope. Reuses messageUsage (declared in
+	// types.go) to stay aligned with the transcript-parser usage shape.
+	Usage messageUsage `json:"usage"`
 }
 
 type claudeBlock struct {
diff --git a/cmd/entire/cli/agent/claudecode/reviewer_test.go b/cmd/entire/cli/agent/claudecode/reviewer_test.go
index 12b1b8712..e456efb76 100644
--- a/cmd/entire/cli/agent/claudecode/reviewer_test.go
+++ b/cmd/entire/cli/agent/claudecode/reviewer_test.go
@@ -239,6 +239,11 @@ func TestParseClaudeOutput_DecodesStreamJSON(t *testing.T) {
 		t.Error("expected AssistantText carrying fixture prose 'Cats are…'")
 	}
 
+	// The parser emits Tokens on every assistant envelope that carries
+	// non-zero usage plus a terminal Tokens on the result envelope. The
+	// fixture's two assistant envelopes both carry usage, so expect >=2
+	// here (the exact count is fixture-defined and not asserted to keep
+	// the fixture editable).
 	var tokensSeen int
 	var tokensOut int
 	for _, ev := range events {
@@ -247,11 +252,11 @@ func TestParseClaudeOutput_DecodesStreamJSON(t *testing.T) {
 			tokensOut = tk.Out
 		}
 	}
-	if tokensSeen != 1 {
-		t.Errorf("Tokens count = %d, want 1", tokensSeen)
+	if tokensSeen < 2 {
+		t.Errorf("Tokens count = %d, want >=2 (per-assistant deltas + result backstop)", tokensSeen)
 	}
 	if tokensOut == 0 {
-		t.Error("Tokens.Out = 0, want > 0")
+		t.Error("final Tokens.Out = 0, want > 0")
 	}
 }
 
@@ -385,6 +390,72 @@ func TestParseClaudeOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) {
 	}
 }
 
+// TestParseClaudeOutput_EmitsInputOnlyDuringRun captures the live-token
+// contract for Claude: the assistant envelopes' usage block carries a
+// per-turn-start snapshot where output_tokens is essentially zero (1–8
+// tokens of initial decision), so we surface input only on those envelopes
+// and let the terminal `result` envelope carry the true {In, Out} total.
+//
+// Fixture is derived from real `claude -p --output-format stream-json
+// --verbose` output captured against claude-haiku-4-5: six assistant
+// envelopes across three turns, each with output_tokens 1–6, then a final
+// result with output_tokens 2511. If a future Claude build starts streaming
+// real output token counts on assistant envelopes, this test fails and
+// signals that the parser can emit live {In, Out} pairs.
+func TestParseClaudeOutput_EmitsInputOnlyDuringRun(t *testing.T) {
+	t.Parallel()
+	f, err := os.Open("testdata/stream_with_deltas.jsonl")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	var events []reviewtypes.Event
+	for ev := range parseClaudeOutput(f) {
+		events = append(events, ev)
+	}
+
+	var tokens []reviewtypes.Tokens
+	sawFinished := false
+	for _, e := range events {
+		switch ev := e.(type) {
+		case reviewtypes.Tokens:
+			if sawFinished {
+				t.Errorf("Tokens event arrived AFTER Finished — wrong ordering")
+			}
+			tokens = append(tokens, ev)
+		case reviewtypes.Finished:
+			sawFinished = true
+		}
+	}
+	if len(tokens) < 2 {
+		t.Fatalf("expected >=2 Tokens events (assistant snapshots + final), got %d", len(tokens))
+	}
+
+	// All Tokens BEFORE the final one are assistant-envelope snapshots:
+	// input is populated; Out must be exactly 0 to avoid showing the
+	// misleading per-turn-start initial-decision count in the TUI.
+	for i := range len(tokens) - 1 {
+		tk := tokens[i]
+		if tk.Out != 0 {
+			t.Errorf("tokens[%d].Out = %d, want 0 (assistant envelope snapshot — output not yet known)", i, tk.Out)
+		}
+		if tk.In == 0 {
+			t.Errorf("tokens[%d].In = 0, want >0 (assistant envelope carries input)", i)
+		}
+	}
+
+	// The terminal Tokens event is from `result` and carries the true
+	// aggregate output count (2511 in the fixture).
+	last := tokens[len(tokens)-1]
+	if last.Out == 0 {
+		t.Error("final Tokens.Out = 0, want non-zero (result envelope final tally)")
+	}
+	if last.In == 0 {
+		t.Error("final Tokens.In = 0, want non-zero")
+	}
+}
+
 // collectEvents drains an event channel into a slice.
 func collectEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event {
 	var events []reviewtypes.Event
diff --git a/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl b/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl
new file mode 100644
index 000000000..837fe4aa8
--- /dev/null
+++ b/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl
@@ -0,0 +1,7 @@
+{"type":"system","subtype":"init","cwd":"/redacted/worktree","session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","model":"claude-haiku-4-5","permissionMode":"plan","output_style":"default","apiKeySource":"none","uuid":"redacted-uuid-1"}
+{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"Analyzing the request..."}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-2"}
+{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"text","text":"I'll outline a plan first."}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-3"}
+{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01","name":"Write","input":{"file_path":"plan.md","content":"plan body"}}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-4"}
+{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn2","type":"message","role":"assistant","content":[{"type":"text","text":"Plan created, ready to proceed."}],"stop_reason":null,"usage":{"input_tokens":5,"cache_creation_input_tokens":10066,"cache_read_input_tokens":46555,"output_tokens":1,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-5"}
+{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn3","type":"message","role":"assistant","content":[{"type":"text","text":"Found 3 issues."}],"stop_reason":null,"usage":{"input_tokens":6,"cache_creation_input_tokens":107,"cache_read_input_tokens":56621,"output_tokens":2,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-6"}
+{"type":"result","subtype":"success","is_error":false,"duration_ms":29272,"num_turns":3,"result":"Found 3 issues.","stop_reason":"end_turn","session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","total_cost_usd":0.105,"usage":{"input_tokens":21,"cache_creation_input_tokens":66440,"cache_read_input_tokens":103176,"output_tokens":2511,"service_tier":"standard"},"uuid":"redacted-uuid-7"}
diff --git a/cmd/entire/cli/agent/codex/AGENT.md b/cmd/entire/cli/agent/codex/AGENT.md
index 0480d6592..dd4dbe687 100644
--- a/cmd/entire/cli/agent/codex/AGENT.md
+++ b/cmd/entire/cli/agent/codex/AGENT.md
@@ -204,3 +204,11 @@ The `systemMessage` field can be used to display messages to the user via the ag
 
 - JSON schemas at `codex-rs/hooks/schema/generated/` in the Codex repository
 - Hook config structure at `codex-rs/hooks/src/engine/config.rs` in the Codex repository
+
+## Review integration (`entire review`)
+
+Codex review runs via `codex exec --skip-git-repo-check --json [-m <model>] [-c model_reasoning_effort=<level>] -` (prompt on stdin). **`codex exec` fires no lifecycle hooks**, which shapes the whole integration (see CLAUDE.md → `entire review` → "Codex specifics"):
+
+- **Skills are passed verbatim, not paraphrased.** Codex injects its installed-skill catalog into every exec session and loads the matching `SKILL.md`; configured skills use codex's `$name` / `$plugin:name` form (`DiscoverReviewSkills` in `discovery.go`). Native `codex exec review` is not used — it rejects a prompt under a scope flag and can't carry Entire's scope/per-run/checkpoint context.
+- **Live tokens come from the rollout file, not stdout.** `codex exec --json` carries `usage` only on the terminal `turn.completed`, and a review is a single turn. `review_tokens.go` resolves the rollout transcript by `thread_id` (from the `thread.started` envelope), tails it (the same `~/.codex/.../rollout-*-<thread-id>.jsonl` documented under Transcript above), and emits cumulative `Tokens` per `token_count` event — the source codex's interactive UI reads.
+- **No tagged review session.** Because no hook fires, codex's session is never tagged `KindAgentReview`. The fix manifest therefore sources codex from its **live run output** (`run.Buffer`), and `entire review fix` skill verification is advisory for codex (loose description match), not a hard block.
diff --git a/cmd/entire/cli/agent/codex/discovery.go b/cmd/entire/cli/agent/codex/discovery.go
index b16981e27..f524fbd4f 100644
--- a/cmd/entire/cli/agent/codex/discovery.go
+++ b/cmd/entire/cli/agent/codex/discovery.go
@@ -2,14 +2,51 @@ package codex
 
 import (
 	"context"
+	"log/slog"
+	"os"
+	"path/filepath"
 
 	"github.com/entireio/cli/cmd/entire/cli/agent"
+	"github.com/entireio/cli/cmd/entire/cli/agent/skilldiscovery"
+	"github.com/entireio/cli/cmd/entire/cli/logging"
 )
 
-// DiscoverReviewSkills is a stub until the Codex on-disk plugin layout is
-// verified against codex-rs source (see codex-rs/tui/src/slash_command.rs).
-// Returns (nil, nil) so the picker treats Codex as "built-ins + install
-// hint only" for Phase 1.
-func (c *CodexAgent) DiscoverReviewSkills(_ context.Context) ([]agent.DiscoveredSkill, error) {
-	return nil, nil
+// DiscoverReviewSkills walks codex's on-disk skill layout looking for
+// review-adjacent skills. Returns (nil, nil) when HOME is unreadable or the
+// directories are missing — discovery is best-effort.
+//
+// Codex exposes skills as <root>/<name>/SKILL.md (same frontmatter shape as
+// Claude). Three roots contribute, mirroring codex's own injected skills
+// catalog:
+//   - ~/.codex/skills/<name>/                          → user skills ($name)
+//   - ~/.codex/plugins/cache/<m>/<p>/<v>/skills/<name>/ → plugin skills ($p:name)
+//   - ~/.codex/superpowers/skills/<name>/              → superpowers ($superpowers:name)
+//
+// Skills are emitted in codex's dollar invocation form ($name / $plugin:name) —
+// the literal token a user types to invoke the skill in the codex CLI — so the
+// review prompt names skills exactly the way codex's skill system expects,
+// loading the real SKILL.md rather than relying on a loose description match.
+//
+//nolint:unparam // error return is part of SkillDiscoverer contract; future implementations may report hard failures
+func (c *CodexAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.DiscoveredSkill, error) {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		logging.Debug(ctx, "codex discovery: UserHomeDir failed", slog.String("error", err.Error()))
+		return nil, nil
+	}
+	codexHome := filepath.Join(home, ".codex")
+
+	form := skilldiscovery.DollarForm
+	var found []agent.DiscoveredSkill
+	found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(codexHome, "skills"), "", form)...)
+	found = append(found, skilldiscovery.ScanPluginCache(ctx, filepath.Join(codexHome, "plugins", "cache"),
+		func(versionRoot, pluginName string) []agent.DiscoveredSkill {
+			return skilldiscovery.ScanSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName, form)
+		})...)
+	found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(codexHome, "superpowers", "skills"), "superpowers", form)...)
+	found = skilldiscovery.DedupeByInvocation(found)
+	if len(found) == 0 {
+		return nil, nil
+	}
+	return found, nil
 }
diff --git a/cmd/entire/cli/agent/codex/discovery_test.go b/cmd/entire/cli/agent/codex/discovery_test.go
index 4b8d39633..f9e2e3d03 100644
--- a/cmd/entire/cli/agent/codex/discovery_test.go
+++ b/cmd/entire/cli/agent/codex/discovery_test.go
@@ -2,6 +2,8 @@ package codex_test
 
 import (
 	"context"
+	"os"
+	"path/filepath"
 	"testing"
 
 	"github.com/entireio/cli/cmd/entire/cli/agent"
@@ -11,14 +13,102 @@ import (
 // Compile-time pin: CodexAgent must satisfy SkillDiscoverer.
 var _ agent.SkillDiscoverer = (*codex.CodexAgent)(nil)
 
-func TestCodexAgent_DiscoverReviewSkills_Stub(t *testing.T) {
-	t.Parallel()
-	a := &codex.CodexAgent{}
-	skills, err := a.DiscoverReviewSkills(context.Background())
+// withFakeHome points HOME at a temp dir so discovery walks an empty,
+// controlled ~/.codex tree. Uses t.Setenv, so callers must NOT t.Parallel.
+func withFakeHome(t *testing.T) string {
+	t.Helper()
+	home := t.TempDir()
+	t.Setenv("HOME", home)
+	return home
+}
+
+// writeSkill creates <root>/<name>/SKILL.md with the given frontmatter name
+// and description.
+func writeSkill(t *testing.T, root, dir, name, description string) {
+	t.Helper()
+	skillDir := filepath.Join(root, dir)
+	if err := os.MkdirAll(skillDir, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	content := "---\nname: " + name + "\ndescription: " + description + "\n---\n\nbody\n"
+	if err := os.WriteFile(filepath.Join(skillDir, "SKILL.md"), []byte(content), 0o644); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func discover(t *testing.T) []agent.DiscoveredSkill {
+	t.Helper()
+	skills, err := (&codex.CodexAgent{}).DiscoverReviewSkills(context.Background())
 	if err != nil {
-		t.Fatalf("stub should not error; got %v", err)
+		t.Fatalf("unexpected error: %v", err)
+	}
+	return skills
+}
+
+func nameOf(skills []agent.DiscoveredSkill, want string) bool {
+	for _, s := range skills {
+		if s.Name == want {
+			return true
+		}
+	}
+	return false
+}
+
+func TestCodexAgent_DiscoverReviewSkills_NoSkillsReturnsNilNil(t *testing.T) {
+	// Cannot t.Parallel — uses t.Setenv.
+	withFakeHome(t)
+	if skills := discover(t); skills != nil {
+		t.Errorf("skills = %v, want nil", skills)
+	}
+}
+
+func TestCodexAgent_DiscoverReviewSkills_FindsUserSkillInDollarForm(t *testing.T) {
+	home := withFakeHome(t)
+	writeSkill(t, filepath.Join(home, ".codex", "skills"), "code-reviewer", "code-reviewer",
+		"Review code changes with an emphasis on correctness.")
+
+	skills := discover(t)
+	if len(skills) != 1 {
+		t.Fatalf("skills count = %d, want 1: %+v", len(skills), skills)
+	}
+	if skills[0].Name != "$code-reviewer" {
+		t.Errorf("Name = %q, want $code-reviewer", skills[0].Name)
+	}
+}
+
+func TestCodexAgent_DiscoverReviewSkills_FindsPluginSkillNamespaced(t *testing.T) {
+	home := withFakeHome(t)
+	// Opaque (non-semver) version dir, like codex's content-hash versions.
+	writeSkill(t,
+		filepath.Join(home, ".codex", "plugins", "cache", "openai-curated", "github", "fef63ecf", "skills"),
+		"gh-review", "gh-review", "Review a GitHub pull request.")
+
+	skills := discover(t)
+	if !nameOf(skills, "$github:gh-review") {
+		t.Errorf("missing $github:gh-review; got %+v", skills)
 	}
-	if skills != nil {
-		t.Errorf("stub should return nil skills; got %+v", skills)
+}
+
+func TestCodexAgent_DiscoverReviewSkills_FindsSuperpowersSkill(t *testing.T) {
+	home := withFakeHome(t)
+	writeSkill(t, filepath.Join(home, ".codex", "superpowers", "skills"),
+		"receiving-code-review", "receiving-code-review", "Receive code review feedback.")
+
+	skills := discover(t)
+	if !nameOf(skills, "$superpowers:receiving-code-review") {
+		t.Errorf("missing $superpowers:receiving-code-review; got %+v", skills)
+	}
+}
+
+func TestCodexAgent_DiscoverReviewSkills_SkipsNonReviewSkill(t *testing.T) {
+	home := withFakeHome(t)
+	skillsRoot := filepath.Join(home, ".codex", "skills")
+	writeSkill(t, skillsRoot, "code-reviewer", "code-reviewer", "Review code changes.")
+	// "committer" has no review keyword in its name → filtered by Matches.
+	writeSkill(t, skillsRoot, "committer", "committer", "Prepare clear commit messages.")
+
+	skills := discover(t)
+	if len(skills) != 1 || skills[0].Name != "$code-reviewer" {
+		t.Errorf("want only $code-reviewer; got %+v", skills)
 	}
 }
diff --git a/cmd/entire/cli/agent/codex/review_tokens.go b/cmd/entire/cli/agent/codex/review_tokens.go
new file mode 100644
index 000000000..c41208b0f
--- /dev/null
+++ b/cmd/entire/cli/agent/codex/review_tokens.go
@@ -0,0 +1,134 @@
+package codex
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"log/slog"
+	"os"
+	"time"
+
+	"github.com/entireio/cli/cmd/entire/cli/logging"
+	reviewtypes "github.com/entireio/cli/cmd/entire/cli/review/types"
+)
+
+// Polling/tailing cadence for the rollout token tailer.
+const (
+	rolloutPollInterval = 300 * time.Millisecond
+	rolloutPollAttempts = 100 // ~30s for codex to create the rollout file
+	rolloutTailInterval = 400 * time.Millisecond
+	rolloutReadChunk    = 8192
+)
+
+// tailRolloutTokens resolves the codex rollout transcript for threadID and
+// tails it, emitting a cumulative reviewtypes.Tokens event for every
+// token_count codex writes (~once per model turn). codex's `exec --json`
+// stdout only carries usage on the terminal turn.completed envelope, and a
+// review is usually a single turn — so without this the TUI shows no token
+// movement until the run ends. The rollout file is the same source codex's
+// interactive UI reads for its live token counter.
+//
+// token_count.total_token_usage is a running total, so each emission is an
+// absolute count — matching the TUI's overwrite-not-sum semantics. Duplicate
+// totals are suppressed so we only emit on real movement.
+//
+// Returns when stop is closed (the stdout stream ended) or the rollout file
+// never appears. The caller must wait for this to return before closing the
+// event channel (see parseCodexOutputBuf), so a send here can never race a
+// channel close.
+func tailRolloutTokens(threadID string, out chan<- reviewtypes.Event, stop <-chan struct{}) {
+	ctx := context.Background()
+	sessionDir, err := (&CodexAgent{}).GetSessionDir("")
+	if err != nil {
+		logging.Debug(ctx, "codex token tail: session dir unresolved", slog.String("error", err.Error()))
+		return
+	}
+	path := waitForRollout(sessionDir, threadID, stop)
+	if path == "" {
+		return
+	}
+	f, err := os.Open(path) //nolint:gosec // path is a glob match under codex's session dir, not user input
+	if err != nil {
+		logging.Debug(ctx, "codex token tail: open rollout failed", slog.String("error", err.Error()))
+		return
+	}
+	defer f.Close()
+
+	// Tail via os.File.Read rather than bufio.Reader: bufio is sticky on EOF
+	// and would never observe lines codex appends after we first catch up.
+	var pending []byte
+	chunk := make([]byte, rolloutReadChunk)
+	lastIn, lastOut := -1, -1
+	ticker := time.NewTicker(rolloutTailInterval)
+	defer ticker.Stop()
+	for {
+		for {
+			n, readErr := f.Read(chunk)
+			if n > 0 {
+				pending = append(pending, chunk[:n]...)
+				for {
+					idx := bytes.IndexByte(pending, '\n')
+					if idx < 0 {
+						break
+					}
+					line := pending[:idx]
+					pending = pending[idx+1:]
+					in, outTok, ok := parseRolloutTokenCount(line)
+					if !ok || (in == lastIn && outTok == lastOut) {
+						continue
+					}
+					lastIn, lastOut = in, outTok
+					select {
+					case out <- reviewtypes.Tokens{In: in, Out: outTok}:
+					case <-stop:
+						return
+					}
+				}
+			}
+			if readErr != nil {
+				break // EOF or error — wait for the file to grow, then retry
+			}
+		}
+		select {
+		case <-stop:
+			return
+		case <-ticker.C:
+		}
+	}
+}
+
+// waitForRollout polls for the rollout file matching threadID, returning its
+// path or "" if stop fires or the attempts are exhausted.
+func waitForRollout(sessionDir, threadID string, stop <-chan struct{}) string {
+	for range rolloutPollAttempts {
+		if path := findRolloutBySessionID(sessionDir, threadID); path != "" {
+			return path
+		}
+		select {
+		case <-stop:
+			return ""
+		case <-time.After(rolloutPollInterval):
+		}
+	}
+	return ""
+}
+
+// parseRolloutTokenCount extracts cumulative input/output token totals from one
+// rollout JSONL line. ok is false for any line that isn't a token_count event
+// carrying total_token_usage. Reuses the rolloutLine/eventMsgPayload/
+// tokenCountInfo shapes from transcript.go so the two readers can't drift.
+func parseRolloutTokenCount(data []byte) (in, out int, ok bool) {
+	var line rolloutLine
+	if json.Unmarshal(data, &line) != nil || line.Type != "event_msg" {
+		return 0, 0, false
+	}
+	var evt eventMsgPayload
+	if json.Unmarshal(line.Payload, &evt) != nil || evt.Type != "token_count" || len(evt.Info) == 0 {
+		return 0, 0, false
+	}
+	var info tokenCountInfo
+	if json.Unmarshal(evt.Info, &info) != nil || info.TotalTokenUsage == nil {
+		return 0, 0, false
+	}
+	return info.TotalTokenUsage.InputTokens, info.TotalTokenUsage.OutputTokens, true
+}
diff --git a/cmd/entire/cli/agent/codex/review_tokens_test.go b/cmd/entire/cli/agent/codex/review_tokens_test.go
new file mode 100644
index 000000000..4092843bc
--- /dev/null
+++ b/cmd/entire/cli/agent/codex/review_tokens_test.go
@@ -0,0 +1,128 @@
+package codex
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	reviewtypes "github.com/entireio/cli/cmd/entire/cli/review/types"
+)
+
+const tailTestThreadID = "019e8d8f-9d70-7021-b8fe-2c13802e3443"
+
+func tokenLine(in, out int) string {
+	return `{"type":"event_msg","payload":{"type":"token_count","info":{"total_token_usage":` +
+		`{"input_tokens":` + itoa(in) + `,"output_tokens":` + itoa(out) + `}}}}` + "\n"
+}
+
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	var b []byte
+	for n > 0 {
+		b = append([]byte{byte('0' + n%10)}, b...)
+		n /= 10
+	}
+	return string(b)
+}
+
+func TestParseRolloutTokenCount(t *testing.T) {
+	t.Parallel()
+	in, out, ok := parseRolloutTokenCount([]byte(tokenLine(25338, 595)))
+	if !ok || in != 25338 || out != 595 {
+		t.Fatalf("token_count line: got in=%d out=%d ok=%v, want 25338/595/true", in, out, ok)
+	}
+	// Non-token_count lines are ignored.
+	for _, line := range []string{
+		`{"type":"response_item","payload":{"type":"reasoning"}}`,
+		`{"type":"event_msg","payload":{"type":"agent_message"}}`,
+		`not json`,
+		``,
+	} {
+		if _, _, ok := parseRolloutTokenCount([]byte(line)); ok {
+			t.Errorf("expected ok=false for %q", line)
+		}
+	}
+}
+
+// TestTailRolloutTokens_TailsAppendedLines is the core behavior: the tailer
+// must emit Tokens for token_count lines that codex appends *after* the tailer
+// has already caught up to EOF (a plain bufio.Reader would miss these).
+func TestTailRolloutTokens_TailsAppendedLines(t *testing.T) {
+	// Cannot t.Parallel — uses t.Setenv.
+	dir := t.TempDir()
+	t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", dir)
+
+	rollout := filepath.Join(dir, "rollout-2026-06-03T08-57-39-"+tailTestThreadID+".jsonl")
+	if err := os.WriteFile(rollout, []byte(tokenLine(25338, 595)), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	out := make(chan reviewtypes.Event, 16)
+	stop := make(chan struct{})
+	done := make(chan struct{})
+	go func() {
+		tailRolloutTokens(tailTestThreadID, out, stop)
+		close(done)
+	}()
+	defer func() {
+		close(stop)
+		<-done
+	}()
+
+	first := awaitTokens(t, out)
+	if first.In != 25338 || first.Out != 595 {
+		t.Fatalf("first tokens = %+v, want {25338, 595}", first)
+	}
+
+	// Append a second token_count after the tailer caught up — it must see it.
+	f, err := os.OpenFile(rollout, os.O_APPEND|os.O_WRONLY, 0o644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := f.WriteString(tokenLine(52798, 1123)); err != nil {
+		t.Fatal(err)
+	}
+	_ = f.Close()
+
+	second := awaitTokens(t, out)
+	if second.In != 52798 || second.Out != 1123 {
+		t.Fatalf("second tokens = %+v, want {52798, 1123} (appended line not tailed)", second)
+	}
+}
+
+// awaitTokens waits for the next Tokens event or fails on timeout.
+func awaitTokens(t *testing.T, out <-chan reviewtypes.Event) reviewtypes.Tokens {
+	t.Helper()
+	timeout := time.After(5 * time.Second)
+	for {
+		select {
+		case ev := <-out:
+			if tk, ok := ev.(reviewtypes.Tokens); ok {
+				return tk
+			}
+		case <-timeout:
+			t.Fatal("timed out waiting for a Tokens event")
+		}
+	}
+}
+
+func TestTailRolloutTokens_ReturnsOnStopWhenNoRollout(t *testing.T) {
+	// Cannot t.Parallel — uses t.Setenv.
+	t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", t.TempDir())
+	out := make(chan reviewtypes.Event, 4)
+	stop := make(chan struct{})
+	done := make(chan struct{})
+	go func() {
+		tailRolloutTokens(tailTestThreadID, out, stop)
+		close(done)
+	}()
+	close(stop)
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Fatal("tailRolloutTokens did not return promptly after stop with no rollout file")
+	}
+}
diff --git a/cmd/entire/cli/agent/codex/reviewer.go b/cmd/entire/cli/agent/codex/reviewer.go
index 0567056ef..f05937f9c 100644
--- a/cmd/entire/cli/agent/codex/reviewer.go
+++ b/cmd/entire/cli/agent/codex/reviewer.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"strings"
+	"sync"
 
 	"github.com/entireio/cli/cmd/entire/cli/logging"
 	"github.com/entireio/cli/cmd/entire/cli/review"
@@ -18,7 +19,8 @@ import (
 
 // NewReviewer returns the AgentReviewer for codex.
 //
-// Argv shape: codex exec --skip-git-repo-check --json -.
+// Argv shape: codex exec --skip-git-repo-check --json [-m <model>]
+// [-c model_reasoning_effort=<level>] -.
 // Prompt is piped via stdin (the trailing "-" tells codex to read from stdin).
 // Stdout is newline-delimited JSON envelopes (one event per line); no chrome
 // filter needed — each line is parsed directly into an Event.
@@ -32,37 +34,44 @@ func NewReviewer() *reviewtypes.ReviewerTemplate {
 
 // buildCodexReviewCmd builds the exec.Cmd for a codex review run.
 // Exposed at package level for test inspection of argv, stdin, and env.
+//
+// Configured skills are passed through verbatim — NOT paraphrased. Codex's
+// skill system injects a catalog of installed skills into every exec session
+// and loads the matching SKILL.md when the prompt names a skill (codex's
+// `$Skill` form, or plain-text/description match), so the agent runs codex's
+// real review workflow rather than a generic restatement of it. An older
+// version replaced `/review` with 28 words of generic instruction, which
+// obscured the skill signal and degraded review quality.
+//
+// Native `codex exec review` is intentionally NOT used: it rejects a prompt
+// when a scope flag is set and codex hooks don't fire during exec, leaving no
+// channel for Entire's scope clause, per-run prompt, and checkpoint context.
+// Plain `codex exec -` with the composed prompt on stdin runs the same skill
+// while carrying our customization.
+//
+// Per-spawn overrides from RunConfig.{Model, ReasoningEffort} translate to
+// codex CLI flags `-m <model>` and `-c model_reasoning_effort=<level>`,
+// inserted before the trailing `-` stdin marker. Empty values are skipped —
+// codex falls back to whatever the user's ~/.codex/config.toml configures.
 func buildCodexReviewCmd(ctx context.Context, cfg reviewtypes.RunConfig) *exec.Cmd {
-	promptCfg := cfg
-	promptCfg.Skills = expandCodexBuiltinReview(cfg.Skills)
-	args := []string{codexExecCommand, "--skip-git-repo-check", "--json", "-"}
-	prompt := review.ComposeReviewPrompt(promptCfg)
+	args := []string{codexExecCommand, "--skip-git-repo-check", "--json"}
+	if cfg.Model != "" {
+		args = append(args, "-m", cfg.Model)
+	}
+	if cfg.ReasoningEffort != "" {
+		args = append(args, "-c", "model_reasoning_effort="+cfg.ReasoningEffort)
+	}
+	args = append(args, "-")
+
+	prompt := review.ComposeReviewPrompt(cfg)
 	cmd := exec.CommandContext(ctx, "codex", args...)
 	cmd.Stdin = strings.NewReader(prompt)
 	cmd.Env = review.AppendReviewEnv(os.Environ(), "codex", cfg, prompt)
 	return cmd
 }
 
-// Codex's native `exec review --base <branch>` rejects an additional prompt,
-// so expand `/review` into text and run normal `codex exec -`. That preserves
-// Entire's scoped base clause, per-run instructions, and checkpoint context.
-const codexBuiltinReviewPrompt = "Review the current branch changes and report actionable findings. " +
-	"Prioritize correctness, regressions, security, and missing test coverage. Do not make code changes."
-
 const codexExecCommand = "exec"
 
-func expandCodexBuiltinReview(skills []string) []string {
-	out := make([]string, 0, len(skills))
-	for _, skill := range skills {
-		if skill == "/review" {
-			out = append(out, codexBuiltinReviewPrompt)
-			continue
-		}
-		out = append(out, skill)
-	}
-	return out
-}
-
 // parseCodexOutput converts codex's `exec --json` stdout into a stream of
 // Events. Each stdout line is one JSON envelope (top-level "type" field).
 //
@@ -79,8 +88,16 @@ func expandCodexBuiltinReview(skills []string) []string {
 // On a scanner error or a missing turn.completed envelope, emits RunError
 // (scanner) or Finished{Success: false} (missing turn) accordingly.
 //
-// Tokens are emitted only at the terminal `turn.completed` envelope, not
-// incrementally — codex's usage fields land once at end-of-turn.
+// Live-token semantics: codex's `--json` output carries `usage` ONLY on
+// `turn.completed` envelopes. Verified against codex-cli 0.130.0 stdout
+// for both short and long prompts — no intermediate envelope
+// (item.started, item.completed, etc.) carries a usage block. Codex's
+// on-disk session log (the `event_msg{type:"token_count"}` shape the
+// transcript parser consumes) is a separate format, not surfaced
+// through `exec --json`.
+//
+// Tokens are emitted at every `turn.completed` envelope so multi-turn
+// runs show iterative updates in the TUI.
 //
 // Package-private; called directly from this package's tests so they can
 // drive raw stdout fixtures through the parser without going through the
@@ -104,10 +121,20 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 	out := make(chan reviewtypes.Event, 32)
 	go func() {
 		defer close(out)
+		// The rollout token tailer (started on thread.started) runs concurrently
+		// and also sends on out. Stop it and wait for it to exit BEFORE close(out)
+		// so its send can never hit a closed channel. defer is LIFO, so this runs
+		// before the close(out) registered above.
+		stop := make(chan struct{})
+		var tailWG sync.WaitGroup
+		defer func() {
+			close(stop)
+			tailWG.Wait()
+		}()
 		out <- reviewtypes.Started{}
 		scanner := bufio.NewScanner(r)
 		scanner.Buffer(make([]byte, min(1024*1024, maxBuf)), maxBuf)
-		var seenTurnComplete bool
+		var seenTurnComplete, emittedTokens, tailerStarted bool
 		var turnUsage codexUsage
 		for scanner.Scan() {
 			line := scanner.Bytes()
@@ -123,8 +150,21 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 			// default arm logs unknown types at Debug so drift can be
 			// triaged via ENTIRE_LOG_LEVEL=debug.
 			switch env.Type {
-			case "thread.started", "turn.started":
-				// Session/turn markers — no event emitted.
+			case "thread.started":
+				// Launch the rollout token tailer once. codex's exec --json stdout
+				// only carries usage on the terminal turn.completed, so we tail the
+				// rollout file (located by thread_id) for live per-turn token totals
+				// — the same source codex's interactive UI reads.
+				if !tailerStarted && env.ThreadID != "" {
+					tailerStarted = true
+					tailWG.Add(1)
+					go func(id string) {
+						defer tailWG.Done()
+						tailRolloutTokens(id, out, stop)
+					}(env.ThreadID)
+				}
+			case "turn.started":
+				// Turn marker — no event emitted.
 			case "item.started":
 				if env.Item.Type == "command_execution" {
 					out <- reviewtypes.ToolCall{Name: "exec", Args: env.Item.Command}
@@ -136,9 +176,26 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 				// command_execution completion is intentionally swallowed —
 				// item.started already announced it. aggregated_output is the
 				// tool's stdout, not the model's narrative.
+				//
+				// No Tokens emission here: codex --json only carries usage on
+				// `turn.completed`. An earlier version of this parser also
+				// emitted Tokens here in case a future codex build attached
+				// a usage block, but that branch was unreachable against real
+				// codex output and is gone — emit on turn.completed only.
 			case "turn.completed":
 				seenTurnComplete = true
 				turnUsage = env.Usage
+				// Emit Tokens at every turn boundary so multi-turn reviews
+				// show iterative updates in the TUI. The post-loop emission
+				// is kept as a backstop for streams that ended without a
+				// turn.completed (defensive — shouldn't happen in practice).
+				if env.Usage.InputTokens > 0 || env.Usage.OutputTokens > 0 {
+					out <- reviewtypes.Tokens{
+						In:  env.Usage.InputTokens,
+						Out: env.Usage.OutputTokens,
+					}
+					emittedTokens = true
+				}
 			default:
 				logging.Debug(context.Background(), "codex parser: unknown envelope type",
 					slog.String("type", env.Type))
@@ -150,13 +207,22 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 			return
 		}
 		if seenTurnComplete {
+			// Defensive backstop: the per-turn emission inside the
+			// `case "turn.completed"` arm above is the primary path and
+			// already emitted Tokens for every turn with non-zero usage.
+			// This fires only if no per-turn emission happened (e.g., a
+			// terminal turn.completed envelope arrived without a usage
+			// block — defensive against codex output drift).
+			//
 			// codex reports cached_input_tokens as a subset of input_tokens
 			// and reasoning_output_tokens as a subset of output_tokens
 			// (matching OpenAI's chat-completions usage shape), so do NOT
 			// sum the subset fields — that would double-count.
-			out <- reviewtypes.Tokens{
-				In:  turnUsage.InputTokens,
-				Out: turnUsage.OutputTokens,
+			if !emittedTokens {
+				out <- reviewtypes.Tokens{
+					In:  turnUsage.InputTokens,
+					Out: turnUsage.OutputTokens,
+				}
 			}
 			// Success is hard-coded true here because codex's `turn.completed`
 			// envelope has no turn-level error field in 0.130.0. If a future
@@ -173,9 +239,10 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 }
 
 type codexEnvelope struct {
-	Type  string     `json:"type"`
-	Item  codexItem  `json:"item"`
-	Usage codexUsage `json:"usage"`
+	Type     string     `json:"type"`
+	ThreadID string     `json:"thread_id"` // present on thread.started; locates the rollout file
+	Item     codexItem  `json:"item"`
+	Usage    codexUsage `json:"usage"`
 }
 
 type codexItem struct {
diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go
index 6a7290c65..40399f559 100644
--- a/cmd/entire/cli/agent/codex/reviewer_test.go
+++ b/cmd/entire/cli/agent/codex/reviewer_test.go
@@ -100,7 +100,7 @@ func TestCodexReviewer_ArgvShape(t *testing.T) {
 	}
 }
 
-func TestCodexReviewer_BuiltinReviewExpandsToScopedExecPrompt(t *testing.T) {
+func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) {
 	t.Parallel()
 	cfg := reviewtypes.RunConfig{
 		Skills:            []string{"/review"},
@@ -121,18 +121,49 @@ func TestCodexReviewer_BuiltinReviewExpandsToScopedExecPrompt(t *testing.T) {
 	}
 
 	prompt := readCodexCmdStdin(t, cmd)
-	if strings.Contains(prompt, "/review") {
-		t.Fatalf("builtin review prompt should not include raw /review:\n%s", prompt)
+	// The configured skill must reach codex verbatim so codex's skill system
+	// loads the real SKILL.md — not a generic paraphrase.
+	if !strings.Contains(prompt, "/review") {
+		t.Fatalf("codex prompt should pass /review through verbatim:\n%s", prompt)
+	}
+	if strings.Contains(prompt, "Review the current branch changes and report actionable findings.") {
+		t.Fatalf("codex prompt must not paraphrase /review:\n%s", prompt)
 	}
 	for _, wantText := range []string{
-		"Review the current branch changes and report actionable findings.",
 		"Focus on auth regressions.",
 		"Scope: review the commits unique to this branch vs main, plus any uncommitted changes in the working tree. Ignore code outside this scope.",
 		"Commits in scope (newest first):",
 		"abc123 summary",
 	} {
 		if !strings.Contains(prompt, wantText) {
-			t.Fatalf("builtin review prompt missing %q:\n%s", wantText, prompt)
+			t.Fatalf("codex prompt missing %q:\n%s", wantText, prompt)
+		}
+	}
+}
+
+func TestCodexReviewer_ModelAndReasoningEffortFlags(t *testing.T) {
+	t.Parallel()
+	cfg := reviewtypes.RunConfig{
+		Skills:          []string{"/review"},
+		Model:           "gpt-5-mini",
+		ReasoningEffort: "low",
+	}
+	cmd := buildCodexReviewCmd(context.Background(), cfg)
+
+	// -m and -c are inserted before the trailing stdin marker; empty
+	// overrides are skipped (covered by the other argv tests).
+	want := []string{
+		wantCodexAgentName, "exec", "--skip-git-repo-check", "--json",
+		"-m", "gpt-5-mini",
+		"-c", "model_reasoning_effort=low",
+		"-",
+	}
+	if len(cmd.Args) != len(want) {
+		t.Fatalf("len(Args) = %d, want %d: %v", len(cmd.Args), len(want), cmd.Args)
+	}
+	for i, w := range want {
+		if cmd.Args[i] != w {
+			t.Errorf("Args[%d] = %q, want %q", i, cmd.Args[i], w)
 		}
 	}
 }
@@ -388,6 +419,52 @@ func TestParseCodexOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) {
 	}
 }
 
+// TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted locks the live-token
+// contract for codex: its --json output carries `usage` on every
+// `turn.completed` envelope, and the parser emits Tokens at each turn
+// boundary so multi-turn reviews show iterative updates in the TUI.
+// Captured by running real codex-cli 0.130.0 — no item.* envelope ever
+// carried a usage field, so emission stays anchored to turn.completed.
+func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) {
+	t.Parallel()
+	// Real codex --json output (lightly trimmed) simulating a multi-turn
+	// review: model call → tool exec → model call → tool exec, with a
+	// turn.completed envelope at every turn boundary carrying running
+	// usage. The parser should emit Tokens for each turn in order.
+	input := strings.Join([]string{
+		`{"type":"thread.started","thread_id":"tid-1"}`,
+		`{"type":"turn.started"}`,
+		`{"type":"item.started","item":{"id":"item_0","type":"command_execution","command":"ls","aggregated_output":"","exit_code":null,"status":"in_progress"}}`,
+		`{"type":"item.completed","item":{"id":"item_0","type":"command_execution","command":"ls","aggregated_output":"a\nb\nc","exit_code":0,"status":"completed"}}`,
+		`{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"Found three files."}}`,
+		`{"type":"turn.completed","usage":{"input_tokens":34317,"cached_input_tokens":19712,"output_tokens":240,"reasoning_output_tokens":114}}`,
+		`{"type":"turn.started"}`,
+		`{"type":"item.started","item":{"id":"item_2","type":"command_execution","command":"cat a","aggregated_output":"","exit_code":null,"status":"in_progress"}}`,
+		`{"type":"item.completed","item":{"id":"item_2","type":"command_execution","command":"cat a","aggregated_output":"hello","exit_code":0,"status":"completed"}}`,
+		`{"type":"item.completed","item":{"id":"item_3","type":"agent_message","text":"Done."}}`,
+		`{"type":"turn.completed","usage":{"input_tokens":35820,"cached_input_tokens":20114,"output_tokens":401,"reasoning_output_tokens":160}}`,
+		"",
+	}, "\n")
+
+	var tokens []reviewtypes.Tokens
+	for ev := range parseCodexOutput(strings.NewReader(input)) {
+		if tk, ok := ev.(reviewtypes.Tokens); ok {
+			tokens = append(tokens, tk)
+		}
+	}
+
+	if len(tokens) != 2 {
+		t.Fatalf("Tokens count = %d, want exactly 2 (one per turn.completed); got events: %+v",
+			len(tokens), tokens)
+	}
+	if tokens[0].In != 34317 || tokens[0].Out != 240 {
+		t.Errorf("tokens[0] = %+v, want {In:34317, Out:240}", tokens[0])
+	}
+	if tokens[1].In != 35820 || tokens[1].Out != 401 {
+		t.Errorf("tokens[1] = %+v, want {In:35820, Out:401}", tokens[1])
+	}
+}
+
 func collectCodexEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event {
 	var events []reviewtypes.Event
 	for ev := range ch {
diff --git a/cmd/entire/cli/agent/skilldiscovery/registry.go b/cmd/entire/cli/agent/skilldiscovery/registry.go
index bc5938309..0c4b097b6 100644
--- a/cmd/entire/cli/agent/skilldiscovery/registry.go
+++ b/cmd/entire/cli/agent/skilldiscovery/registry.go
@@ -31,7 +31,12 @@ var curatedBuiltins = map[string][]CuratedSkill{
 		{Name: "/security-review", Desc: "Scan git diff for security issues"},
 		{Name: "/simplify", Desc: "Review recent changes for code quality"},
 	},
-	"codex":  {{Name: "/review", Desc: "Review current changes and find issues"}},
+	// Codex has no binary-bundled review command usable from `codex exec`:
+	// built-in slash commands like `/review` only fire in the interactive TUI,
+	// not when piped through exec. Codex's review skills (code-reviewer,
+	// review-swarm, …) live on disk and are surfaced by DiscoverReviewSkills in
+	// $name form, so there are no curated built-ins to hardcode here.
+	"codex":  {},
 	"gemini": {},
 }
 
@@ -42,10 +47,14 @@ var curatedBuiltins = map[string][]CuratedSkill{
 // Install commands below are placeholders until marketplace URLs are pinned.
 // Tests do not assert on Message text — only on ProvidesAny semantics — so
 // prose revisions do not break the suite.
+//
+// Messages must stay backtick-free: the picker renders them through huh, which
+// treats the text as markdown and mangles backtick-wrapped code spans in the
+// terminal. Use plain text / colons to set off commands instead.
 var installHints = map[string][]InstallHint{
 	"claude-code": {
 		{
-			Message: "Install `pr-review-toolkit` via `claude plugin install entireio/pr-review-toolkit`",
+			Message: "Install pr-review-toolkit: claude plugin install entireio/pr-review-toolkit",
 			ProvidesAny: []string{
 				"/pr-review-toolkit:review-pr",
 				"/pr-review-toolkit:code-reviewer",
@@ -53,19 +62,19 @@ var installHints = map[string][]InstallHint{
 			},
 		},
 		{
-			Message:     "Install `test-auditor` via the superpowers plugin",
+			Message:     "Install test-auditor via the superpowers plugin",
 			ProvidesAny: []string{"/test-auditor"},
 		},
 	},
 	"codex": {
 		{
-			Message:     "Install `codex-review-pack` via `codex plugins add <url>`",
+			Message:     "Install codex-review-pack: codex plugins add <url>",
 			ProvidesAny: []string{"/codex:adversarial-review"},
 		},
 	},
 	"gemini": {
 		{
-			Message:     "Install `gemini-code-review` via `gemini extensions install <url>`",
+			Message:     "Install gemini-code-review: gemini extensions install <url>",
 			ProvidesAny: nil,
 		},
 	},
diff --git a/cmd/entire/cli/agent/skilldiscovery/registry_test.go b/cmd/entire/cli/agent/skilldiscovery/registry_test.go
index 22f45e533..b38bbc6d4 100644
--- a/cmd/entire/cli/agent/skilldiscovery/registry_test.go
+++ b/cmd/entire/cli/agent/skilldiscovery/registry_test.go
@@ -12,9 +12,11 @@ func TestCuratedBuiltinsFor_KnownAgents(t *testing.T) {
 	if len(claude) != 3 {
 		t.Fatalf("claude-code built-ins: got %d entries, want 3", len(claude))
 	}
+	// Codex has no binary-bundled review command usable from `codex exec`;
+	// its review skills are discovered on disk in $name form instead.
 	codex := skilldiscovery.CuratedBuiltinsFor("codex")
-	if len(codex) != 1 || codex[0].Name != "/review" {
-		t.Errorf("codex built-ins: got %+v, want 1x /review", codex)
+	if len(codex) != 0 {
+		t.Errorf("codex built-ins: got %+v, want 0 (discovery-driven)", codex)
 	}
 	gemini := skilldiscovery.CuratedBuiltinsFor("gemini")
 	if len(gemini) != 0 {
diff --git a/cmd/entire/cli/agent/skilldiscovery/scan.go b/cmd/entire/cli/agent/skilldiscovery/scan.go
new file mode 100644
index 000000000..fe2039f1b
--- /dev/null
+++ b/cmd/entire/cli/agent/skilldiscovery/scan.go
@@ -0,0 +1,257 @@
+package skilldiscovery
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"golang.org/x/mod/semver"
+
+	"github.com/entireio/cli/cmd/entire/cli/agent"
+	"github.com/entireio/cli/cmd/entire/cli/logging"
+)
+
+// InvocationForm builds an agent's invocation string for a discovered skill.
+// The only thing that differs between agents is the prefix and namespace
+// joiner: Claude Code uses slash form (`/name`, `/plugin:name`), codex uses
+// dollar form (`$name`, `$plugin:name`) — the literal token a user types to
+// invoke the skill in that CLI. Discovery emits Name already in this form so
+// downstream prompt composition stays agent-agnostic and joins verbatim.
+type InvocationForm func(name, pluginName string) string
+
+// SlashForm is Claude Code's invocation syntax: "/name" or "/plugin:name".
+func SlashForm(name, pluginName string) string {
+	if pluginName == "" {
+		return "/" + name
+	}
+	return "/" + pluginName + ":" + name
+}
+
+// DollarForm is codex's invocation syntax: "$name" or "$plugin:name". This is
+// the explicit "use this skill" token from codex's own injected skills
+// catalog ("name a skill with $SkillName or plain text").
+func DollarForm(name, pluginName string) string {
+	if pluginName == "" {
+		return "$" + name
+	}
+	return "$" + pluginName + ":" + name
+}
+
+// DedupeByInvocation collapses entries sharing an invocation name, keeping the
+// first occurrence. Plugins can ship a skill and a same-named wrapper that
+// forwards to it; scan order decides which wins.
+func DedupeByInvocation(in []agent.DiscoveredSkill) []agent.DiscoveredSkill {
+	if len(in) < 2 {
+		return in
+	}
+	seen := make(map[string]struct{}, len(in))
+	out := make([]agent.DiscoveredSkill, 0, len(in))
+	for _, s := range in {
+		if _, dup := seen[s.Name]; dup {
+			continue
+		}
+		seen[s.Name] = struct{}{}
+		out = append(out, s)
+	}
+	return out
+}
+
+// ScanPluginCache walks <root>/<marketplace>/<plugin>/<version>/ and invokes
+// scanVersion once per plugin, for the single version directory chosen by
+// PickLatestVersion. The callback receives the chosen version root and the
+// plugin name (used as the invocation namespace). Both Claude Code and codex
+// use this same market/plugin/version cache layout; they differ only in which
+// subdirectories under the version root they scan and their invocation form.
+func ScanPluginCache(ctx context.Context, root string, scanVersion func(versionRoot, pluginName string) []agent.DiscoveredSkill) []agent.DiscoveredSkill {
+	entries, err := os.ReadDir(root)
+	if err != nil {
+		logging.Debug(ctx, "skill discovery: plugin cache unreadable",
+			slog.String("root", root), slog.String("error", err.Error()))
+		return nil
+	}
+	var found []agent.DiscoveredSkill
+	for _, marketEntry := range entries {
+		if !marketEntry.IsDir() {
+			continue
+		}
+		marketRoot := filepath.Join(root, marketEntry.Name())
+		pluginEntries, err := os.ReadDir(marketRoot)
+		if err != nil {
+			continue
+		}
+		for _, pluginEntry := range pluginEntries {
+			if !pluginEntry.IsDir() {
+				continue
+			}
+			pluginName := pluginEntry.Name()
+			pluginRoot := filepath.Join(marketRoot, pluginName)
+			versionEntries, err := os.ReadDir(pluginRoot)
+			if err != nil {
+				continue
+			}
+			versionDir, ok := PickLatestVersion(versionEntries)
+			if !ok {
+				continue
+			}
+			found = append(found, scanVersion(filepath.Join(pluginRoot, versionDir), pluginName)...)
+		}
+	}
+	return found
+}
+
+// PickLatestVersion returns the "newest" version directory name among entries:
+//
+//   - If any entry parses as semver (with or without a leading "v"), pick the
+//     highest semver; non-semver entries are ignored when a semver exists.
+//   - Otherwise fall back to the lexicographic max of all directory names.
+//     This handles the "unknown" sentinel some plugins ship, and the opaque
+//     content-hash version dirs codex plugins use (e.g. "fef63ecf").
+//
+// Returns ("", false) if no usable directory entry exists.
+func PickLatestVersion(entries []os.DirEntry) (string, bool) {
+	var dirs []string
+	for _, e := range entries {
+		if e.IsDir() {
+			dirs = append(dirs, e.Name())
+		}
+	}
+	if len(dirs) == 0 {
+		return "", false
+	}
+	var semverDirs []string
+	for _, d := range dirs {
+		if semver.IsValid(semverWithV(d)) {
+			semverDirs = append(semverDirs, d)
+		}
+	}
+	if len(semverDirs) > 0 {
+		sort.Slice(semverDirs, func(i, j int) bool {
+			return semver.Compare(semverWithV(semverDirs[i]), semverWithV(semverDirs[j])) > 0
+		})
+		return semverDirs[0], true
+	}
+	sort.Sort(sort.Reverse(sort.StringSlice(dirs)))
+	return dirs[0], true
+}
+
+// semverWithV ensures a version string has the "v" prefix golang.org/x/mod/semver
+// requires. Plugin version dirs are usually bare (e.g. "0.1.0").
+func semverWithV(s string) string {
+	if strings.HasPrefix(s, "v") {
+		return s
+	}
+	return "v" + s
+}
+
+// ScanSkillsDir reads each <dir>/<name>/SKILL.md, parses its frontmatter, and
+// emits a DiscoveredSkill (in invoke's form) when Matches() returns true.
+// pluginName is the namespace ("" for un-namespaced user skills). Missing dirs
+// yield nil — discovery is best-effort.
+func ScanSkillsDir(ctx context.Context, dir, pluginName string, invoke InvocationForm) []agent.DiscoveredSkill {
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return nil
+	}
+	var found []agent.DiscoveredSkill
+	for _, skillEntry := range entries {
+		if !skillEntry.IsDir() {
+			continue
+		}
+		skillFile := filepath.Join(dir, skillEntry.Name(), "SKILL.md")
+		data, err := os.ReadFile(skillFile) //nolint:gosec // G304: skillFile is built from a ReadDir walk under HOME, not user input
+		if err != nil {
+			continue
+		}
+		name, description, parseErr := ParseSkillFrontmatter(data)
+		if parseErr != nil {
+			logging.Debug(ctx, "skill discovery: skipping malformed SKILL.md",
+				slog.String("path", skillFile), slog.String("error", parseErr.Error()))
+			continue
+		}
+		if name == "" {
+			name = skillEntry.Name()
+		}
+		invocation := invoke(name, pluginName)
+		if !Matches(invocation, description) {
+			continue
+		}
+		found = append(found, agent.DiscoveredSkill{
+			Name:        invocation,
+			Description: description,
+			SourcePath:  skillFile,
+		})
+	}
+	return found
+}
+
+// ScanFlatMarkdownDir reads *.md files directly under dir (no nesting), parses
+// their frontmatter for `description:`, and derives the invocation name from
+// the filename (minus .md). Used by Claude Code for plugin/user commands and
+// agents, whose frontmatter has no `name:` field. README.md is skipped.
+func ScanFlatMarkdownDir(ctx context.Context, dir, pluginName string, invoke InvocationForm) []agent.DiscoveredSkill {
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return nil
+	}
+	var found []agent.DiscoveredSkill
+	for _, entry := range entries {
+		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".md") {
+			continue
+		}
+		baseName := strings.TrimSuffix(entry.Name(), ".md")
+		if strings.EqualFold(baseName, "README") {
+			continue
+		}
+		filePath := filepath.Join(dir, entry.Name())
+		data, err := os.ReadFile(filePath) //nolint:gosec // G304: filePath is built from a ReadDir walk under HOME, not user input
+		if err != nil {
+			continue
+		}
+		_, description, parseErr := ParseSkillFrontmatter(data)
+		if parseErr != nil {
+			logging.Debug(ctx, "skill discovery: skipping malformed command/agent",
+				slog.String("path", filePath), slog.String("error", parseErr.Error()))
+			continue
+		}
+		invocation := invoke(baseName, pluginName)
+		if !Matches(invocation, description) {
+			continue
+		}
+		found = append(found, agent.DiscoveredSkill{
+			Name:        invocation,
+			Description: description,
+			SourcePath:  filePath,
+		})
+	}
+	return found
+}
+
+// ParseSkillFrontmatter extracts `name:` and `description:` from a minimal YAML
+// frontmatter block — the tiny subset these SKILL.md / command / agent files
+// use. Surrounding double-quotes are trimmed so `description: "foo"` returns
+// `foo`.
+func ParseSkillFrontmatter(data []byte) (name, description string, err error) {
+	s := string(data)
+	if !strings.HasPrefix(s, "---\n") && !strings.HasPrefix(s, "---\r\n") {
+		return "", "", errors.New("no frontmatter delimiter")
+	}
+	body := strings.TrimPrefix(strings.TrimPrefix(s, "---\r\n"), "---\n")
+	end := strings.Index(body, "\n---")
+	if end < 0 {
+		return "", "", errors.New("no closing frontmatter delimiter")
+	}
+	for _, line := range strings.Split(body[:end], "\n") {
+		line = strings.TrimSpace(line)
+		switch {
+		case strings.HasPrefix(line, "name:"):
+			name = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "name:")), `"`)
+		case strings.HasPrefix(line, "description:"):
+			description = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "description:")), `"`)
+		}
+	}
+	return name, description, nil
+}
diff --git a/cmd/entire/cli/review/types/reviewer.go b/cmd/entire/cli/review/types/reviewer.go
index 97f9a631b..c9f2ea505 100644
--- a/cmd/entire/cli/review/types/reviewer.go
+++ b/cmd/entire/cli/review/types/reviewer.go
@@ -104,6 +104,18 @@ type RunConfig struct {
 	// the commit that was reviewed.
 	StartingSHA string
 
+	// Model is an optional per-spawn model override the reviewer should
+	// pass to its underlying CLI (e.g. codex `-m <model>`). Empty means
+	// use the agent's default. Reviewers that don't support a model flag
+	// silently ignore this field.
+	Model string
+
+	// ReasoningEffort is an optional per-spawn reasoning-effort override
+	// (e.g. "low", "medium", "high", "xhigh" for codex). Empty means use
+	// the agent's default. Reviewers that don't expose reasoning effort
+	// silently ignore this field.
+	ReasoningEffort string
+
 	// EnrichSummary optionally updates the completed run summary before sinks
 	// receive RunFinished. It is used for post-process data such as token
 	// totals that are only available after agent lifecycle hooks flush state.
diff --git a/cmd/entire/cli/settings/settings.go b/cmd/entire/cli/settings/settings.go
index 21f51265b..8dfe20ce1 100644
--- a/cmd/entire/cli/settings/settings.go
+++ b/cmd/entire/cli/settings/settings.go
@@ -332,11 +332,22 @@ type ReviewConfig struct {
 	// Skills is non-empty it is appended after the selected skills; when
 	// Skills is empty it is the full prompt for prompt-only review configs.
 	Prompt string `json:"prompt,omitempty"`
+
+	// Model is an optional per-spawn model override passed to the agent's
+	// CLI (e.g. codex `-m <model>`). Empty falls back to the agent's own
+	// configured default. Agents that don't support a model flag ignore it.
+	Model string `json:"model,omitempty"`
+
+	// ReasoningEffort is an optional per-spawn reasoning-effort override
+	// (e.g. "low", "medium", "high", "xhigh" for codex `-c
+	// model_reasoning_effort=`). Empty falls back to the agent's default.
+	// Agents without a reasoning-effort knob ignore it.
+	ReasoningEffort string `json:"reasoning_effort,omitempty"`
 }
 
 // IsZero reports whether the config is effectively unset.
 func (c ReviewConfig) IsZero() bool {
-	return len(c.Skills) == 0 && c.Prompt == ""
+	return len(c.Skills) == 0 && c.Prompt == "" && c.Model == "" && c.ReasoningEffort == ""
 }
 
 // ReviewConfigFor returns the configured review config for the given agent.

From 242e9bb4e324ac036c417f5ba7a37e0749352aa9 Mon Sep 17 00:00:00 2001
From: Peyton Montei <peyton@entire.io>
Date: Thu, 4 Jun 2026 13:45:10 -0400
Subject: [PATCH 2/4] fix(review): address #1370 bugbot findings

- reviewer_test: TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted feeds a
  thread.started envelope, which launches the rollout token tailer. Pin
  ENTIRE_TEST_CODEX_SESSION_DIR to an empty temp dir (drop t.Parallel) so the
  tailer can't pick up a real ~/.codex rollout-*-tid-1.jsonl and inject extra
  Tokens that break the exactly-2 assertion.
- skilldiscovery: codex install-hint suppression fingerprint was slash form
  (/codex:adversarial-review) but DiscoverReviewSkills emits dollar form, so
  ActiveInstallHintsFor never intersected and the hint showed forever after
  install. Use $codex:adversarial-review.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 210b33059b2b
---
 cmd/entire/cli/agent/codex/reviewer_test.go     | 6 +++++-
 cmd/entire/cli/agent/skilldiscovery/registry.go | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go
index 40399f559..0936a09e5 100644
--- a/cmd/entire/cli/agent/codex/reviewer_test.go
+++ b/cmd/entire/cli/agent/codex/reviewer_test.go
@@ -426,7 +426,11 @@ func TestParseCodexOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) {
 // Captured by running real codex-cli 0.130.0 — no item.* envelope ever
 // carried a usage field, so emission stays anchored to turn.completed.
 func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) {
-	t.Parallel()
+	// Not t.Parallel: the thread.started envelope launches the rollout token
+	// tailer, which resolves the codex session dir. Pin it to an empty temp
+	// dir so the tailer never finds a real ~/.codex rollout-*-tid-1.jsonl and
+	// injects extra Tokens events that would break the exactly-2 assertion.
+	t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", t.TempDir())
 	// Real codex --json output (lightly trimmed) simulating a multi-turn
 	// review: model call → tool exec → model call → tool exec, with a
 	// turn.completed envelope at every turn boundary carrying running
diff --git a/cmd/entire/cli/agent/skilldiscovery/registry.go b/cmd/entire/cli/agent/skilldiscovery/registry.go
index 0c4b097b6..e42ea3067 100644
--- a/cmd/entire/cli/agent/skilldiscovery/registry.go
+++ b/cmd/entire/cli/agent/skilldiscovery/registry.go
@@ -68,8 +68,11 @@ var installHints = map[string][]InstallHint{
 	},
 	"codex": {
 		{
-			Message:     "Install codex-review-pack: codex plugins add <url>",
-			ProvidesAny: []string{"/codex:adversarial-review"},
+			Message: "Install codex-review-pack: codex plugins add <url>",
+			// Dollar form: codex DiscoverReviewSkills emits $plugin:name, so the
+			// suppression fingerprint must match that, not the slash form — else
+			// ActiveInstallHintsFor never intersects and the hint shows forever.
+			ProvidesAny: []string{"$codex:adversarial-review"},
 		},
 	},
 	"gemini": {

From 69cc48615520d06fa7cd2c36b038b514f346d253 Mon Sep 17 00:00:00 2001
From: Peyton Montei <peyton@entire.io>
Date: Thu, 4 Jun 2026 14:29:18 -0400
Subject: [PATCH 3/4] fix(review): address #1370 Copilot findings (codex)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Event ordering (real bug): the rollout token tailer runs concurrently and
  sends Tokens on the same channel as the parser. It was only stopped in a
  defer that runs AFTER the inline Finished emissions, so a late Tokens could
  follow Finished — violating ReviewerTemplate's "Finished is the last event"
  contract. Introduce an idempotent stopTailer (sync.Once + WaitGroup.Wait)
  and call it before every terminal Finished; the defer remains as a backstop.
  Adds a regression test (Finished is last even with a live rollout tailer),
  verified under -race.
- Test fidelity: TestCodexReviewer_PassesSkillThroughVerbatim configured the
  codex skill as "/review" (claude's slash form). Codex skills are
  dollar-prefixed; use "$code-reviewer" so the test models real codex usage.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 480a94f05ac8
---
 cmd/entire/cli/agent/codex/reviewer.go      | 24 ++++++++---
 cmd/entire/cli/agent/codex/reviewer_test.go | 45 +++++++++++++++++++--
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/cmd/entire/cli/agent/codex/reviewer.go b/cmd/entire/cli/agent/codex/reviewer.go
index f05937f9c..33efa11c1 100644
--- a/cmd/entire/cli/agent/codex/reviewer.go
+++ b/cmd/entire/cli/agent/codex/reviewer.go
@@ -122,15 +122,22 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 	go func() {
 		defer close(out)
 		// The rollout token tailer (started on thread.started) runs concurrently
-		// and also sends on out. Stop it and wait for it to exit BEFORE close(out)
-		// so its send can never hit a closed channel. defer is LIFO, so this runs
-		// before the close(out) registered above.
+		// and also sends on out. stopTailer halts it and waits for it to exit.
+		// It MUST run before any Finished emission: ReviewerTemplate requires
+		// Finished to be the last event before the channel closes, so a late
+		// Tokens from the tailer would violate that contract (and a send after
+		// close(out) would panic). Idempotent (sync.Once) so it's safe to call
+		// explicitly before each terminal Finished AND from the defer backstop;
+		// the defer (LIFO, registered after close(out)) still guards early or
+		// unexpected returns.
 		stop := make(chan struct{})
 		var tailWG sync.WaitGroup
-		defer func() {
-			close(stop)
+		var stopOnce sync.Once
+		stopTailer := func() {
+			stopOnce.Do(func() { close(stop) })
 			tailWG.Wait()
-		}()
+		}
+		defer stopTailer()
 		out <- reviewtypes.Started{}
 		scanner := bufio.NewScanner(r)
 		scanner.Buffer(make([]byte, min(1024*1024, maxBuf)), maxBuf)
@@ -202,6 +209,7 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 			}
 		}
 		if err := scanner.Err(); err != nil {
+			stopTailer()
 			out <- reviewtypes.RunError{Err: fmt.Errorf("read stdout: %w", err)}
 			out <- reviewtypes.Finished{Success: false}
 			return
@@ -224,6 +232,9 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 					Out: turnUsage.OutputTokens,
 				}
 			}
+			// Stop the tailer (and wait) before Finished so no late Tokens can
+			// land after the terminal event.
+			stopTailer()
 			// Success is hard-coded true here because codex's `turn.completed`
 			// envelope has no turn-level error field in 0.130.0. If a future
 			// codex version adds one (e.g., an `error` or `is_error` field on
@@ -233,6 +244,7 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event {
 			out <- reviewtypes.Finished{Success: true}
 			return
 		}
+		stopTailer()
 		out <- reviewtypes.Finished{Success: false}
 	}()
 	return out
diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go
index 0936a09e5..a82b846f2 100644
--- a/cmd/entire/cli/agent/codex/reviewer_test.go
+++ b/cmd/entire/cli/agent/codex/reviewer_test.go
@@ -6,6 +6,7 @@ import (
 	"io"
 	"os"
 	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
 	"time"
@@ -102,8 +103,11 @@ func TestCodexReviewer_ArgvShape(t *testing.T) {
 
 func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) {
 	t.Parallel()
+	// Codex skills are dollar-prefixed ($name / $plugin:name) — the literal
+	// token a user types in the codex CLI. Use that form so the test models
+	// real codex usage, not claude's slash form.
 	cfg := reviewtypes.RunConfig{
-		Skills:            []string{"/review"},
+		Skills:            []string{"$code-reviewer"},
 		AlwaysPrompt:      "Focus on auth regressions.",
 		ScopeBaseRef:      "main",
 		CheckpointContext: "Commits in scope (newest first):\n  abc123 summary\n",
@@ -123,11 +127,11 @@ func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) {
 	prompt := readCodexCmdStdin(t, cmd)
 	// The configured skill must reach codex verbatim so codex's skill system
 	// loads the real SKILL.md — not a generic paraphrase.
-	if !strings.Contains(prompt, "/review") {
-		t.Fatalf("codex prompt should pass /review through verbatim:\n%s", prompt)
+	if !strings.Contains(prompt, "$code-reviewer") {
+		t.Fatalf("codex prompt should pass $code-reviewer through verbatim:\n%s", prompt)
 	}
 	if strings.Contains(prompt, "Review the current branch changes and report actionable findings.") {
-		t.Fatalf("codex prompt must not paraphrase /review:\n%s", prompt)
+		t.Fatalf("codex prompt must not paraphrase the configured skill:\n%s", prompt)
 	}
 	for _, wantText := range []string{
 		"Focus on auth regressions.",
@@ -469,6 +473,39 @@ func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) {
 	}
 }
 
+// Regression: the rollout token tailer runs concurrently and sends on the same
+// channel. The parser must stop+join it BEFORE emitting the terminal Finished,
+// so a late Tokens can never follow Finished (ReviewerTemplate requires Finished
+// to be the last event). A live rollout file gives the tailer real work to emit.
+func TestParseCodexOutput_FinishedIsLastEvenWithLiveTailer(t *testing.T) {
+	// Cannot t.Parallel — uses t.Setenv.
+	dir := t.TempDir()
+	t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", dir)
+	rollout := filepath.Join(dir, "rollout-2026-06-03T08-57-39-"+tailTestThreadID+".jsonl")
+	content := tokenLine(100, 5) + tokenLine(200, 10) + tokenLine(300, 15)
+	if err := os.WriteFile(rollout, []byte(content), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	input := strings.Join([]string{
+		`{"type":"thread.started","thread_id":"` + tailTestThreadID + `"}`,
+		`{"type":"turn.completed","usage":{"input_tokens":300,"output_tokens":15}}`,
+		"",
+	}, "\n")
+
+	var events []reviewtypes.Event
+	for ev := range parseCodexOutput(strings.NewReader(input)) {
+		events = append(events, ev)
+	}
+	if len(events) == 0 {
+		t.Fatal("no events emitted")
+	}
+	if _, ok := events[len(events)-1].(reviewtypes.Finished); !ok {
+		t.Fatalf("last event = %T, want Finished (no Tokens may follow the terminal event); events=%+v",
+			events[len(events)-1], events)
+	}
+}
+
 func collectCodexEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event {
 	var events []reviewtypes.Event
 	for ev := range ch {

From 5974ba99185050eb04ef6f0aa7556121c342333f Mon Sep 17 00:00:00 2001
From: Peyton Montei <peyton@entire.io>
Date: Thu, 4 Jun 2026 17:48:07 -0400
Subject: [PATCH 4/4] fix(review): capture Claude reviewer model via transcript
 backfill (#1373)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Claude review subprocesses run via `claude -p` don't reliably report a model
through lifecycle hooks (the ModelUpdate path is flaky in headless mode), so
per-session checkpoint metadata landed with "model": "" for Claude reviewer
sessions, while codex/pi reviewers record theirs. Root cause: ClaudeCodeAgent
did not implement agent.ModelExtractor, so condense's sessionStateBackfillModel
fallback (AsModelExtractor -> ExtractModel) was a no-op for Claude; the model
relied solely on the flaky hook path.

Implement ExtractModel on ClaudeCodeAgent: parse the transcript and return the
last non-empty model from an assistant line (mirrors pi/copilotcli/factoryai).
Condense already calls this when state.ModelName == "", so review (and any
hook-missed) Claude sessions now record a non-empty model deterministically.

Verified against real recorded data: a real agent_review row in
entire/checkpoints/v1 with model="" (skills=[/review]) — running the new
ExtractModel on that session's stored transcript recovers "claude-opus-4-7",
which is exactly the value the condense backfill writes. Surfaces in
entire.io's per-reviewer attribution (entirehq/entire.io#2271).

Note: the transcript carries the base model id ("claude-opus-4-7"), not the
hook's context-window variant ("claude-opus-4-7[1m]"), so backfilled models
omit the [1m] suffix — a correct, non-empty id, slightly less precise.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Entire-Checkpoint: 41d9aa9acef6
---
 cmd/entire/cli/agent/claudecode/transcript.go | 29 +++++++++++++
 .../cli/agent/claudecode/transcript_test.go   | 42 +++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/cmd/entire/cli/agent/claudecode/transcript.go b/cmd/entire/cli/agent/claudecode/transcript.go
index d151c814f..9d8c241af 100644
--- a/cmd/entire/cli/agent/claudecode/transcript.go
+++ b/cmd/entire/cli/agent/claudecode/transcript.go
@@ -88,6 +88,35 @@ func ExtractModifiedFiles(lines []TranscriptLine) []string {
 	return files
 }
 
+// ExtractModel returns the LLM model recorded in the transcript, satisfying
+// agent.ModelExtractor. Claude review subprocesses spawned via `claude -p` do
+// not reliably report the model through lifecycle hooks (the ModelUpdate path
+// is flaky in headless mode), so condense backfills it from the transcript
+// here — the same mechanism codex/pi/etc. rely on. Returns the last non-empty
+// model seen on an assistant line, or "" when the transcript carries none.
+func (c *ClaudeCodeAgent) ExtractModel(transcriptData []byte) (string, error) {
+	lines, err := transcript.ParseFromBytes(transcriptData)
+	if err != nil {
+		return "", fmt.Errorf("parse transcript: %w", err)
+	}
+	model := ""
+	for _, line := range lines {
+		if line.Type != envelopeTypeAssistant {
+			continue
+		}
+		var msg struct {
+			Model string `json:"model"`
+		}
+		if err := json.Unmarshal(line.Message, &msg); err != nil {
+			continue
+		}
+		if msg.Model != "" {
+			model = msg.Model
+		}
+	}
+	return model, nil
+}
+
 // TruncateAtUUID returns transcript lines up to and including the line with given UUID
 func TruncateAtUUID(lines []TranscriptLine, uuid string) []TranscriptLine {
 	if uuid == "" {
diff --git a/cmd/entire/cli/agent/claudecode/transcript_test.go b/cmd/entire/cli/agent/claudecode/transcript_test.go
index e16929a2e..d96c77cba 100644
--- a/cmd/entire/cli/agent/claudecode/transcript_test.go
+++ b/cmd/entire/cli/agent/claudecode/transcript_test.go
@@ -869,3 +869,45 @@ func TestExtractAllModifiedFiles_SubagentOnlyChanges(t *testing.T) {
 		t.Errorf("missing expected file %q", f)
 	}
 }
+
+func TestExtractModel(t *testing.T) {
+	t.Parallel()
+	data := []byte(`{"type":"assistant","uuid":"a1","message":{"model":"claude-opus-4-7[1m]","content":[{"type":"text","text":"hi"}]}}` + "\n")
+	model, err := (&ClaudeCodeAgent{}).ExtractModel(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if model != "claude-opus-4-7[1m]" {
+		t.Errorf("model = %q, want claude-opus-4-7[1m]", model)
+	}
+}
+
+func TestExtractModel_MostRecentWins(t *testing.T) {
+	t.Parallel()
+	data := []byte(strings.Join([]string{
+		`{"type":"assistant","uuid":"a1","message":{"model":"claude-sonnet-4-6","content":[]}}`,
+		`{"type":"assistant","uuid":"a2","message":{"model":"claude-opus-4-8","content":[]}}`,
+		"",
+	}, "\n"))
+	model, err := (&ClaudeCodeAgent{}).ExtractModel(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if model != "claude-opus-4-8" {
+		t.Errorf("model = %q, want claude-opus-4-8 (most recent)", model)
+	}
+}
+
+func TestExtractModel_EmptyWhenAbsent(t *testing.T) {
+	t.Parallel()
+	// A review subprocess transcript with no model field on any assistant line
+	// yields "" (caller treats that as "no model available").
+	data := []byte(`{"type":"assistant","uuid":"a1","message":{"content":[{"type":"text","text":"hi"}]}}` + "\n")
+	model, err := (&ClaudeCodeAgent{}).ExtractModel(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if model != "" {
+		t.Errorf("model = %q, want empty", model)
+	}
+}