From 349236775bc75668e7c2d1622e786439d0c8690f Mon Sep 17 00:00:00 2001 From: Peyton Montei Date: Thu, 4 Jun 2026 13:24:26 -0400 Subject: [PATCH 1/4] feat(review): codex review correctness (skills, discovery, live tokens) Agent-layer correctness for `entire review`, independent of the command surface (no cmd/setup/fix churn): - Codex invokes its real review skill verbatim (no /review paraphrase); codex's skill system loads the matching SKILL.md. Adds per-spawn review..model / reasoning_effort, threaded through RunConfig into codex CLI flags -m / -c model_reasoning_effort=. - Codex skill discovery in $name form over ~/.codex/{skills,plugins,superpowers}; lifts the generic SKILL.md scanners + version dedupe + frontmatter parse into the shared skilldiscovery package (SlashForm/DollarForm); claude delegates to it. Codex curated built-ins dropped (discovered on disk, not bundled). - Honest live tokens: claude emits input-only Tokens during streaming (output snapshot is misleading) with final totals on result; codex tails its rollout transcript (located by thread_id) and emits cumulative Tokens per token_count, since `codex exec --json` only carries usage at turn.completed. Co-Authored-By: Claude Opus 4.8 (1M context) Entire-Checkpoint: 4cbe537bd34d --- cmd/entire/cli/agent/claudecode/discovery.go | 284 ++---------------- cmd/entire/cli/agent/claudecode/reviewer.go | 34 ++- .../cli/agent/claudecode/reviewer_test.go | 77 ++++- .../testdata/stream_with_deltas.jsonl | 7 + cmd/entire/cli/agent/codex/AGENT.md | 8 + cmd/entire/cli/agent/codex/discovery.go | 49 ++- cmd/entire/cli/agent/codex/discovery_test.go | 104 ++++++- cmd/entire/cli/agent/codex/review_tokens.go | 134 +++++++++ .../cli/agent/codex/review_tokens_test.go | 128 ++++++++ cmd/entire/cli/agent/codex/reviewer.go | 135 ++++++--- cmd/entire/cli/agent/codex/reviewer_test.go | 87 +++++- .../cli/agent/skilldiscovery/registry.go | 19 +- .../cli/agent/skilldiscovery/registry_test.go | 6 +- cmd/entire/cli/agent/skilldiscovery/scan.go | 257 ++++++++++++++++ cmd/entire/cli/review/types/reviewer.go | 12 + cmd/entire/cli/settings/settings.go | 13 +- 16 files changed, 1028 insertions(+), 326 deletions(-) create mode 100644 cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl create mode 100644 cmd/entire/cli/agent/codex/review_tokens.go create mode 100644 cmd/entire/cli/agent/codex/review_tokens_test.go create mode 100644 cmd/entire/cli/agent/skilldiscovery/scan.go diff --git a/cmd/entire/cli/agent/claudecode/discovery.go b/cmd/entire/cli/agent/claudecode/discovery.go index 9566901d8..e2d2f8619 100644 --- a/cmd/entire/cli/agent/claudecode/discovery.go +++ b/cmd/entire/cli/agent/claudecode/discovery.go @@ -2,14 +2,9 @@ package claudecode import ( "context" - "errors" "log/slog" "os" "path/filepath" - "sort" - "strings" - - "golang.org/x/mod/semver" "github.com/entireio/cli/cmd/entire/cli/agent" "github.com/entireio/cli/cmd/entire/cli/agent/skilldiscovery" @@ -21,16 +16,18 @@ import ( // (nil, nil) when HOME is unreadable or directories are missing — discovery // is best-effort. // -// Claude Code exposes three kinds of invocable content per plugin: -// - skills: /skills//SKILL.md (YAML frontmatter with name + description) -// - commands: /commands/.md (YAML frontmatter with description; name = filename) -// - agents: /agents/.md (YAML frontmatter with description; name = filename) +// Claude Code exposes three kinds of invocable content per plugin, all invoked +// via the same slash-prefix syntax (`/name`, `/plugin:name`): +// - skills: /skills//SKILL.md (frontmatter: name + description) +// - commands: /commands/.md (frontmatter: description; name = filename) +// - agents: /agents/.md (frontmatter: description; name = filename) +// +// All three are walked because any can be a review tool — the pr-review-toolkit +// plugin, for example, ships its review skills as commands/agents (not skills/). // -// All three are walked because users invoke them via the same slash-prefix -// syntax (`/plugin:name`) and any of them can be a review tool. The -// pr-review-toolkit plugin, for example, ships its review skills as -// commands/agents (not skills/), and was silently missed by a skills-only -// walker. +// The generic SKILL.md / markdown scanning, version dedupe, and frontmatter +// parsing live in the shared skilldiscovery package; this method supplies the +// Claude-specific roots and slash invocation form. // //nolint:unparam // error return is part of SkillDiscoverer contract; future implementations may report hard failures func (c *ClaudeCodeAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.DiscoveredSkill, error) { @@ -40,255 +37,22 @@ func (c *ClaudeCodeAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.Dis return nil, nil } + form := skilldiscovery.SlashForm var found []agent.DiscoveredSkill - found = append(found, scanPluginCache(ctx, filepath.Join(home, ".claude", "plugins", "cache"))...) - found = append(found, scanUserSkills(ctx, filepath.Join(home, ".claude", "skills"))...) - found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "commands"), "")...) - found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "agents"), "")...) - found = dedupeByInvocation(found) + found = append(found, skilldiscovery.ScanPluginCache(ctx, filepath.Join(home, ".claude", "plugins", "cache"), + func(versionRoot, pluginName string) []agent.DiscoveredSkill { + var out []agent.DiscoveredSkill + out = append(out, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName, form)...) + out = append(out, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "commands"), pluginName, form)...) + out = append(out, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "agents"), pluginName, form)...) + return out + })...) + found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(home, ".claude", "skills"), "", form)...) + found = append(found, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "commands"), "", form)...) + found = append(found, skilldiscovery.ScanFlatMarkdownDir(ctx, filepath.Join(home, ".claude", "agents"), "", form)...) + found = skilldiscovery.DedupeByInvocation(found) if len(found) == 0 { return nil, nil } return found, nil } - -// dedupeByInvocation collapses entries sharing an invocation name. Plugins -// can ship a skill and a same-named command wrapper that forwards to it; -// scan order keeps the skill over its wrapper. -func dedupeByInvocation(in []agent.DiscoveredSkill) []agent.DiscoveredSkill { - if len(in) < 2 { - return in - } - seen := make(map[string]struct{}, len(in)) - out := make([]agent.DiscoveredSkill, 0, len(in)) - for _, s := range in { - if _, dup := seen[s.Name]; dup { - continue - } - seen[s.Name] = struct{}{} - out = append(out, s) - } - return out -} - -// scanPluginCache walks ////{skills,commands,agents}/ -// One plugin can contribute through any or all three directories. -// -// Multiple version directories per plugin are common after upgrades. Walking -// every version produces duplicate skills (same invocation name, same -// description) — confusing in the picker and wasteful in the prompt. We pick -// a single version per plugin via pickLatestVersion: prefer valid semver -// (highest), fall back to lexicographic max. -func scanPluginCache(ctx context.Context, root string) []agent.DiscoveredSkill { - entries, err := os.ReadDir(root) - if err != nil { - logging.Debug(ctx, "claude-code discovery: plugin cache unreadable", - slog.String("root", root), slog.String("error", err.Error())) - return nil - } - var found []agent.DiscoveredSkill - for _, marketEntry := range entries { - if !marketEntry.IsDir() { - continue - } - marketRoot := filepath.Join(root, marketEntry.Name()) - pluginEntries, err := os.ReadDir(marketRoot) - if err != nil { - continue - } - for _, pluginEntry := range pluginEntries { - if !pluginEntry.IsDir() { - continue - } - pluginName := pluginEntry.Name() - pluginRoot := filepath.Join(marketRoot, pluginName) - versionEntries, err := os.ReadDir(pluginRoot) - if err != nil { - continue - } - versionDir, ok := pickLatestVersion(versionEntries) - if !ok { - continue - } - versionRoot := filepath.Join(pluginRoot, versionDir) - found = append(found, readSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName)...) - found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "commands"), pluginName)...) - found = append(found, scanFlatMarkdownDir(ctx, filepath.Join(versionRoot, "agents"), pluginName)...) - } - } - return found -} - -// pickLatestVersion returns the name of the "newest" version directory among -// entries. Strategy: -// -// - If any entry name parses as semver (with or without a leading "v"), pick -// the highest semver among those that parse. Non-semver entries are -// ignored when at least one semver entry exists. -// - Otherwise, fall back to the lexicographic max of all directory names. -// This handles the "unknown" sentinel some plugins ship and one-off names. -// -// Returns ("", false) if no usable directory entry exists. -func pickLatestVersion(entries []os.DirEntry) (string, bool) { - var dirs []string - for _, e := range entries { - if e.IsDir() { - dirs = append(dirs, e.Name()) - } - } - if len(dirs) == 0 { - return "", false - } - var semverDirs []string - for _, d := range dirs { - if semver.IsValid(semverWithV(d)) { - semverDirs = append(semverDirs, d) - } - } - if len(semverDirs) > 0 { - sort.Slice(semverDirs, func(i, j int) bool { - return semver.Compare(semverWithV(semverDirs[i]), semverWithV(semverDirs[j])) > 0 - }) - return semverDirs[0], true - } - sort.Sort(sort.Reverse(sort.StringSlice(dirs))) - return dirs[0], true -} - -// semverWithV ensures a version string has the "v" prefix that -// golang.org/x/mod/semver requires. Plugin version dirs are usually bare -// (e.g. "0.1.0"), but we tolerate either form. -func semverWithV(s string) string { - if strings.HasPrefix(s, "v") { - return s - } - return "v" + s -} - -// scanUserSkills walks ~/.claude/skills//SKILL.md. -func scanUserSkills(ctx context.Context, root string) []agent.DiscoveredSkill { - return readSkillsDir(ctx, root, "" /* no plugin prefix */) -} - -// readSkillsDir reads each skill subdirectory's SKILL.md, parses frontmatter, -// and emits a DiscoveredSkill if Matches() returns true. -func readSkillsDir(ctx context.Context, dir, pluginName string) []agent.DiscoveredSkill { - entries, err := os.ReadDir(dir) - if err != nil { - return nil - } - var found []agent.DiscoveredSkill - for _, skillEntry := range entries { - if !skillEntry.IsDir() { - continue - } - skillDir := filepath.Join(dir, skillEntry.Name()) - skillFile := filepath.Join(skillDir, "SKILL.md") - data, err := os.ReadFile(skillFile) //nolint:gosec // G304: skillFile is constructed from a ReadDir walk under HOME, not user input - if err != nil { - continue - } - name, description, parseErr := parseSkillFrontmatter(data) - if parseErr != nil { - logging.Debug(ctx, "claude-code discovery: skipping malformed SKILL.md", - slog.String("path", skillFile), slog.String("error", parseErr.Error())) - continue - } - if name == "" { - name = skillEntry.Name() - } - invocation := invocationName(name, pluginName) - if !skilldiscovery.Matches(invocation, description) { - continue - } - found = append(found, agent.DiscoveredSkill{ - Name: invocation, - Description: description, - SourcePath: skillFile, - }) - } - return found -} - -// scanFlatMarkdownDir reads *.md files directly under dir (no nesting), parses -// their YAML frontmatter for `description:`, and derives the invocation name -// from the filename (stripping the .md suffix). Used for both plugin -// commands/agents and user-level ~/.claude/commands and ~/.claude/agents. -// -// Frontmatter shape differs from SKILL.md — no `name:` field, so the -// filename is the source of truth for the invocation name. -func scanFlatMarkdownDir(ctx context.Context, dir, pluginName string) []agent.DiscoveredSkill { - entries, err := os.ReadDir(dir) - if err != nil { - return nil - } - var found []agent.DiscoveredSkill - for _, entry := range entries { - if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".md") { - continue - } - baseName := strings.TrimSuffix(entry.Name(), ".md") - if strings.EqualFold(baseName, "README") { - continue - } - filePath := filepath.Join(dir, entry.Name()) - data, err := os.ReadFile(filePath) //nolint:gosec // G304: filePath is constructed from a ReadDir walk under HOME, not user input - if err != nil { - continue - } - _, description, parseErr := parseSkillFrontmatter(data) - if parseErr != nil { - logging.Debug(ctx, "claude-code discovery: skipping malformed command/agent", - slog.String("path", filePath), slog.String("error", parseErr.Error())) - continue - } - invocation := invocationName(baseName, pluginName) - if !skilldiscovery.Matches(invocation, description) { - continue - } - found = append(found, agent.DiscoveredSkill{ - Name: invocation, - Description: description, - SourcePath: filePath, - }) - } - return found -} - -// invocationName builds the slash-prefixed invocation form. Plugin-prefixed -// names use "/plugin:name"; bare names use "/name". -func invocationName(name, pluginName string) string { - if pluginName == "" { - return "/" + name - } - return "/" + pluginName + ":" + name -} - -// parseSkillFrontmatter extracts `name:` and `description:` from a minimal -// YAML frontmatter block. Purpose-built for the tiny subset of YAML these -// SKILL.md / command / agent files actually use. -// -// Trims surrounding double-quotes from values so `description: "foo bar"` -// is returned as `foo bar` — the command/agent frontmatter quotes values; -// SKILL.md files usually don't. -func parseSkillFrontmatter(data []byte) (name, description string, err error) { - s := string(data) - if !strings.HasPrefix(s, "---\n") && !strings.HasPrefix(s, "---\r\n") { - return "", "", errors.New("no frontmatter delimiter") - } - body := strings.TrimPrefix(strings.TrimPrefix(s, "---\r\n"), "---\n") - end := strings.Index(body, "\n---") - if end < 0 { - return "", "", errors.New("no closing frontmatter delimiter") - } - for _, line := range strings.Split(body[:end], "\n") { - line = strings.TrimSpace(line) - switch { - case strings.HasPrefix(line, "name:"): - name = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "name:")), `"`) - case strings.HasPrefix(line, "description:"): - description = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "description:")), `"`) - } - } - return name, description, nil -} diff --git a/cmd/entire/cli/agent/claudecode/reviewer.go b/cmd/entire/cli/agent/claudecode/reviewer.go index a30e01049..816625f3d 100644 --- a/cmd/entire/cli/agent/claudecode/reviewer.go +++ b/cmd/entire/cli/agent/claudecode/reviewer.go @@ -53,9 +53,17 @@ func buildReviewCmd(ctx context.Context, cfg reviewtypes.RunConfig) *exec.Cmd { // Emits Started first, Finished{Success:...} last (success follows result.is_error). // On a scanner error (torn stream), emits RunError then Finished{Success:false}. // -// Tokens are emitted only at the terminal `result` envelope, not -// incrementally — claude's per-assistant `usage` fields aren't cumulative -// and summing them across messages would double-count. +// Live-token semantics: Claude's assistant envelopes carry a usage snapshot +// taken at the START of each turn — input_tokens/cache_* are populated but +// output_tokens is essentially zero (a 1–8 token "initial decision" count +// that does not update as text streams). The true per-turn output is only +// surfaced on `result` (aggregate across all turns in the run) or on the +// late `message_delta` event of --include-partial-messages mode. +// +// We emit `Tokens{In: , Out: 0}` on each assistant envelope so the TUI +// can show context size growing across multi-turn runs, and `Tokens{In, Out}` +// on `result` with the final aggregate. Sending the misleading early +// "Out: 1" snapshot was removed after capturing real claude stream-json. // // Package-private; called directly from this package's tests so they can // drive raw stdout fixtures through the parser without going through the @@ -110,6 +118,20 @@ func parseClaudeOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { out <- reviewtypes.ToolCall{Name: block.Name, Args: string(block.Input)} } } + // The usage snapshot here is captured at the start of the + // turn, so output_tokens is essentially zero (1–8 tokens of + // initial decision, not the true output count). Surface input + // only — TUI consumers render input-only Tokens differently + // from finalized {In, Out} pairs. The final tally comes from + // the `result` envelope below. + if env.Message.Usage.InputTokens > 0 || + env.Message.Usage.CacheReadInputTokens > 0 || + env.Message.Usage.CacheCreationInputTokens > 0 { + in := env.Message.Usage.InputTokens + + env.Message.Usage.CacheReadInputTokens + + env.Message.Usage.CacheCreationInputTokens + out <- reviewtypes.Tokens{In: in, Out: 0} + } case "result": sawResult = true resultErr = env.IsError @@ -145,6 +167,12 @@ type claudeEnvelope struct { type claudeMessage struct { Content []claudeBlock `json:"content"` + // Usage on assistant envelopes is the per-turn-START snapshot — input + // counts are populated but output_tokens reflects only the model's + // initial decision, not the streamed text. Final aggregate usage + // arrives on the `result` envelope. Reuses messageUsage (declared in + // types.go) to stay aligned with the transcript-parser usage shape. + Usage messageUsage `json:"usage"` } type claudeBlock struct { diff --git a/cmd/entire/cli/agent/claudecode/reviewer_test.go b/cmd/entire/cli/agent/claudecode/reviewer_test.go index 12b1b8712..e456efb76 100644 --- a/cmd/entire/cli/agent/claudecode/reviewer_test.go +++ b/cmd/entire/cli/agent/claudecode/reviewer_test.go @@ -239,6 +239,11 @@ func TestParseClaudeOutput_DecodesStreamJSON(t *testing.T) { t.Error("expected AssistantText carrying fixture prose 'Cats are…'") } + // The parser emits Tokens on every assistant envelope that carries + // non-zero usage plus a terminal Tokens on the result envelope. The + // fixture's two assistant envelopes both carry usage, so expect >=2 + // here (the exact count is fixture-defined and not asserted to keep + // the fixture editable). var tokensSeen int var tokensOut int for _, ev := range events { @@ -247,11 +252,11 @@ func TestParseClaudeOutput_DecodesStreamJSON(t *testing.T) { tokensOut = tk.Out } } - if tokensSeen != 1 { - t.Errorf("Tokens count = %d, want 1", tokensSeen) + if tokensSeen < 2 { + t.Errorf("Tokens count = %d, want >=2 (per-assistant deltas + result backstop)", tokensSeen) } if tokensOut == 0 { - t.Error("Tokens.Out = 0, want > 0") + t.Error("final Tokens.Out = 0, want > 0") } } @@ -385,6 +390,72 @@ func TestParseClaudeOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) { } } +// TestParseClaudeOutput_EmitsInputOnlyDuringRun captures the live-token +// contract for Claude: the assistant envelopes' usage block carries a +// per-turn-start snapshot where output_tokens is essentially zero (1–8 +// tokens of initial decision), so we surface input only on those envelopes +// and let the terminal `result` envelope carry the true {In, Out} total. +// +// Fixture is derived from real `claude -p --output-format stream-json +// --verbose` output captured against claude-haiku-4-5: six assistant +// envelopes across three turns, each with output_tokens 1–6, then a final +// result with output_tokens 2511. If a future Claude build starts streaming +// real output token counts on assistant envelopes, this test fails and +// signals that the parser can emit live {In, Out} pairs. +func TestParseClaudeOutput_EmitsInputOnlyDuringRun(t *testing.T) { + t.Parallel() + f, err := os.Open("testdata/stream_with_deltas.jsonl") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + var events []reviewtypes.Event + for ev := range parseClaudeOutput(f) { + events = append(events, ev) + } + + var tokens []reviewtypes.Tokens + sawFinished := false + for _, e := range events { + switch ev := e.(type) { + case reviewtypes.Tokens: + if sawFinished { + t.Errorf("Tokens event arrived AFTER Finished — wrong ordering") + } + tokens = append(tokens, ev) + case reviewtypes.Finished: + sawFinished = true + } + } + if len(tokens) < 2 { + t.Fatalf("expected >=2 Tokens events (assistant snapshots + final), got %d", len(tokens)) + } + + // All Tokens BEFORE the final one are assistant-envelope snapshots: + // input is populated; Out must be exactly 0 to avoid showing the + // misleading per-turn-start initial-decision count in the TUI. + for i := range len(tokens) - 1 { + tk := tokens[i] + if tk.Out != 0 { + t.Errorf("tokens[%d].Out = %d, want 0 (assistant envelope snapshot — output not yet known)", i, tk.Out) + } + if tk.In == 0 { + t.Errorf("tokens[%d].In = 0, want >0 (assistant envelope carries input)", i) + } + } + + // The terminal Tokens event is from `result` and carries the true + // aggregate output count (2511 in the fixture). + last := tokens[len(tokens)-1] + if last.Out == 0 { + t.Error("final Tokens.Out = 0, want non-zero (result envelope final tally)") + } + if last.In == 0 { + t.Error("final Tokens.In = 0, want non-zero") + } +} + // collectEvents drains an event channel into a slice. func collectEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event { var events []reviewtypes.Event diff --git a/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl b/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl new file mode 100644 index 000000000..837fe4aa8 --- /dev/null +++ b/cmd/entire/cli/agent/claudecode/testdata/stream_with_deltas.jsonl @@ -0,0 +1,7 @@ +{"type":"system","subtype":"init","cwd":"/redacted/worktree","session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","model":"claude-haiku-4-5","permissionMode":"plan","output_style":"default","apiKeySource":"none","uuid":"redacted-uuid-1"} +{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"thinking","thinking":"Analyzing the request..."}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-2"} +{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"text","text":"I'll outline a plan first."}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-3"} +{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn1","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01","name":"Write","input":{"file_path":"plan.md","content":"plan body"}}],"stop_reason":null,"usage":{"input_tokens":10,"cache_creation_input_tokens":56267,"cache_read_input_tokens":0,"output_tokens":6,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-4"} +{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn2","type":"message","role":"assistant","content":[{"type":"text","text":"Plan created, ready to proceed."}],"stop_reason":null,"usage":{"input_tokens":5,"cache_creation_input_tokens":10066,"cache_read_input_tokens":46555,"output_tokens":1,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-5"} +{"type":"assistant","message":{"model":"claude-haiku-4-5-20251001","id":"msg_turn3","type":"message","role":"assistant","content":[{"type":"text","text":"Found 3 issues."}],"stop_reason":null,"usage":{"input_tokens":6,"cache_creation_input_tokens":107,"cache_read_input_tokens":56621,"output_tokens":2,"service_tier":"standard"}},"session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","uuid":"redacted-uuid-6"} +{"type":"result","subtype":"success","is_error":false,"duration_ms":29272,"num_turns":3,"result":"Found 3 issues.","stop_reason":"end_turn","session_id":"a905e63f-aaaa-aaaa-aaaa-aaaaaaaaaaaa","total_cost_usd":0.105,"usage":{"input_tokens":21,"cache_creation_input_tokens":66440,"cache_read_input_tokens":103176,"output_tokens":2511,"service_tier":"standard"},"uuid":"redacted-uuid-7"} diff --git a/cmd/entire/cli/agent/codex/AGENT.md b/cmd/entire/cli/agent/codex/AGENT.md index 0480d6592..dd4dbe687 100644 --- a/cmd/entire/cli/agent/codex/AGENT.md +++ b/cmd/entire/cli/agent/codex/AGENT.md @@ -204,3 +204,11 @@ The `systemMessage` field can be used to display messages to the user via the ag - JSON schemas at `codex-rs/hooks/schema/generated/` in the Codex repository - Hook config structure at `codex-rs/hooks/src/engine/config.rs` in the Codex repository + +## Review integration (`entire review`) + +Codex review runs via `codex exec --skip-git-repo-check --json [-m ] [-c model_reasoning_effort=] -` (prompt on stdin). **`codex exec` fires no lifecycle hooks**, which shapes the whole integration (see CLAUDE.md → `entire review` → "Codex specifics"): + +- **Skills are passed verbatim, not paraphrased.** Codex injects its installed-skill catalog into every exec session and loads the matching `SKILL.md`; configured skills use codex's `$name` / `$plugin:name` form (`DiscoverReviewSkills` in `discovery.go`). Native `codex exec review` is not used — it rejects a prompt under a scope flag and can't carry Entire's scope/per-run/checkpoint context. +- **Live tokens come from the rollout file, not stdout.** `codex exec --json` carries `usage` only on the terminal `turn.completed`, and a review is a single turn. `review_tokens.go` resolves the rollout transcript by `thread_id` (from the `thread.started` envelope), tails it (the same `~/.codex/.../rollout-*-.jsonl` documented under Transcript above), and emits cumulative `Tokens` per `token_count` event — the source codex's interactive UI reads. +- **No tagged review session.** Because no hook fires, codex's session is never tagged `KindAgentReview`. The fix manifest therefore sources codex from its **live run output** (`run.Buffer`), and `entire review fix` skill verification is advisory for codex (loose description match), not a hard block. diff --git a/cmd/entire/cli/agent/codex/discovery.go b/cmd/entire/cli/agent/codex/discovery.go index b16981e27..f524fbd4f 100644 --- a/cmd/entire/cli/agent/codex/discovery.go +++ b/cmd/entire/cli/agent/codex/discovery.go @@ -2,14 +2,51 @@ package codex import ( "context" + "log/slog" + "os" + "path/filepath" "github.com/entireio/cli/cmd/entire/cli/agent" + "github.com/entireio/cli/cmd/entire/cli/agent/skilldiscovery" + "github.com/entireio/cli/cmd/entire/cli/logging" ) -// DiscoverReviewSkills is a stub until the Codex on-disk plugin layout is -// verified against codex-rs source (see codex-rs/tui/src/slash_command.rs). -// Returns (nil, nil) so the picker treats Codex as "built-ins + install -// hint only" for Phase 1. -func (c *CodexAgent) DiscoverReviewSkills(_ context.Context) ([]agent.DiscoveredSkill, error) { - return nil, nil +// DiscoverReviewSkills walks codex's on-disk skill layout looking for +// review-adjacent skills. Returns (nil, nil) when HOME is unreadable or the +// directories are missing — discovery is best-effort. +// +// Codex exposes skills as //SKILL.md (same frontmatter shape as +// Claude). Three roots contribute, mirroring codex's own injected skills +// catalog: +// - ~/.codex/skills// → user skills ($name) +// - ~/.codex/plugins/cache//

//skills// → plugin skills ($p:name) +// - ~/.codex/superpowers/skills// → superpowers ($superpowers:name) +// +// Skills are emitted in codex's dollar invocation form ($name / $plugin:name) — +// the literal token a user types to invoke the skill in the codex CLI — so the +// review prompt names skills exactly the way codex's skill system expects, +// loading the real SKILL.md rather than relying on a loose description match. +// +//nolint:unparam // error return is part of SkillDiscoverer contract; future implementations may report hard failures +func (c *CodexAgent) DiscoverReviewSkills(ctx context.Context) ([]agent.DiscoveredSkill, error) { + home, err := os.UserHomeDir() + if err != nil { + logging.Debug(ctx, "codex discovery: UserHomeDir failed", slog.String("error", err.Error())) + return nil, nil + } + codexHome := filepath.Join(home, ".codex") + + form := skilldiscovery.DollarForm + var found []agent.DiscoveredSkill + found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(codexHome, "skills"), "", form)...) + found = append(found, skilldiscovery.ScanPluginCache(ctx, filepath.Join(codexHome, "plugins", "cache"), + func(versionRoot, pluginName string) []agent.DiscoveredSkill { + return skilldiscovery.ScanSkillsDir(ctx, filepath.Join(versionRoot, "skills"), pluginName, form) + })...) + found = append(found, skilldiscovery.ScanSkillsDir(ctx, filepath.Join(codexHome, "superpowers", "skills"), "superpowers", form)...) + found = skilldiscovery.DedupeByInvocation(found) + if len(found) == 0 { + return nil, nil + } + return found, nil } diff --git a/cmd/entire/cli/agent/codex/discovery_test.go b/cmd/entire/cli/agent/codex/discovery_test.go index 4b8d39633..f9e2e3d03 100644 --- a/cmd/entire/cli/agent/codex/discovery_test.go +++ b/cmd/entire/cli/agent/codex/discovery_test.go @@ -2,6 +2,8 @@ package codex_test import ( "context" + "os" + "path/filepath" "testing" "github.com/entireio/cli/cmd/entire/cli/agent" @@ -11,14 +13,102 @@ import ( // Compile-time pin: CodexAgent must satisfy SkillDiscoverer. var _ agent.SkillDiscoverer = (*codex.CodexAgent)(nil) -func TestCodexAgent_DiscoverReviewSkills_Stub(t *testing.T) { - t.Parallel() - a := &codex.CodexAgent{} - skills, err := a.DiscoverReviewSkills(context.Background()) +// withFakeHome points HOME at a temp dir so discovery walks an empty, +// controlled ~/.codex tree. Uses t.Setenv, so callers must NOT t.Parallel. +func withFakeHome(t *testing.T) string { + t.Helper() + home := t.TempDir() + t.Setenv("HOME", home) + return home +} + +// writeSkill creates //SKILL.md with the given frontmatter name +// and description. +func writeSkill(t *testing.T, root, dir, name, description string) { + t.Helper() + skillDir := filepath.Join(root, dir) + if err := os.MkdirAll(skillDir, 0o755); err != nil { + t.Fatal(err) + } + content := "---\nname: " + name + "\ndescription: " + description + "\n---\n\nbody\n" + if err := os.WriteFile(filepath.Join(skillDir, "SKILL.md"), []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} + +func discover(t *testing.T) []agent.DiscoveredSkill { + t.Helper() + skills, err := (&codex.CodexAgent{}).DiscoverReviewSkills(context.Background()) if err != nil { - t.Fatalf("stub should not error; got %v", err) + t.Fatalf("unexpected error: %v", err) + } + return skills +} + +func nameOf(skills []agent.DiscoveredSkill, want string) bool { + for _, s := range skills { + if s.Name == want { + return true + } + } + return false +} + +func TestCodexAgent_DiscoverReviewSkills_NoSkillsReturnsNilNil(t *testing.T) { + // Cannot t.Parallel — uses t.Setenv. + withFakeHome(t) + if skills := discover(t); skills != nil { + t.Errorf("skills = %v, want nil", skills) + } +} + +func TestCodexAgent_DiscoverReviewSkills_FindsUserSkillInDollarForm(t *testing.T) { + home := withFakeHome(t) + writeSkill(t, filepath.Join(home, ".codex", "skills"), "code-reviewer", "code-reviewer", + "Review code changes with an emphasis on correctness.") + + skills := discover(t) + if len(skills) != 1 { + t.Fatalf("skills count = %d, want 1: %+v", len(skills), skills) + } + if skills[0].Name != "$code-reviewer" { + t.Errorf("Name = %q, want $code-reviewer", skills[0].Name) + } +} + +func TestCodexAgent_DiscoverReviewSkills_FindsPluginSkillNamespaced(t *testing.T) { + home := withFakeHome(t) + // Opaque (non-semver) version dir, like codex's content-hash versions. + writeSkill(t, + filepath.Join(home, ".codex", "plugins", "cache", "openai-curated", "github", "fef63ecf", "skills"), + "gh-review", "gh-review", "Review a GitHub pull request.") + + skills := discover(t) + if !nameOf(skills, "$github:gh-review") { + t.Errorf("missing $github:gh-review; got %+v", skills) } - if skills != nil { - t.Errorf("stub should return nil skills; got %+v", skills) +} + +func TestCodexAgent_DiscoverReviewSkills_FindsSuperpowersSkill(t *testing.T) { + home := withFakeHome(t) + writeSkill(t, filepath.Join(home, ".codex", "superpowers", "skills"), + "receiving-code-review", "receiving-code-review", "Receive code review feedback.") + + skills := discover(t) + if !nameOf(skills, "$superpowers:receiving-code-review") { + t.Errorf("missing $superpowers:receiving-code-review; got %+v", skills) + } +} + +func TestCodexAgent_DiscoverReviewSkills_SkipsNonReviewSkill(t *testing.T) { + home := withFakeHome(t) + skillsRoot := filepath.Join(home, ".codex", "skills") + writeSkill(t, skillsRoot, "code-reviewer", "code-reviewer", "Review code changes.") + // "committer" has no review keyword in its name → filtered by Matches. + writeSkill(t, skillsRoot, "committer", "committer", "Prepare clear commit messages.") + + skills := discover(t) + if len(skills) != 1 || skills[0].Name != "$code-reviewer" { + t.Errorf("want only $code-reviewer; got %+v", skills) } } diff --git a/cmd/entire/cli/agent/codex/review_tokens.go b/cmd/entire/cli/agent/codex/review_tokens.go new file mode 100644 index 000000000..c41208b0f --- /dev/null +++ b/cmd/entire/cli/agent/codex/review_tokens.go @@ -0,0 +1,134 @@ +package codex + +import ( + "bytes" + "context" + "encoding/json" + "log/slog" + "os" + "time" + + "github.com/entireio/cli/cmd/entire/cli/logging" + reviewtypes "github.com/entireio/cli/cmd/entire/cli/review/types" +) + +// Polling/tailing cadence for the rollout token tailer. +const ( + rolloutPollInterval = 300 * time.Millisecond + rolloutPollAttempts = 100 // ~30s for codex to create the rollout file + rolloutTailInterval = 400 * time.Millisecond + rolloutReadChunk = 8192 +) + +// tailRolloutTokens resolves the codex rollout transcript for threadID and +// tails it, emitting a cumulative reviewtypes.Tokens event for every +// token_count codex writes (~once per model turn). codex's `exec --json` +// stdout only carries usage on the terminal turn.completed envelope, and a +// review is usually a single turn — so without this the TUI shows no token +// movement until the run ends. The rollout file is the same source codex's +// interactive UI reads for its live token counter. +// +// token_count.total_token_usage is a running total, so each emission is an +// absolute count — matching the TUI's overwrite-not-sum semantics. Duplicate +// totals are suppressed so we only emit on real movement. +// +// Returns when stop is closed (the stdout stream ended) or the rollout file +// never appears. The caller must wait for this to return before closing the +// event channel (see parseCodexOutputBuf), so a send here can never race a +// channel close. +func tailRolloutTokens(threadID string, out chan<- reviewtypes.Event, stop <-chan struct{}) { + ctx := context.Background() + sessionDir, err := (&CodexAgent{}).GetSessionDir("") + if err != nil { + logging.Debug(ctx, "codex token tail: session dir unresolved", slog.String("error", err.Error())) + return + } + path := waitForRollout(sessionDir, threadID, stop) + if path == "" { + return + } + f, err := os.Open(path) //nolint:gosec // path is a glob match under codex's session dir, not user input + if err != nil { + logging.Debug(ctx, "codex token tail: open rollout failed", slog.String("error", err.Error())) + return + } + defer f.Close() + + // Tail via os.File.Read rather than bufio.Reader: bufio is sticky on EOF + // and would never observe lines codex appends after we first catch up. + var pending []byte + chunk := make([]byte, rolloutReadChunk) + lastIn, lastOut := -1, -1 + ticker := time.NewTicker(rolloutTailInterval) + defer ticker.Stop() + for { + for { + n, readErr := f.Read(chunk) + if n > 0 { + pending = append(pending, chunk[:n]...) + for { + idx := bytes.IndexByte(pending, '\n') + if idx < 0 { + break + } + line := pending[:idx] + pending = pending[idx+1:] + in, outTok, ok := parseRolloutTokenCount(line) + if !ok || (in == lastIn && outTok == lastOut) { + continue + } + lastIn, lastOut = in, outTok + select { + case out <- reviewtypes.Tokens{In: in, Out: outTok}: + case <-stop: + return + } + } + } + if readErr != nil { + break // EOF or error — wait for the file to grow, then retry + } + } + select { + case <-stop: + return + case <-ticker.C: + } + } +} + +// waitForRollout polls for the rollout file matching threadID, returning its +// path or "" if stop fires or the attempts are exhausted. +func waitForRollout(sessionDir, threadID string, stop <-chan struct{}) string { + for range rolloutPollAttempts { + if path := findRolloutBySessionID(sessionDir, threadID); path != "" { + return path + } + select { + case <-stop: + return "" + case <-time.After(rolloutPollInterval): + } + } + return "" +} + +// parseRolloutTokenCount extracts cumulative input/output token totals from one +// rollout JSONL line. ok is false for any line that isn't a token_count event +// carrying total_token_usage. Reuses the rolloutLine/eventMsgPayload/ +// tokenCountInfo shapes from transcript.go so the two readers can't drift. +func parseRolloutTokenCount(data []byte) (in, out int, ok bool) { + var line rolloutLine + if json.Unmarshal(data, &line) != nil || line.Type != "event_msg" { + return 0, 0, false + } + var evt eventMsgPayload + if json.Unmarshal(line.Payload, &evt) != nil || evt.Type != "token_count" || len(evt.Info) == 0 { + return 0, 0, false + } + var info tokenCountInfo + if json.Unmarshal(evt.Info, &info) != nil || info.TotalTokenUsage == nil { + return 0, 0, false + } + return info.TotalTokenUsage.InputTokens, info.TotalTokenUsage.OutputTokens, true +} diff --git a/cmd/entire/cli/agent/codex/review_tokens_test.go b/cmd/entire/cli/agent/codex/review_tokens_test.go new file mode 100644 index 000000000..4092843bc --- /dev/null +++ b/cmd/entire/cli/agent/codex/review_tokens_test.go @@ -0,0 +1,128 @@ +package codex + +import ( + "os" + "path/filepath" + "testing" + "time" + + reviewtypes "github.com/entireio/cli/cmd/entire/cli/review/types" +) + +const tailTestThreadID = "019e8d8f-9d70-7021-b8fe-2c13802e3443" + +func tokenLine(in, out int) string { + return `{"type":"event_msg","payload":{"type":"token_count","info":{"total_token_usage":` + + `{"input_tokens":` + itoa(in) + `,"output_tokens":` + itoa(out) + `}}}}` + "\n" +} + +func itoa(n int) string { + if n == 0 { + return "0" + } + var b []byte + for n > 0 { + b = append([]byte{byte('0' + n%10)}, b...) + n /= 10 + } + return string(b) +} + +func TestParseRolloutTokenCount(t *testing.T) { + t.Parallel() + in, out, ok := parseRolloutTokenCount([]byte(tokenLine(25338, 595))) + if !ok || in != 25338 || out != 595 { + t.Fatalf("token_count line: got in=%d out=%d ok=%v, want 25338/595/true", in, out, ok) + } + // Non-token_count lines are ignored. + for _, line := range []string{ + `{"type":"response_item","payload":{"type":"reasoning"}}`, + `{"type":"event_msg","payload":{"type":"agent_message"}}`, + `not json`, + ``, + } { + if _, _, ok := parseRolloutTokenCount([]byte(line)); ok { + t.Errorf("expected ok=false for %q", line) + } + } +} + +// TestTailRolloutTokens_TailsAppendedLines is the core behavior: the tailer +// must emit Tokens for token_count lines that codex appends *after* the tailer +// has already caught up to EOF (a plain bufio.Reader would miss these). +func TestTailRolloutTokens_TailsAppendedLines(t *testing.T) { + // Cannot t.Parallel — uses t.Setenv. + dir := t.TempDir() + t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", dir) + + rollout := filepath.Join(dir, "rollout-2026-06-03T08-57-39-"+tailTestThreadID+".jsonl") + if err := os.WriteFile(rollout, []byte(tokenLine(25338, 595)), 0o644); err != nil { + t.Fatal(err) + } + + out := make(chan reviewtypes.Event, 16) + stop := make(chan struct{}) + done := make(chan struct{}) + go func() { + tailRolloutTokens(tailTestThreadID, out, stop) + close(done) + }() + defer func() { + close(stop) + <-done + }() + + first := awaitTokens(t, out) + if first.In != 25338 || first.Out != 595 { + t.Fatalf("first tokens = %+v, want {25338, 595}", first) + } + + // Append a second token_count after the tailer caught up — it must see it. + f, err := os.OpenFile(rollout, os.O_APPEND|os.O_WRONLY, 0o644) + if err != nil { + t.Fatal(err) + } + if _, err := f.WriteString(tokenLine(52798, 1123)); err != nil { + t.Fatal(err) + } + _ = f.Close() + + second := awaitTokens(t, out) + if second.In != 52798 || second.Out != 1123 { + t.Fatalf("second tokens = %+v, want {52798, 1123} (appended line not tailed)", second) + } +} + +// awaitTokens waits for the next Tokens event or fails on timeout. +func awaitTokens(t *testing.T, out <-chan reviewtypes.Event) reviewtypes.Tokens { + t.Helper() + timeout := time.After(5 * time.Second) + for { + select { + case ev := <-out: + if tk, ok := ev.(reviewtypes.Tokens); ok { + return tk + } + case <-timeout: + t.Fatal("timed out waiting for a Tokens event") + } + } +} + +func TestTailRolloutTokens_ReturnsOnStopWhenNoRollout(t *testing.T) { + // Cannot t.Parallel — uses t.Setenv. + t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", t.TempDir()) + out := make(chan reviewtypes.Event, 4) + stop := make(chan struct{}) + done := make(chan struct{}) + go func() { + tailRolloutTokens(tailTestThreadID, out, stop) + close(done) + }() + close(stop) + select { + case <-done: + case <-time.After(5 * time.Second): + t.Fatal("tailRolloutTokens did not return promptly after stop with no rollout file") + } +} diff --git a/cmd/entire/cli/agent/codex/reviewer.go b/cmd/entire/cli/agent/codex/reviewer.go index 0567056ef..f05937f9c 100644 --- a/cmd/entire/cli/agent/codex/reviewer.go +++ b/cmd/entire/cli/agent/codex/reviewer.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "strings" + "sync" "github.com/entireio/cli/cmd/entire/cli/logging" "github.com/entireio/cli/cmd/entire/cli/review" @@ -18,7 +19,8 @@ import ( // NewReviewer returns the AgentReviewer for codex. // -// Argv shape: codex exec --skip-git-repo-check --json -. +// Argv shape: codex exec --skip-git-repo-check --json [-m ] +// [-c model_reasoning_effort=] -. // Prompt is piped via stdin (the trailing "-" tells codex to read from stdin). // Stdout is newline-delimited JSON envelopes (one event per line); no chrome // filter needed — each line is parsed directly into an Event. @@ -32,37 +34,44 @@ func NewReviewer() *reviewtypes.ReviewerTemplate { // buildCodexReviewCmd builds the exec.Cmd for a codex review run. // Exposed at package level for test inspection of argv, stdin, and env. +// +// Configured skills are passed through verbatim — NOT paraphrased. Codex's +// skill system injects a catalog of installed skills into every exec session +// and loads the matching SKILL.md when the prompt names a skill (codex's +// `$Skill` form, or plain-text/description match), so the agent runs codex's +// real review workflow rather than a generic restatement of it. An older +// version replaced `/review` with 28 words of generic instruction, which +// obscured the skill signal and degraded review quality. +// +// Native `codex exec review` is intentionally NOT used: it rejects a prompt +// when a scope flag is set and codex hooks don't fire during exec, leaving no +// channel for Entire's scope clause, per-run prompt, and checkpoint context. +// Plain `codex exec -` with the composed prompt on stdin runs the same skill +// while carrying our customization. +// +// Per-spawn overrides from RunConfig.{Model, ReasoningEffort} translate to +// codex CLI flags `-m ` and `-c model_reasoning_effort=`, +// inserted before the trailing `-` stdin marker. Empty values are skipped — +// codex falls back to whatever the user's ~/.codex/config.toml configures. func buildCodexReviewCmd(ctx context.Context, cfg reviewtypes.RunConfig) *exec.Cmd { - promptCfg := cfg - promptCfg.Skills = expandCodexBuiltinReview(cfg.Skills) - args := []string{codexExecCommand, "--skip-git-repo-check", "--json", "-"} - prompt := review.ComposeReviewPrompt(promptCfg) + args := []string{codexExecCommand, "--skip-git-repo-check", "--json"} + if cfg.Model != "" { + args = append(args, "-m", cfg.Model) + } + if cfg.ReasoningEffort != "" { + args = append(args, "-c", "model_reasoning_effort="+cfg.ReasoningEffort) + } + args = append(args, "-") + + prompt := review.ComposeReviewPrompt(cfg) cmd := exec.CommandContext(ctx, "codex", args...) cmd.Stdin = strings.NewReader(prompt) cmd.Env = review.AppendReviewEnv(os.Environ(), "codex", cfg, prompt) return cmd } -// Codex's native `exec review --base ` rejects an additional prompt, -// so expand `/review` into text and run normal `codex exec -`. That preserves -// Entire's scoped base clause, per-run instructions, and checkpoint context. -const codexBuiltinReviewPrompt = "Review the current branch changes and report actionable findings. " + - "Prioritize correctness, regressions, security, and missing test coverage. Do not make code changes." - const codexExecCommand = "exec" -func expandCodexBuiltinReview(skills []string) []string { - out := make([]string, 0, len(skills)) - for _, skill := range skills { - if skill == "/review" { - out = append(out, codexBuiltinReviewPrompt) - continue - } - out = append(out, skill) - } - return out -} - // parseCodexOutput converts codex's `exec --json` stdout into a stream of // Events. Each stdout line is one JSON envelope (top-level "type" field). // @@ -79,8 +88,16 @@ func expandCodexBuiltinReview(skills []string) []string { // On a scanner error or a missing turn.completed envelope, emits RunError // (scanner) or Finished{Success: false} (missing turn) accordingly. // -// Tokens are emitted only at the terminal `turn.completed` envelope, not -// incrementally — codex's usage fields land once at end-of-turn. +// Live-token semantics: codex's `--json` output carries `usage` ONLY on +// `turn.completed` envelopes. Verified against codex-cli 0.130.0 stdout +// for both short and long prompts — no intermediate envelope +// (item.started, item.completed, etc.) carries a usage block. Codex's +// on-disk session log (the `event_msg{type:"token_count"}` shape the +// transcript parser consumes) is a separate format, not surfaced +// through `exec --json`. +// +// Tokens are emitted at every `turn.completed` envelope so multi-turn +// runs show iterative updates in the TUI. // // Package-private; called directly from this package's tests so they can // drive raw stdout fixtures through the parser without going through the @@ -104,10 +121,20 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { out := make(chan reviewtypes.Event, 32) go func() { defer close(out) + // The rollout token tailer (started on thread.started) runs concurrently + // and also sends on out. Stop it and wait for it to exit BEFORE close(out) + // so its send can never hit a closed channel. defer is LIFO, so this runs + // before the close(out) registered above. + stop := make(chan struct{}) + var tailWG sync.WaitGroup + defer func() { + close(stop) + tailWG.Wait() + }() out <- reviewtypes.Started{} scanner := bufio.NewScanner(r) scanner.Buffer(make([]byte, min(1024*1024, maxBuf)), maxBuf) - var seenTurnComplete bool + var seenTurnComplete, emittedTokens, tailerStarted bool var turnUsage codexUsage for scanner.Scan() { line := scanner.Bytes() @@ -123,8 +150,21 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { // default arm logs unknown types at Debug so drift can be // triaged via ENTIRE_LOG_LEVEL=debug. switch env.Type { - case "thread.started", "turn.started": - // Session/turn markers — no event emitted. + case "thread.started": + // Launch the rollout token tailer once. codex's exec --json stdout + // only carries usage on the terminal turn.completed, so we tail the + // rollout file (located by thread_id) for live per-turn token totals + // — the same source codex's interactive UI reads. + if !tailerStarted && env.ThreadID != "" { + tailerStarted = true + tailWG.Add(1) + go func(id string) { + defer tailWG.Done() + tailRolloutTokens(id, out, stop) + }(env.ThreadID) + } + case "turn.started": + // Turn marker — no event emitted. case "item.started": if env.Item.Type == "command_execution" { out <- reviewtypes.ToolCall{Name: "exec", Args: env.Item.Command} @@ -136,9 +176,26 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { // command_execution completion is intentionally swallowed — // item.started already announced it. aggregated_output is the // tool's stdout, not the model's narrative. + // + // No Tokens emission here: codex --json only carries usage on + // `turn.completed`. An earlier version of this parser also + // emitted Tokens here in case a future codex build attached + // a usage block, but that branch was unreachable against real + // codex output and is gone — emit on turn.completed only. case "turn.completed": seenTurnComplete = true turnUsage = env.Usage + // Emit Tokens at every turn boundary so multi-turn reviews + // show iterative updates in the TUI. The post-loop emission + // is kept as a backstop for streams that ended without a + // turn.completed (defensive — shouldn't happen in practice). + if env.Usage.InputTokens > 0 || env.Usage.OutputTokens > 0 { + out <- reviewtypes.Tokens{ + In: env.Usage.InputTokens, + Out: env.Usage.OutputTokens, + } + emittedTokens = true + } default: logging.Debug(context.Background(), "codex parser: unknown envelope type", slog.String("type", env.Type)) @@ -150,13 +207,22 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { return } if seenTurnComplete { + // Defensive backstop: the per-turn emission inside the + // `case "turn.completed"` arm above is the primary path and + // already emitted Tokens for every turn with non-zero usage. + // This fires only if no per-turn emission happened (e.g., a + // terminal turn.completed envelope arrived without a usage + // block — defensive against codex output drift). + // // codex reports cached_input_tokens as a subset of input_tokens // and reasoning_output_tokens as a subset of output_tokens // (matching OpenAI's chat-completions usage shape), so do NOT // sum the subset fields — that would double-count. - out <- reviewtypes.Tokens{ - In: turnUsage.InputTokens, - Out: turnUsage.OutputTokens, + if !emittedTokens { + out <- reviewtypes.Tokens{ + In: turnUsage.InputTokens, + Out: turnUsage.OutputTokens, + } } // Success is hard-coded true here because codex's `turn.completed` // envelope has no turn-level error field in 0.130.0. If a future @@ -173,9 +239,10 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { } type codexEnvelope struct { - Type string `json:"type"` - Item codexItem `json:"item"` - Usage codexUsage `json:"usage"` + Type string `json:"type"` + ThreadID string `json:"thread_id"` // present on thread.started; locates the rollout file + Item codexItem `json:"item"` + Usage codexUsage `json:"usage"` } type codexItem struct { diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go index 6a7290c65..40399f559 100644 --- a/cmd/entire/cli/agent/codex/reviewer_test.go +++ b/cmd/entire/cli/agent/codex/reviewer_test.go @@ -100,7 +100,7 @@ func TestCodexReviewer_ArgvShape(t *testing.T) { } } -func TestCodexReviewer_BuiltinReviewExpandsToScopedExecPrompt(t *testing.T) { +func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) { t.Parallel() cfg := reviewtypes.RunConfig{ Skills: []string{"/review"}, @@ -121,18 +121,49 @@ func TestCodexReviewer_BuiltinReviewExpandsToScopedExecPrompt(t *testing.T) { } prompt := readCodexCmdStdin(t, cmd) - if strings.Contains(prompt, "/review") { - t.Fatalf("builtin review prompt should not include raw /review:\n%s", prompt) + // The configured skill must reach codex verbatim so codex's skill system + // loads the real SKILL.md — not a generic paraphrase. + if !strings.Contains(prompt, "/review") { + t.Fatalf("codex prompt should pass /review through verbatim:\n%s", prompt) + } + if strings.Contains(prompt, "Review the current branch changes and report actionable findings.") { + t.Fatalf("codex prompt must not paraphrase /review:\n%s", prompt) } for _, wantText := range []string{ - "Review the current branch changes and report actionable findings.", "Focus on auth regressions.", "Scope: review the commits unique to this branch vs main, plus any uncommitted changes in the working tree. Ignore code outside this scope.", "Commits in scope (newest first):", "abc123 summary", } { if !strings.Contains(prompt, wantText) { - t.Fatalf("builtin review prompt missing %q:\n%s", wantText, prompt) + t.Fatalf("codex prompt missing %q:\n%s", wantText, prompt) + } + } +} + +func TestCodexReviewer_ModelAndReasoningEffortFlags(t *testing.T) { + t.Parallel() + cfg := reviewtypes.RunConfig{ + Skills: []string{"/review"}, + Model: "gpt-5-mini", + ReasoningEffort: "low", + } + cmd := buildCodexReviewCmd(context.Background(), cfg) + + // -m and -c are inserted before the trailing stdin marker; empty + // overrides are skipped (covered by the other argv tests). + want := []string{ + wantCodexAgentName, "exec", "--skip-git-repo-check", "--json", + "-m", "gpt-5-mini", + "-c", "model_reasoning_effort=low", + "-", + } + if len(cmd.Args) != len(want) { + t.Fatalf("len(Args) = %d, want %d: %v", len(cmd.Args), len(want), cmd.Args) + } + for i, w := range want { + if cmd.Args[i] != w { + t.Errorf("Args[%d] = %q, want %q", i, cmd.Args[i], w) } } } @@ -388,6 +419,52 @@ func TestParseCodexOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) { } } +// TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted locks the live-token +// contract for codex: its --json output carries `usage` on every +// `turn.completed` envelope, and the parser emits Tokens at each turn +// boundary so multi-turn reviews show iterative updates in the TUI. +// Captured by running real codex-cli 0.130.0 — no item.* envelope ever +// carried a usage field, so emission stays anchored to turn.completed. +func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) { + t.Parallel() + // Real codex --json output (lightly trimmed) simulating a multi-turn + // review: model call → tool exec → model call → tool exec, with a + // turn.completed envelope at every turn boundary carrying running + // usage. The parser should emit Tokens for each turn in order. + input := strings.Join([]string{ + `{"type":"thread.started","thread_id":"tid-1"}`, + `{"type":"turn.started"}`, + `{"type":"item.started","item":{"id":"item_0","type":"command_execution","command":"ls","aggregated_output":"","exit_code":null,"status":"in_progress"}}`, + `{"type":"item.completed","item":{"id":"item_0","type":"command_execution","command":"ls","aggregated_output":"a\nb\nc","exit_code":0,"status":"completed"}}`, + `{"type":"item.completed","item":{"id":"item_1","type":"agent_message","text":"Found three files."}}`, + `{"type":"turn.completed","usage":{"input_tokens":34317,"cached_input_tokens":19712,"output_tokens":240,"reasoning_output_tokens":114}}`, + `{"type":"turn.started"}`, + `{"type":"item.started","item":{"id":"item_2","type":"command_execution","command":"cat a","aggregated_output":"","exit_code":null,"status":"in_progress"}}`, + `{"type":"item.completed","item":{"id":"item_2","type":"command_execution","command":"cat a","aggregated_output":"hello","exit_code":0,"status":"completed"}}`, + `{"type":"item.completed","item":{"id":"item_3","type":"agent_message","text":"Done."}}`, + `{"type":"turn.completed","usage":{"input_tokens":35820,"cached_input_tokens":20114,"output_tokens":401,"reasoning_output_tokens":160}}`, + "", + }, "\n") + + var tokens []reviewtypes.Tokens + for ev := range parseCodexOutput(strings.NewReader(input)) { + if tk, ok := ev.(reviewtypes.Tokens); ok { + tokens = append(tokens, tk) + } + } + + if len(tokens) != 2 { + t.Fatalf("Tokens count = %d, want exactly 2 (one per turn.completed); got events: %+v", + len(tokens), tokens) + } + if tokens[0].In != 34317 || tokens[0].Out != 240 { + t.Errorf("tokens[0] = %+v, want {In:34317, Out:240}", tokens[0]) + } + if tokens[1].In != 35820 || tokens[1].Out != 401 { + t.Errorf("tokens[1] = %+v, want {In:35820, Out:401}", tokens[1]) + } +} + func collectCodexEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event { var events []reviewtypes.Event for ev := range ch { diff --git a/cmd/entire/cli/agent/skilldiscovery/registry.go b/cmd/entire/cli/agent/skilldiscovery/registry.go index bc5938309..0c4b097b6 100644 --- a/cmd/entire/cli/agent/skilldiscovery/registry.go +++ b/cmd/entire/cli/agent/skilldiscovery/registry.go @@ -31,7 +31,12 @@ var curatedBuiltins = map[string][]CuratedSkill{ {Name: "/security-review", Desc: "Scan git diff for security issues"}, {Name: "/simplify", Desc: "Review recent changes for code quality"}, }, - "codex": {{Name: "/review", Desc: "Review current changes and find issues"}}, + // Codex has no binary-bundled review command usable from `codex exec`: + // built-in slash commands like `/review` only fire in the interactive TUI, + // not when piped through exec. Codex's review skills (code-reviewer, + // review-swarm, …) live on disk and are surfaced by DiscoverReviewSkills in + // $name form, so there are no curated built-ins to hardcode here. + "codex": {}, "gemini": {}, } @@ -42,10 +47,14 @@ var curatedBuiltins = map[string][]CuratedSkill{ // Install commands below are placeholders until marketplace URLs are pinned. // Tests do not assert on Message text — only on ProvidesAny semantics — so // prose revisions do not break the suite. +// +// Messages must stay backtick-free: the picker renders them through huh, which +// treats the text as markdown and mangles backtick-wrapped code spans in the +// terminal. Use plain text / colons to set off commands instead. var installHints = map[string][]InstallHint{ "claude-code": { { - Message: "Install `pr-review-toolkit` via `claude plugin install entireio/pr-review-toolkit`", + Message: "Install pr-review-toolkit: claude plugin install entireio/pr-review-toolkit", ProvidesAny: []string{ "/pr-review-toolkit:review-pr", "/pr-review-toolkit:code-reviewer", @@ -53,19 +62,19 @@ var installHints = map[string][]InstallHint{ }, }, { - Message: "Install `test-auditor` via the superpowers plugin", + Message: "Install test-auditor via the superpowers plugin", ProvidesAny: []string{"/test-auditor"}, }, }, "codex": { { - Message: "Install `codex-review-pack` via `codex plugins add `", + Message: "Install codex-review-pack: codex plugins add ", ProvidesAny: []string{"/codex:adversarial-review"}, }, }, "gemini": { { - Message: "Install `gemini-code-review` via `gemini extensions install `", + Message: "Install gemini-code-review: gemini extensions install ", ProvidesAny: nil, }, }, diff --git a/cmd/entire/cli/agent/skilldiscovery/registry_test.go b/cmd/entire/cli/agent/skilldiscovery/registry_test.go index 22f45e533..b38bbc6d4 100644 --- a/cmd/entire/cli/agent/skilldiscovery/registry_test.go +++ b/cmd/entire/cli/agent/skilldiscovery/registry_test.go @@ -12,9 +12,11 @@ func TestCuratedBuiltinsFor_KnownAgents(t *testing.T) { if len(claude) != 3 { t.Fatalf("claude-code built-ins: got %d entries, want 3", len(claude)) } + // Codex has no binary-bundled review command usable from `codex exec`; + // its review skills are discovered on disk in $name form instead. codex := skilldiscovery.CuratedBuiltinsFor("codex") - if len(codex) != 1 || codex[0].Name != "/review" { - t.Errorf("codex built-ins: got %+v, want 1x /review", codex) + if len(codex) != 0 { + t.Errorf("codex built-ins: got %+v, want 0 (discovery-driven)", codex) } gemini := skilldiscovery.CuratedBuiltinsFor("gemini") if len(gemini) != 0 { diff --git a/cmd/entire/cli/agent/skilldiscovery/scan.go b/cmd/entire/cli/agent/skilldiscovery/scan.go new file mode 100644 index 000000000..fe2039f1b --- /dev/null +++ b/cmd/entire/cli/agent/skilldiscovery/scan.go @@ -0,0 +1,257 @@ +package skilldiscovery + +import ( + "context" + "errors" + "log/slog" + "os" + "path/filepath" + "sort" + "strings" + + "golang.org/x/mod/semver" + + "github.com/entireio/cli/cmd/entire/cli/agent" + "github.com/entireio/cli/cmd/entire/cli/logging" +) + +// InvocationForm builds an agent's invocation string for a discovered skill. +// The only thing that differs between agents is the prefix and namespace +// joiner: Claude Code uses slash form (`/name`, `/plugin:name`), codex uses +// dollar form (`$name`, `$plugin:name`) — the literal token a user types to +// invoke the skill in that CLI. Discovery emits Name already in this form so +// downstream prompt composition stays agent-agnostic and joins verbatim. +type InvocationForm func(name, pluginName string) string + +// SlashForm is Claude Code's invocation syntax: "/name" or "/plugin:name". +func SlashForm(name, pluginName string) string { + if pluginName == "" { + return "/" + name + } + return "/" + pluginName + ":" + name +} + +// DollarForm is codex's invocation syntax: "$name" or "$plugin:name". This is +// the explicit "use this skill" token from codex's own injected skills +// catalog ("name a skill with $SkillName or plain text"). +func DollarForm(name, pluginName string) string { + if pluginName == "" { + return "$" + name + } + return "$" + pluginName + ":" + name +} + +// DedupeByInvocation collapses entries sharing an invocation name, keeping the +// first occurrence. Plugins can ship a skill and a same-named wrapper that +// forwards to it; scan order decides which wins. +func DedupeByInvocation(in []agent.DiscoveredSkill) []agent.DiscoveredSkill { + if len(in) < 2 { + return in + } + seen := make(map[string]struct{}, len(in)) + out := make([]agent.DiscoveredSkill, 0, len(in)) + for _, s := range in { + if _, dup := seen[s.Name]; dup { + continue + } + seen[s.Name] = struct{}{} + out = append(out, s) + } + return out +} + +// ScanPluginCache walks //// and invokes +// scanVersion once per plugin, for the single version directory chosen by +// PickLatestVersion. The callback receives the chosen version root and the +// plugin name (used as the invocation namespace). Both Claude Code and codex +// use this same market/plugin/version cache layout; they differ only in which +// subdirectories under the version root they scan and their invocation form. +func ScanPluginCache(ctx context.Context, root string, scanVersion func(versionRoot, pluginName string) []agent.DiscoveredSkill) []agent.DiscoveredSkill { + entries, err := os.ReadDir(root) + if err != nil { + logging.Debug(ctx, "skill discovery: plugin cache unreadable", + slog.String("root", root), slog.String("error", err.Error())) + return nil + } + var found []agent.DiscoveredSkill + for _, marketEntry := range entries { + if !marketEntry.IsDir() { + continue + } + marketRoot := filepath.Join(root, marketEntry.Name()) + pluginEntries, err := os.ReadDir(marketRoot) + if err != nil { + continue + } + for _, pluginEntry := range pluginEntries { + if !pluginEntry.IsDir() { + continue + } + pluginName := pluginEntry.Name() + pluginRoot := filepath.Join(marketRoot, pluginName) + versionEntries, err := os.ReadDir(pluginRoot) + if err != nil { + continue + } + versionDir, ok := PickLatestVersion(versionEntries) + if !ok { + continue + } + found = append(found, scanVersion(filepath.Join(pluginRoot, versionDir), pluginName)...) + } + } + return found +} + +// PickLatestVersion returns the "newest" version directory name among entries: +// +// - If any entry parses as semver (with or without a leading "v"), pick the +// highest semver; non-semver entries are ignored when a semver exists. +// - Otherwise fall back to the lexicographic max of all directory names. +// This handles the "unknown" sentinel some plugins ship, and the opaque +// content-hash version dirs codex plugins use (e.g. "fef63ecf"). +// +// Returns ("", false) if no usable directory entry exists. +func PickLatestVersion(entries []os.DirEntry) (string, bool) { + var dirs []string + for _, e := range entries { + if e.IsDir() { + dirs = append(dirs, e.Name()) + } + } + if len(dirs) == 0 { + return "", false + } + var semverDirs []string + for _, d := range dirs { + if semver.IsValid(semverWithV(d)) { + semverDirs = append(semverDirs, d) + } + } + if len(semverDirs) > 0 { + sort.Slice(semverDirs, func(i, j int) bool { + return semver.Compare(semverWithV(semverDirs[i]), semverWithV(semverDirs[j])) > 0 + }) + return semverDirs[0], true + } + sort.Sort(sort.Reverse(sort.StringSlice(dirs))) + return dirs[0], true +} + +// semverWithV ensures a version string has the "v" prefix golang.org/x/mod/semver +// requires. Plugin version dirs are usually bare (e.g. "0.1.0"). +func semverWithV(s string) string { + if strings.HasPrefix(s, "v") { + return s + } + return "v" + s +} + +// ScanSkillsDir reads each

//SKILL.md, parses its frontmatter, and +// emits a DiscoveredSkill (in invoke's form) when Matches() returns true. +// pluginName is the namespace ("" for un-namespaced user skills). Missing dirs +// yield nil — discovery is best-effort. +func ScanSkillsDir(ctx context.Context, dir, pluginName string, invoke InvocationForm) []agent.DiscoveredSkill { + entries, err := os.ReadDir(dir) + if err != nil { + return nil + } + var found []agent.DiscoveredSkill + for _, skillEntry := range entries { + if !skillEntry.IsDir() { + continue + } + skillFile := filepath.Join(dir, skillEntry.Name(), "SKILL.md") + data, err := os.ReadFile(skillFile) //nolint:gosec // G304: skillFile is built from a ReadDir walk under HOME, not user input + if err != nil { + continue + } + name, description, parseErr := ParseSkillFrontmatter(data) + if parseErr != nil { + logging.Debug(ctx, "skill discovery: skipping malformed SKILL.md", + slog.String("path", skillFile), slog.String("error", parseErr.Error())) + continue + } + if name == "" { + name = skillEntry.Name() + } + invocation := invoke(name, pluginName) + if !Matches(invocation, description) { + continue + } + found = append(found, agent.DiscoveredSkill{ + Name: invocation, + Description: description, + SourcePath: skillFile, + }) + } + return found +} + +// ScanFlatMarkdownDir reads *.md files directly under dir (no nesting), parses +// their frontmatter for `description:`, and derives the invocation name from +// the filename (minus .md). Used by Claude Code for plugin/user commands and +// agents, whose frontmatter has no `name:` field. README.md is skipped. +func ScanFlatMarkdownDir(ctx context.Context, dir, pluginName string, invoke InvocationForm) []agent.DiscoveredSkill { + entries, err := os.ReadDir(dir) + if err != nil { + return nil + } + var found []agent.DiscoveredSkill + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".md") { + continue + } + baseName := strings.TrimSuffix(entry.Name(), ".md") + if strings.EqualFold(baseName, "README") { + continue + } + filePath := filepath.Join(dir, entry.Name()) + data, err := os.ReadFile(filePath) //nolint:gosec // G304: filePath is built from a ReadDir walk under HOME, not user input + if err != nil { + continue + } + _, description, parseErr := ParseSkillFrontmatter(data) + if parseErr != nil { + logging.Debug(ctx, "skill discovery: skipping malformed command/agent", + slog.String("path", filePath), slog.String("error", parseErr.Error())) + continue + } + invocation := invoke(baseName, pluginName) + if !Matches(invocation, description) { + continue + } + found = append(found, agent.DiscoveredSkill{ + Name: invocation, + Description: description, + SourcePath: filePath, + }) + } + return found +} + +// ParseSkillFrontmatter extracts `name:` and `description:` from a minimal YAML +// frontmatter block — the tiny subset these SKILL.md / command / agent files +// use. Surrounding double-quotes are trimmed so `description: "foo"` returns +// `foo`. +func ParseSkillFrontmatter(data []byte) (name, description string, err error) { + s := string(data) + if !strings.HasPrefix(s, "---\n") && !strings.HasPrefix(s, "---\r\n") { + return "", "", errors.New("no frontmatter delimiter") + } + body := strings.TrimPrefix(strings.TrimPrefix(s, "---\r\n"), "---\n") + end := strings.Index(body, "\n---") + if end < 0 { + return "", "", errors.New("no closing frontmatter delimiter") + } + for _, line := range strings.Split(body[:end], "\n") { + line = strings.TrimSpace(line) + switch { + case strings.HasPrefix(line, "name:"): + name = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "name:")), `"`) + case strings.HasPrefix(line, "description:"): + description = strings.Trim(strings.TrimSpace(strings.TrimPrefix(line, "description:")), `"`) + } + } + return name, description, nil +} diff --git a/cmd/entire/cli/review/types/reviewer.go b/cmd/entire/cli/review/types/reviewer.go index 97f9a631b..c9f2ea505 100644 --- a/cmd/entire/cli/review/types/reviewer.go +++ b/cmd/entire/cli/review/types/reviewer.go @@ -104,6 +104,18 @@ type RunConfig struct { // the commit that was reviewed. StartingSHA string + // Model is an optional per-spawn model override the reviewer should + // pass to its underlying CLI (e.g. codex `-m `). Empty means + // use the agent's default. Reviewers that don't support a model flag + // silently ignore this field. + Model string + + // ReasoningEffort is an optional per-spawn reasoning-effort override + // (e.g. "low", "medium", "high", "xhigh" for codex). Empty means use + // the agent's default. Reviewers that don't expose reasoning effort + // silently ignore this field. + ReasoningEffort string + // EnrichSummary optionally updates the completed run summary before sinks // receive RunFinished. It is used for post-process data such as token // totals that are only available after agent lifecycle hooks flush state. diff --git a/cmd/entire/cli/settings/settings.go b/cmd/entire/cli/settings/settings.go index 21f51265b..8dfe20ce1 100644 --- a/cmd/entire/cli/settings/settings.go +++ b/cmd/entire/cli/settings/settings.go @@ -332,11 +332,22 @@ type ReviewConfig struct { // Skills is non-empty it is appended after the selected skills; when // Skills is empty it is the full prompt for prompt-only review configs. Prompt string `json:"prompt,omitempty"` + + // Model is an optional per-spawn model override passed to the agent's + // CLI (e.g. codex `-m `). Empty falls back to the agent's own + // configured default. Agents that don't support a model flag ignore it. + Model string `json:"model,omitempty"` + + // ReasoningEffort is an optional per-spawn reasoning-effort override + // (e.g. "low", "medium", "high", "xhigh" for codex `-c + // model_reasoning_effort=`). Empty falls back to the agent's default. + // Agents without a reasoning-effort knob ignore it. + ReasoningEffort string `json:"reasoning_effort,omitempty"` } // IsZero reports whether the config is effectively unset. func (c ReviewConfig) IsZero() bool { - return len(c.Skills) == 0 && c.Prompt == "" + return len(c.Skills) == 0 && c.Prompt == "" && c.Model == "" && c.ReasoningEffort == "" } // ReviewConfigFor returns the configured review config for the given agent. From 242e9bb4e324ac036c417f5ba7a37e0749352aa9 Mon Sep 17 00:00:00 2001 From: Peyton Montei Date: Thu, 4 Jun 2026 13:45:10 -0400 Subject: [PATCH 2/4] fix(review): address #1370 bugbot findings - reviewer_test: TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted feeds a thread.started envelope, which launches the rollout token tailer. Pin ENTIRE_TEST_CODEX_SESSION_DIR to an empty temp dir (drop t.Parallel) so the tailer can't pick up a real ~/.codex rollout-*-tid-1.jsonl and inject extra Tokens that break the exactly-2 assertion. - skilldiscovery: codex install-hint suppression fingerprint was slash form (/codex:adversarial-review) but DiscoverReviewSkills emits dollar form, so ActiveInstallHintsFor never intersected and the hint showed forever after install. Use $codex:adversarial-review. Co-Authored-By: Claude Opus 4.8 (1M context) Entire-Checkpoint: 210b33059b2b --- cmd/entire/cli/agent/codex/reviewer_test.go | 6 +++++- cmd/entire/cli/agent/skilldiscovery/registry.go | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go index 40399f559..0936a09e5 100644 --- a/cmd/entire/cli/agent/codex/reviewer_test.go +++ b/cmd/entire/cli/agent/codex/reviewer_test.go @@ -426,7 +426,11 @@ func TestParseCodexOutput_GarbledLineEmitsRunErrorAndContinues(t *testing.T) { // Captured by running real codex-cli 0.130.0 — no item.* envelope ever // carried a usage field, so emission stays anchored to turn.completed. func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) { - t.Parallel() + // Not t.Parallel: the thread.started envelope launches the rollout token + // tailer, which resolves the codex session dir. Pin it to an empty temp + // dir so the tailer never finds a real ~/.codex rollout-*-tid-1.jsonl and + // injects extra Tokens events that would break the exactly-2 assertion. + t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", t.TempDir()) // Real codex --json output (lightly trimmed) simulating a multi-turn // review: model call → tool exec → model call → tool exec, with a // turn.completed envelope at every turn boundary carrying running diff --git a/cmd/entire/cli/agent/skilldiscovery/registry.go b/cmd/entire/cli/agent/skilldiscovery/registry.go index 0c4b097b6..e42ea3067 100644 --- a/cmd/entire/cli/agent/skilldiscovery/registry.go +++ b/cmd/entire/cli/agent/skilldiscovery/registry.go @@ -68,8 +68,11 @@ var installHints = map[string][]InstallHint{ }, "codex": { { - Message: "Install codex-review-pack: codex plugins add ", - ProvidesAny: []string{"/codex:adversarial-review"}, + Message: "Install codex-review-pack: codex plugins add ", + // Dollar form: codex DiscoverReviewSkills emits $plugin:name, so the + // suppression fingerprint must match that, not the slash form — else + // ActiveInstallHintsFor never intersects and the hint shows forever. + ProvidesAny: []string{"$codex:adversarial-review"}, }, }, "gemini": { From 69cc48615520d06fa7cd2c36b038b514f346d253 Mon Sep 17 00:00:00 2001 From: Peyton Montei Date: Thu, 4 Jun 2026 14:29:18 -0400 Subject: [PATCH 3/4] fix(review): address #1370 Copilot findings (codex) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Event ordering (real bug): the rollout token tailer runs concurrently and sends Tokens on the same channel as the parser. It was only stopped in a defer that runs AFTER the inline Finished emissions, so a late Tokens could follow Finished — violating ReviewerTemplate's "Finished is the last event" contract. Introduce an idempotent stopTailer (sync.Once + WaitGroup.Wait) and call it before every terminal Finished; the defer remains as a backstop. Adds a regression test (Finished is last even with a live rollout tailer), verified under -race. - Test fidelity: TestCodexReviewer_PassesSkillThroughVerbatim configured the codex skill as "/review" (claude's slash form). Codex skills are dollar-prefixed; use "$code-reviewer" so the test models real codex usage. Co-Authored-By: Claude Opus 4.8 (1M context) Entire-Checkpoint: 480a94f05ac8 --- cmd/entire/cli/agent/codex/reviewer.go | 24 ++++++++--- cmd/entire/cli/agent/codex/reviewer_test.go | 45 +++++++++++++++++++-- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/cmd/entire/cli/agent/codex/reviewer.go b/cmd/entire/cli/agent/codex/reviewer.go index f05937f9c..33efa11c1 100644 --- a/cmd/entire/cli/agent/codex/reviewer.go +++ b/cmd/entire/cli/agent/codex/reviewer.go @@ -122,15 +122,22 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { go func() { defer close(out) // The rollout token tailer (started on thread.started) runs concurrently - // and also sends on out. Stop it and wait for it to exit BEFORE close(out) - // so its send can never hit a closed channel. defer is LIFO, so this runs - // before the close(out) registered above. + // and also sends on out. stopTailer halts it and waits for it to exit. + // It MUST run before any Finished emission: ReviewerTemplate requires + // Finished to be the last event before the channel closes, so a late + // Tokens from the tailer would violate that contract (and a send after + // close(out) would panic). Idempotent (sync.Once) so it's safe to call + // explicitly before each terminal Finished AND from the defer backstop; + // the defer (LIFO, registered after close(out)) still guards early or + // unexpected returns. stop := make(chan struct{}) var tailWG sync.WaitGroup - defer func() { - close(stop) + var stopOnce sync.Once + stopTailer := func() { + stopOnce.Do(func() { close(stop) }) tailWG.Wait() - }() + } + defer stopTailer() out <- reviewtypes.Started{} scanner := bufio.NewScanner(r) scanner.Buffer(make([]byte, min(1024*1024, maxBuf)), maxBuf) @@ -202,6 +209,7 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { } } if err := scanner.Err(); err != nil { + stopTailer() out <- reviewtypes.RunError{Err: fmt.Errorf("read stdout: %w", err)} out <- reviewtypes.Finished{Success: false} return @@ -224,6 +232,9 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { Out: turnUsage.OutputTokens, } } + // Stop the tailer (and wait) before Finished so no late Tokens can + // land after the terminal event. + stopTailer() // Success is hard-coded true here because codex's `turn.completed` // envelope has no turn-level error field in 0.130.0. If a future // codex version adds one (e.g., an `error` or `is_error` field on @@ -233,6 +244,7 @@ func parseCodexOutputBuf(r io.Reader, maxBuf int) <-chan reviewtypes.Event { out <- reviewtypes.Finished{Success: true} return } + stopTailer() out <- reviewtypes.Finished{Success: false} }() return out diff --git a/cmd/entire/cli/agent/codex/reviewer_test.go b/cmd/entire/cli/agent/codex/reviewer_test.go index 0936a09e5..a82b846f2 100644 --- a/cmd/entire/cli/agent/codex/reviewer_test.go +++ b/cmd/entire/cli/agent/codex/reviewer_test.go @@ -6,6 +6,7 @@ import ( "io" "os" "os/exec" + "path/filepath" "strings" "testing" "time" @@ -102,8 +103,11 @@ func TestCodexReviewer_ArgvShape(t *testing.T) { func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) { t.Parallel() + // Codex skills are dollar-prefixed ($name / $plugin:name) — the literal + // token a user types in the codex CLI. Use that form so the test models + // real codex usage, not claude's slash form. cfg := reviewtypes.RunConfig{ - Skills: []string{"/review"}, + Skills: []string{"$code-reviewer"}, AlwaysPrompt: "Focus on auth regressions.", ScopeBaseRef: "main", CheckpointContext: "Commits in scope (newest first):\n abc123 summary\n", @@ -123,11 +127,11 @@ func TestCodexReviewer_PassesSkillThroughVerbatim(t *testing.T) { prompt := readCodexCmdStdin(t, cmd) // The configured skill must reach codex verbatim so codex's skill system // loads the real SKILL.md — not a generic paraphrase. - if !strings.Contains(prompt, "/review") { - t.Fatalf("codex prompt should pass /review through verbatim:\n%s", prompt) + if !strings.Contains(prompt, "$code-reviewer") { + t.Fatalf("codex prompt should pass $code-reviewer through verbatim:\n%s", prompt) } if strings.Contains(prompt, "Review the current branch changes and report actionable findings.") { - t.Fatalf("codex prompt must not paraphrase /review:\n%s", prompt) + t.Fatalf("codex prompt must not paraphrase the configured skill:\n%s", prompt) } for _, wantText := range []string{ "Focus on auth regressions.", @@ -469,6 +473,39 @@ func TestParseCodexOutput_EmitsTokensAtEveryTurnCompleted(t *testing.T) { } } +// Regression: the rollout token tailer runs concurrently and sends on the same +// channel. The parser must stop+join it BEFORE emitting the terminal Finished, +// so a late Tokens can never follow Finished (ReviewerTemplate requires Finished +// to be the last event). A live rollout file gives the tailer real work to emit. +func TestParseCodexOutput_FinishedIsLastEvenWithLiveTailer(t *testing.T) { + // Cannot t.Parallel — uses t.Setenv. + dir := t.TempDir() + t.Setenv("ENTIRE_TEST_CODEX_SESSION_DIR", dir) + rollout := filepath.Join(dir, "rollout-2026-06-03T08-57-39-"+tailTestThreadID+".jsonl") + content := tokenLine(100, 5) + tokenLine(200, 10) + tokenLine(300, 15) + if err := os.WriteFile(rollout, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + + input := strings.Join([]string{ + `{"type":"thread.started","thread_id":"` + tailTestThreadID + `"}`, + `{"type":"turn.completed","usage":{"input_tokens":300,"output_tokens":15}}`, + "", + }, "\n") + + var events []reviewtypes.Event + for ev := range parseCodexOutput(strings.NewReader(input)) { + events = append(events, ev) + } + if len(events) == 0 { + t.Fatal("no events emitted") + } + if _, ok := events[len(events)-1].(reviewtypes.Finished); !ok { + t.Fatalf("last event = %T, want Finished (no Tokens may follow the terminal event); events=%+v", + events[len(events)-1], events) + } +} + func collectCodexEvents(ch <-chan reviewtypes.Event) []reviewtypes.Event { var events []reviewtypes.Event for ev := range ch { From 5974ba99185050eb04ef6f0aa7556121c342333f Mon Sep 17 00:00:00 2001 From: Peyton Montei Date: Thu, 4 Jun 2026 17:48:07 -0400 Subject: [PATCH 4/4] fix(review): capture Claude reviewer model via transcript backfill (#1373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Claude review subprocesses run via `claude -p` don't reliably report a model through lifecycle hooks (the ModelUpdate path is flaky in headless mode), so per-session checkpoint metadata landed with "model": "" for Claude reviewer sessions, while codex/pi reviewers record theirs. Root cause: ClaudeCodeAgent did not implement agent.ModelExtractor, so condense's sessionStateBackfillModel fallback (AsModelExtractor -> ExtractModel) was a no-op for Claude; the model relied solely on the flaky hook path. Implement ExtractModel on ClaudeCodeAgent: parse the transcript and return the last non-empty model from an assistant line (mirrors pi/copilotcli/factoryai). Condense already calls this when state.ModelName == "", so review (and any hook-missed) Claude sessions now record a non-empty model deterministically. Verified against real recorded data: a real agent_review row in entire/checkpoints/v1 with model="" (skills=[/review]) — running the new ExtractModel on that session's stored transcript recovers "claude-opus-4-7", which is exactly the value the condense backfill writes. Surfaces in entire.io's per-reviewer attribution (entirehq/entire.io#2271). Note: the transcript carries the base model id ("claude-opus-4-7"), not the hook's context-window variant ("claude-opus-4-7[1m]"), so backfilled models omit the [1m] suffix — a correct, non-empty id, slightly less precise. Co-Authored-By: Claude Opus 4.8 (1M context) Entire-Checkpoint: 41d9aa9acef6 --- cmd/entire/cli/agent/claudecode/transcript.go | 29 +++++++++++++ .../cli/agent/claudecode/transcript_test.go | 42 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/cmd/entire/cli/agent/claudecode/transcript.go b/cmd/entire/cli/agent/claudecode/transcript.go index d151c814f..9d8c241af 100644 --- a/cmd/entire/cli/agent/claudecode/transcript.go +++ b/cmd/entire/cli/agent/claudecode/transcript.go @@ -88,6 +88,35 @@ func ExtractModifiedFiles(lines []TranscriptLine) []string { return files } +// ExtractModel returns the LLM model recorded in the transcript, satisfying +// agent.ModelExtractor. Claude review subprocesses spawned via `claude -p` do +// not reliably report the model through lifecycle hooks (the ModelUpdate path +// is flaky in headless mode), so condense backfills it from the transcript +// here — the same mechanism codex/pi/etc. rely on. Returns the last non-empty +// model seen on an assistant line, or "" when the transcript carries none. +func (c *ClaudeCodeAgent) ExtractModel(transcriptData []byte) (string, error) { + lines, err := transcript.ParseFromBytes(transcriptData) + if err != nil { + return "", fmt.Errorf("parse transcript: %w", err) + } + model := "" + for _, line := range lines { + if line.Type != envelopeTypeAssistant { + continue + } + var msg struct { + Model string `json:"model"` + } + if err := json.Unmarshal(line.Message, &msg); err != nil { + continue + } + if msg.Model != "" { + model = msg.Model + } + } + return model, nil +} + // TruncateAtUUID returns transcript lines up to and including the line with given UUID func TruncateAtUUID(lines []TranscriptLine, uuid string) []TranscriptLine { if uuid == "" { diff --git a/cmd/entire/cli/agent/claudecode/transcript_test.go b/cmd/entire/cli/agent/claudecode/transcript_test.go index e16929a2e..d96c77cba 100644 --- a/cmd/entire/cli/agent/claudecode/transcript_test.go +++ b/cmd/entire/cli/agent/claudecode/transcript_test.go @@ -869,3 +869,45 @@ func TestExtractAllModifiedFiles_SubagentOnlyChanges(t *testing.T) { t.Errorf("missing expected file %q", f) } } + +func TestExtractModel(t *testing.T) { + t.Parallel() + data := []byte(`{"type":"assistant","uuid":"a1","message":{"model":"claude-opus-4-7[1m]","content":[{"type":"text","text":"hi"}]}}` + "\n") + model, err := (&ClaudeCodeAgent{}).ExtractModel(data) + if err != nil { + t.Fatal(err) + } + if model != "claude-opus-4-7[1m]" { + t.Errorf("model = %q, want claude-opus-4-7[1m]", model) + } +} + +func TestExtractModel_MostRecentWins(t *testing.T) { + t.Parallel() + data := []byte(strings.Join([]string{ + `{"type":"assistant","uuid":"a1","message":{"model":"claude-sonnet-4-6","content":[]}}`, + `{"type":"assistant","uuid":"a2","message":{"model":"claude-opus-4-8","content":[]}}`, + "", + }, "\n")) + model, err := (&ClaudeCodeAgent{}).ExtractModel(data) + if err != nil { + t.Fatal(err) + } + if model != "claude-opus-4-8" { + t.Errorf("model = %q, want claude-opus-4-8 (most recent)", model) + } +} + +func TestExtractModel_EmptyWhenAbsent(t *testing.T) { + t.Parallel() + // A review subprocess transcript with no model field on any assistant line + // yields "" (caller treats that as "no model available"). + data := []byte(`{"type":"assistant","uuid":"a1","message":{"content":[{"type":"text","text":"hi"}]}}` + "\n") + model, err := (&ClaudeCodeAgent{}).ExtractModel(data) + if err != nil { + t.Fatal(err) + } + if model != "" { + t.Errorf("model = %q, want empty", model) + } +}