Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/),
and this project adheres to [Semantic Versioning](https://semver.org/).

## [0.8.0] - 2026-06-03

### Added

- Added Replay Lab: `entire replay checkpoint` replays a committed checkpoint in an isolated worktree and compares the agent's result to the real commit by file overlap, optional tests, risk signals, duration, token usage (when available), and semantic similarity (when `entire-sem` is installed).
- Added `entire eval run` and `entire eval report` for private multi-agent benchmarks across explicit or recent Entire checkpoints, including per-agent rankings by pass rate, file recall, precision, risk, duration, and token use.
- `entire labs` now lists Replay Lab commands alongside other experimental workflows.

## [0.7.5] - 2026-06-04

### Security
Expand Down
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ go test -tags=integration ./cmd/entire/cli/integration_test -run TestLogin
| `entire disable` | Remove Entire hooks from repository |
| `entire doctor` | Fix or clean up stuck sessions |
| `entire enable` | Enable Entire in your repository |
| `entire replay` | Replay checkpoint tasks in isolated worktrees |
| `entire eval` | Run private agent benchmarks from Entire checkpoints |
| `entire checkpoint` | List, explain, rewind, and search checkpoints |
| `entire checkpoint explain` | Explain a session, commit, or checkpoint |
| `entire checkpoint rewind` | Rewind to a previous checkpoint |
Expand All @@ -253,6 +255,55 @@ go test -tags=integration ./cmd/entire/cli/integration_test -run TestLogin
| `entire doctor trace` | Show hook performance traces |
| `entire version` | Show Entire CLI version |

### Replay Lab

Replay Lab turns real Entire checkpoints into private agent benchmarks. It
checks out the checkpoint's parent commit in an isolated temp worktree, runs the
original prompt with a launchable agent, then compares the result to the real
checkpoint commit by changed files, optional tests, risk signals, duration, and
token usage when available. If `entire-sem` is installed, reports also include
semantic similarity.

Replay one checkpoint:

```bash
entire replay checkpoint <checkpoint-id> \
--agent codex \
--test-cmd "go test ./..." \
--timeout 20m
```

Inspect or automate a run:

```bash
entire replay checkpoint <checkpoint-id> --agent claude-code --keep-worktree
entire replay checkpoint <checkpoint-id> --agent gemini --json
entire replay report <run-id>
entire replay report <run-id> --json
```

Compare agents across recent checkpoints:

```bash
entire eval run \
--from-checkpoints \
--limit 5 \
--agent claude-code,codex \
--test-cmd "go test ./..." \
--timeout 20m

entire eval report <eval-id>
entire eval report <eval-id> --json
```

Supported replay agents are `claude-code`, `codex`, and `gemini`. Replay and
eval JSON is saved under the repository's git common directory at
`.git/entire-replay/`, so benchmark output is local to the repo and not tracked
in the working tree. Eval rankings sort agents by pass rate, file recall,
precision, risk, duration, and token use. See
[`docs/architecture/replay-lab.md`](docs/architecture/replay-lab.md) for the
storage, isolation, and scoring details.

### `entire enable` Flags

| Flag | Description |
Expand Down
25 changes: 22 additions & 3 deletions cmd/entire/cli/labs.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ var experimentalCommands = []experimentalCommandInfo{
Invocation: "entire investigate",
Summary: "Run a multi-agent investigation against a topic, issue, or seed doc",
},
{
Name: "replay",
Invocation: "entire replay",
Summary: "Replay checkpoint tasks in isolated worktrees",
},
{
Name: "eval",
Invocation: "entire eval",
Summary: "Run private agent benchmarks from Entire checkpoints",
},
{
Name: "org",
Invocation: "entire org",
Expand Down Expand Up @@ -57,9 +67,7 @@ func newLabsCmd() *cobra.Command {
return nil
}
err := fmt.Errorf("unknown labs topic %q", args[0])
fmt.Fprintf(cmd.ErrOrStderr(),
"%v\n\nRun `entire labs` to see available experimental commands, or run `entire review --help` for command-specific help.\n",
err)
fmt.Fprintf(cmd.ErrOrStderr(), "%v\n\n%s\n", err, labsTopicHint(args[0]))
return NewSilentError(err)
},
Run: func(cmd *cobra.Command, _ []string) {
Expand Down Expand Up @@ -87,13 +95,24 @@ Available experimental commands:
Try:
entire review --help
entire investigate --help
entire replay --help
entire eval --help
entire org --help
entire project --help
entire repo --help
entire grant --help
`
}

// labsTopicHint returns the follow-up guidance for a topic passed to
// `entire labs`.
//
// When topic exactly matches the Name of an entry in experimentalCommands, the
// hint points at that entry's own `--help` invocation. Any other topic gets a
// generic pointer back to `entire labs`, so the message never suggests a
// command that does not exist.
func labsTopicHint(topic string) string {
	const fallback = "Run `entire labs` to see available experimental commands and their command-specific help."

	for i := range experimentalCommands {
		entry := experimentalCommands[i]
		if entry.Name != topic {
			continue
		}
		// First match wins; the comparison is exact and case-sensitive.
		return fmt.Sprintf("Run `entire labs` to see available experimental commands, or run `%s --help` for command-specific help.", entry.Invocation)
	}
	return fallback
}

func renderExperimentalCommands(commands []experimentalCommandInfo) string {
width := 0
for _, info := range commands {
Expand Down
25 changes: 24 additions & 1 deletion cmd/entire/cli/labs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ func TestLabsCmd_PrintsExperimentalCommandList(t *testing.T) {
"newer Entire workflows",
"Available experimental commands",
"entire review",
"entire replay",
"entire eval",
"entire review --help",
"entire replay --help",
"entire eval --help",
} {
if !strings.Contains(got, want) {
t.Fatalf("entire labs output missing %q:\n%s", want, got)
Expand All @@ -46,7 +50,7 @@ func TestLabsCmd_HelpShowsExperimentalCommandList(t *testing.T) {
t.Fatalf("entire labs --help failed: %v", err)
}
got := out.String()
for _, want := range []string{"Labs", "entire review"} {
for _, want := range []string{"Labs", "entire review", "entire replay", "entire eval"} {
if !strings.Contains(got, want) {
t.Fatalf("entire labs --help output missing %q:\n%s", want, got)
}
Expand Down Expand Up @@ -77,6 +81,25 @@ func TestLabsCmd_RejectsTopicWithoutRunningIt(t *testing.T) {
}
}

// TestLabsCmd_UnknownTopicPointsBackToLabs runs `entire labs unknown-topic`
// and checks that the command fails, that stderr mentions `entire labs`, and
// that stderr does not suggest a made-up `entire unknown-topic --help` command.
func TestLabsCmd_UnknownTopicPointsBackToLabs(t *testing.T) {
	t.Parallel()

	var stdout, stderrBuf bytes.Buffer
	cmd := NewRootCmd()
	cmd.SetArgs([]string{"labs", "unknown-topic"})
	cmd.SetOut(&stdout)
	cmd.SetErr(&stderrBuf)

	if err := cmd.Execute(); err == nil {
		t.Fatal("entire labs unknown-topic should return an error")
	}

	stderr := stderrBuf.String()
	mentionsLabs := strings.Contains(stderr, "entire labs")
	inventsCommand := strings.Contains(stderr, "entire unknown-topic --help")
	if !mentionsLabs || inventsCommand {
		t.Fatalf("stderr should point unknown topics back to labs without inventing a command, got:\n%s", stderr)
	}
}

func TestRootHelp_ShowsLabsButHidesReview(t *testing.T) {
t.Parallel()

Expand Down
Loading