entireio · suhaanthayyil · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/),
 and this project adheres to [Semantic Versioning](https://semver.org/).
 
+## [0.8.0] - 2026-06-03
+
+### Added
+
+- Added Replay Lab: `entire replay checkpoint` replays a committed checkpoint in an isolated worktree and compares the agent's result to the real commit by file overlap, optional tests, risk signals, semantic similarity (when `entire-sem` is installed), duration, and token usage (when available).
+- Added `entire eval run` and `entire eval report` for private multi-agent benchmarks across explicit or recent Entire checkpoints, including per-agent rankings by pass rate, file recall, precision, risk, duration, and token use.
+- `entire labs` now lists Replay Lab commands alongside other experimental workflows.
+
 ## [0.7.5] - 2026-06-04
 
 ### Security

@@ -242,6 +242,8 @@ go test -tags=integration ./cmd/entire/cli/integration_test -run TestLogin
 | `entire disable` | Remove Entire hooks from repository                                                               |
 | `entire doctor`  | Fix or clean up stuck sessions                                                                    |
 | `entire enable`  | Enable Entire in your repository                                                                  |
+| `entire replay`  | Replay checkpoint tasks in isolated worktrees                                                     |
+| `entire eval`    | Run private agent evals from Entire checkpoints                                                   |
 | `entire checkpoint`        | List, explain, rewind, and search checkpoints                                           |
 | `entire checkpoint explain` | Explain a session, commit, or checkpoint                                               |
 | `entire checkpoint rewind` | Rewind to a previous checkpoint                                                         |
@@ -253,6 +255,55 @@ go test -tags=integration ./cmd/entire/cli/integration_test -run TestLogin
 | `entire doctor trace` | Show hook performance traces                                                                 |
 | `entire version` | Show Entire CLI version                                                                           |
 
+### Replay Lab
+
+Replay Lab turns real Entire checkpoints into private agent benchmarks. It
+checks out the checkpoint's parent commit in an isolated temp worktree, runs the
+original prompt with a launchable agent, then compares the result to the real
+checkpoint commit by changed files, optional tests, risk signals, duration, and
+token usage when available. If `entire-sem` is installed, reports also include
+semantic similarity.
+
+Replay one checkpoint:
+
+```bash
+entire replay checkpoint <checkpoint-id> \
+  --agent codex \
+  --test-cmd "go test ./..." \
+  --timeout 20m
+```
+
+Inspect or automate a run:
+
+```bash
+entire replay checkpoint <checkpoint-id> --agent claude-code --keep-worktree
+entire replay checkpoint <checkpoint-id> --agent gemini --json
+entire replay report <run-id>
+entire replay report <run-id> --json
+```
+
+Compare agents across recent checkpoints:
+
+```bash
+entire eval run \
+  --from-checkpoints \
+  --limit 5 \
+  --agent claude-code,codex \
+  --test-cmd "go test ./..." \
+  --timeout 20m
+
+entire eval report <eval-id>
+entire eval report <eval-id> --json
+```
+
+Supported replay agents are `claude-code`, `codex`, and `gemini`. Replay and
+eval JSON is saved under the repository's git common directory at
+`.git/entire-replay/`, so benchmark output is local to the repo and not tracked
+in the working tree. Eval rankings sort agents by pass rate, file recall,
+precision, risk, duration, and token use. See
+[`docs/architecture/replay-lab.md`](docs/architecture/replay-lab.md) for the
+storage, isolation, and scoring details.
+
 ### `entire enable` Flags
 
 | Flag                                        | Description                                                                                                       |

@@ -25,6 +25,16 @@ var experimentalCommands = []experimentalCommandInfo{
 		Invocation: "entire investigate",
 		Summary:    "Run a multi-agent investigation against a topic, issue, or seed doc",
 	},
+	{
+		Name:       "replay",
+		Invocation: "entire replay",
+		Summary:    "Replay checkpoint tasks in isolated worktrees",
+	},
+	{
+		Name:       "eval",
+		Invocation: "entire eval",
+		Summary:    "Run private agent benchmarks from Entire checkpoints",
+	},
 	{
 		Name:       "org",
 		Invocation: "entire org",
@@ -57,9 +67,7 @@ func newLabsCmd() *cobra.Command {
 				return nil
 			}
 			err := fmt.Errorf("unknown labs topic %q", args[0])
-			fmt.Fprintf(cmd.ErrOrStderr(),
-				"%v\n\nRun `entire labs` to see available experimental commands, or run `entire review --help` for command-specific help.\n",
-				err)
+			fmt.Fprintf(cmd.ErrOrStderr(), "%v\n\n%s\n", err, labsTopicHint(args[0]))
 			return NewSilentError(err)
 		},
 		Run: func(cmd *cobra.Command, _ []string) {
@@ -87,13 +95,24 @@ Available experimental commands:
 Try:
   entire review --help
   entire investigate --help
+  entire replay --help
+  entire eval --help
   entire org --help
   entire project --help
   entire repo --help
   entire grant --help
 `
 }
 
+func labsTopicHint(topic string) string {
+	for _, info := range experimentalCommands {
+		if topic == info.Name {
+			return fmt.Sprintf("Run `entire labs` to see available experimental commands, or run `%s --help` for command-specific help.", info.Invocation)
+		}
+	}
+	return "Run `entire labs` to see available experimental commands and their command-specific help."
+}
+
 func renderExperimentalCommands(commands []experimentalCommandInfo) string {
 	width := 0
 	for _, info := range commands {

@@ -25,7 +25,11 @@ func TestLabsCmd_PrintsExperimentalCommandList(t *testing.T) {
 		"newer Entire workflows",
 		"Available experimental commands",
 		"entire review",
+		"entire replay",
+		"entire eval",
 		"entire review --help",
+		"entire replay --help",
+		"entire eval --help",
 	} {
 		if !strings.Contains(got, want) {
 			t.Fatalf("entire labs output missing %q:\n%s", want, got)
@@ -46,7 +50,7 @@ func TestLabsCmd_HelpShowsExperimentalCommandList(t *testing.T) {
 		t.Fatalf("entire labs --help failed: %v", err)
 	}
 	got := out.String()
-	for _, want := range []string{"Labs", "entire review"} {
+	for _, want := range []string{"Labs", "entire review", "entire replay", "entire eval"} {
 		if !strings.Contains(got, want) {
 			t.Fatalf("entire labs --help output missing %q:\n%s", want, got)
 		}
@@ -77,6 +81,25 @@ func TestLabsCmd_RejectsTopicWithoutRunningIt(t *testing.T) {
 	}
 }
 
+// TestLabsCmd_UnknownTopicPointsBackToLabs runs `entire labs unknown-topic`
+// through the root command and checks three things: the command returns an
+// error, stderr mentions `entire labs`, and stderr does not suggest a
+// fabricated `entire unknown-topic --help` command.
+func TestLabsCmd_UnknownTopicPointsBackToLabs(t *testing.T) {
+	t.Parallel()
+
+	var stdout, stderrBuf bytes.Buffer
+	cmd := NewRootCmd()
+	cmd.SetOut(&stdout)
+	cmd.SetErr(&stderrBuf)
+	cmd.SetArgs([]string{"labs", "unknown-topic"})
+
+	if err := cmd.Execute(); err == nil {
+		t.Fatal("entire labs unknown-topic should return an error")
+	}
+
+	got := stderrBuf.String()
+	mentionsLabs := strings.Contains(got, "entire labs")
+	inventsCommand := strings.Contains(got, "entire unknown-topic --help")
+	if inventsCommand || !mentionsLabs {
+		t.Fatalf("stderr should point unknown topics back to labs without inventing a command, got:\n%s", got)
+	}
+}
+
 func TestRootHelp_ShowsLabsButHidesReview(t *testing.T) {
 	t.Parallel()