Skip to content

Commit e307d71

Browse files
peyton-altclaude
andcommitted
Fix migration retry-safety: reconcile sessions, preserve task metadata
- Narrow transcript deletion in updateCommittedFullTranscript to only remove transcript-specific files, preserving tasks/ metadata - Reconcile missing tail sessions by comparing v1 vs v2 session counts - Restore task metadata unconditionally during repair (idempotent via MergeKeepExisting) - Add regression tests for partial migration retry, task metadata preservation, task metadata restoration, and archive-aware repair Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Entire-Checkpoint: 758494ac2835
1 parent 7dbe3a7 commit e307d71

3 files changed

Lines changed: 233 additions & 4 deletions

File tree

cmd/entire/cli/checkpoint/v2_committed.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,9 +207,17 @@ func (s *V2GitStore) updateCommittedFullTranscript(ctx context.Context, opts Upd
207207
return err
208208
}
209209

210-
// Clear existing transcript entries at this session path before writing new ones
210+
// Clear existing transcript entries at this session path before writing new ones.
211+
// Only delete transcript-specific files (full.jsonl, chunks, content_hash.txt),
212+
// preserving other entries like tasks/ metadata.
211213
for key := range entries {
212-
if strings.HasPrefix(key, sessionPath) {
214+
if !strings.HasPrefix(key, sessionPath) {
215+
continue
216+
}
217+
relName := key[len(sessionPath):]
218+
if relName == paths.ContentHashFileName ||
219+
relName == paths.TranscriptFileName ||
220+
strings.HasPrefix(relName, paths.TranscriptFileName+".") {
213221
delete(entries, key)
214222
}
215223
}

cmd/entire/cli/migrate.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,21 @@ func migrateOneCheckpoint(ctx context.Context, repo *git.Repository, v1Store *ch
234234
func repairPartialV2Checkpoint(ctx context.Context, repo *git.Repository, v1Store *checkpoint.GitStore, v2Store *checkpoint.V2GitStore, info checkpoint.CommittedInfo, v2Summary *checkpoint.CheckpointSummary) (bool, error) {
235235
repaired := false
236236

237-
// Spot-check already present sessions: ensure required /full/current artifacts exist.
237+
// Load v1 summary once for reconciliation and task metadata repair.
238+
v1Summary, err := v1Store.ReadCommitted(ctx, info.CheckpointID)
239+
if err != nil {
240+
return false, fmt.Errorf("failed to read v1 summary: %w", err)
241+
}
242+
if v1Summary == nil {
243+
return false, nil
244+
}
245+
238246
existingSessionCount := len(v2Summary.Sessions)
247+
248+
// Phase 1: Repair existing sessions with missing /full/current artifacts.
249+
// Always check /full/current specifically — archived generations may not be
250+
// pushed to remotes, so transcripts must exist in /full/current to be available
251+
// on other clones.
239252
for sessionIdx := range existingSessionCount {
240253
ok, checkErr := hasCurrentFullSessionArtifacts(repo, v2Store, info.CheckpointID, sessionIdx)
241254
if checkErr != nil {
@@ -267,6 +280,35 @@ func repairPartialV2Checkpoint(ctx context.Context, repo *git.Repository, v1Stor
267280
repaired = true
268281
}
269282

283+
// Phase 2: Append any v1 sessions that were never written to v2.
284+
for sessionIdx := existingSessionCount; sessionIdx < len(v1Summary.Sessions); sessionIdx++ {
285+
content, readErr := v1Store.ReadSessionContent(ctx, info.CheckpointID, sessionIdx)
286+
if readErr != nil {
287+
return false, fmt.Errorf("failed to read v1 session %d for reconciliation: %w", sessionIdx, readErr)
288+
}
289+
290+
opts := buildMigrateWriteOpts(content, info)
291+
if compacted := tryCompactTranscript(ctx, content.Transcript, content.Metadata); compacted != nil {
292+
opts.CompactTranscript = compacted
293+
}
294+
295+
if writeErr := v2Store.WriteCommitted(ctx, opts); writeErr != nil {
296+
return false, fmt.Errorf("failed to write missing v2 session %d: %w", sessionIdx, writeErr)
297+
}
298+
repaired = true
299+
}
300+
301+
// Phase 3: Restore task metadata unconditionally (idempotent via MergeKeepExisting).
302+
// This handles both newly appended task sessions and existing sessions whose
303+
// tasks/ subtree was deleted by a prior bad run. If no v1 sessions have task
304+
// metadata, copyTaskMetadataToV2 returns nil after finding no tasks/ dirs.
305+
if taskErr := copyTaskMetadataToV2(repo, v1Store, v2Store, info.CheckpointID, v1Summary); taskErr != nil {
306+
logging.Warn(ctx, "failed to copy task metadata during repair",
307+
slog.String("checkpoint_id", string(info.CheckpointID)),
308+
slog.String("error", taskErr.Error()),
309+
)
310+
}
311+
270312
return repaired, nil
271313
}
272314

cmd/entire/cli/migrate_test.go

Lines changed: 180 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,8 @@ func TestMigrateCheckpointsV2_RepairsCurrentFullEvenWhenArchiveExists(t *testing
390390
require.NoError(t, archivedReadErr)
391391
assert.NotEmpty(t, archivedRead.Transcript)
392392

393-
// Re-run migration: should still repair /full/current.
393+
// Re-run migration: should still repair /full/current even though archive has the transcript.
394+
// Archived generations may not be pushed to remotes, so /full/current must have it.
394395
var rerun bytes.Buffer
395396
result2, rerunErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &rerun)
396397
require.NoError(t, rerunErr)
@@ -455,6 +456,184 @@ func TestBuildMigrateWriteOpts_PromptSeparatorRoundTrip(t *testing.T) {
455456
assert.Equal(t, "second prompt", opts.Prompts[1])
456457
}
457458

459+
func TestMigrateCheckpointsV2_RepairAppendsMissingSessions(t *testing.T) {
460+
t.Parallel()
461+
repo := initMigrateTestRepo(t)
462+
v1Store, v2Store := newMigrateStores(repo)
463+
464+
cpID := id.MustCheckpointID("aabb11223344")
465+
466+
// Write 3 sessions to v1.
467+
for i := range 3 {
468+
writeV1Checkpoint(t, v1Store, cpID, "session-reconcile-"+strconv.Itoa(i),
469+
[]byte("{\"type\":\"assistant\",\"message\":\"session "+strconv.Itoa(i)+"\"}\n"),
470+
[]string{"prompt " + strconv.Itoa(i)},
471+
)
472+
}
473+
474+
// Simulate partial migration: write only the first session to v2.
475+
content0, err := v1Store.ReadSessionContent(context.Background(), cpID, 0)
476+
require.NoError(t, err)
477+
require.NoError(t, v2Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{
478+
CheckpointID: cpID,
479+
SessionID: content0.Metadata.SessionID,
480+
Strategy: "manual-commit",
481+
Transcript: content0.Transcript,
482+
Prompts: checkpoint.SplitPromptContent(content0.Prompts),
483+
AuthorName: "Test",
484+
AuthorEmail: "test@test.com",
485+
}))
486+
487+
// Verify v2 currently has only 1 session.
488+
before, readErr := v2Store.ReadCommitted(context.Background(), cpID)
489+
require.NoError(t, readErr)
490+
require.Len(t, before.Sessions, 1, "precondition: v2 should have 1 session before repair")
491+
492+
// Rerun migration — should reconcile and append the missing sessions.
493+
var stdout bytes.Buffer
494+
result, migrateErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &stdout)
495+
require.NoError(t, migrateErr)
496+
assert.Equal(t, 1, result.migrated)
497+
assert.Contains(t, stdout.String(), "repaired partial v2 checkpoint state")
498+
499+
after, readAfterErr := v2Store.ReadCommitted(context.Background(), cpID)
500+
require.NoError(t, readAfterErr)
501+
assert.Len(t, after.Sessions, 3, "all 3 v1 sessions should exist in v2 after reconciliation")
502+
}
503+
504+
func TestMigrateCheckpointsV2_RepairPreservesTaskMetadata(t *testing.T) {
505+
t.Parallel()
506+
repo := initMigrateTestRepo(t)
507+
v1Store, v2Store := newMigrateStores(repo)
508+
509+
cpID := id.MustCheckpointID("cc11dd22ee33")
510+
err := v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{
511+
CheckpointID: cpID,
512+
SessionID: "session-task-preserve-001",
513+
Strategy: "manual-commit",
514+
Transcript: []byte("{\"type\":\"assistant\",\"message\":\"task work\"}\n"),
515+
Prompts: []string{"task prompt"},
516+
IsTask: true,
517+
ToolUseID: "toolu_preserve01",
518+
AuthorName: "Test",
519+
AuthorEmail: "test@test.com",
520+
})
521+
require.NoError(t, err)
522+
523+
// Full migration including task metadata.
524+
var initialRun bytes.Buffer
525+
result1, migrateErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &initialRun)
526+
require.NoError(t, migrateErr)
527+
assert.Equal(t, 1, result1.migrated)
528+
529+
// Verify task metadata was written to /full/current.
530+
_, rootTreeHash, refErr := v2Store.GetRefState(plumbing.ReferenceName(paths.V2FullCurrentRefName))
531+
require.NoError(t, refErr)
532+
rootTree, treeErr := repo.TreeObject(rootTreeHash)
533+
require.NoError(t, treeErr)
534+
_, taskFileErr := rootTree.File(cpID.Path() + "/0/tasks/toolu_preserve01/checkpoint.json")
535+
require.NoError(t, taskFileErr, "precondition: task metadata should exist before repair")
536+
537+
// Simulate interrupted migration by removing transcript files only.
538+
removeV2SessionTranscriptFiles(t, repo, v2Store, cpID, 0)
539+
540+
// Rerun migration — triggers repair.
541+
var rerun bytes.Buffer
542+
result2, rerunErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &rerun)
543+
require.NoError(t, rerunErr)
544+
assert.Equal(t, 1, result2.migrated)
545+
assert.Contains(t, rerun.String(), "repaired partial v2 checkpoint state")
546+
547+
// Verify transcript was restored.
548+
content, readErr := v2Store.ReadSessionContent(context.Background(), cpID, 0)
549+
require.NoError(t, readErr)
550+
assert.NotEmpty(t, content.Transcript, "transcript should be restored")
551+
552+
// Verify task metadata was NOT erased during repair.
553+
_, rootTreeHash2, refErr2 := v2Store.GetRefState(plumbing.ReferenceName(paths.V2FullCurrentRefName))
554+
require.NoError(t, refErr2)
555+
rootTree2, treeErr2 := repo.TreeObject(rootTreeHash2)
556+
require.NoError(t, treeErr2)
557+
_, taskFileErr2 := rootTree2.File(cpID.Path() + "/0/tasks/toolu_preserve01/checkpoint.json")
558+
require.NoError(t, taskFileErr2, "task metadata must survive transcript repair")
559+
}
560+
561+
func TestMigrateCheckpointsV2_RepairRestoresDeletedTaskMetadata(t *testing.T) {
562+
t.Parallel()
563+
repo := initMigrateTestRepo(t)
564+
v1Store, v2Store := newMigrateStores(repo)
565+
566+
cpID := id.MustCheckpointID("ee22ff334455")
567+
err := v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{
568+
CheckpointID: cpID,
569+
SessionID: "session-task-restore-001",
570+
Strategy: "manual-commit",
571+
Transcript: []byte("{\"type\":\"assistant\",\"message\":\"task work\"}\n"),
572+
Prompts: []string{"task prompt"},
573+
IsTask: true,
574+
ToolUseID: "toolu_restore01",
575+
AuthorName: "Test",
576+
AuthorEmail: "test@test.com",
577+
})
578+
require.NoError(t, err)
579+
580+
// Full migration.
581+
var initialRun bytes.Buffer
582+
result1, migrateErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &initialRun)
583+
require.NoError(t, migrateErr)
584+
assert.Equal(t, 1, result1.migrated)
585+
586+
// Simulate prior bad run: delete ONLY tasks/ subtree (transcripts are intact).
587+
// This tests that task metadata is restored even when no transcript repair is needed.
588+
removeV2TaskMetadata(t, repo, v2Store, cpID, 0)
589+
590+
// Verify tasks/ is actually gone.
591+
_, rootTreeHash, refErr := v2Store.GetRefState(plumbing.ReferenceName(paths.V2FullCurrentRefName))
592+
require.NoError(t, refErr)
593+
rootTree, treeErr := repo.TreeObject(rootTreeHash)
594+
require.NoError(t, treeErr)
595+
_, taskGoneErr := rootTree.File(cpID.Path() + "/0/tasks/toolu_restore01/checkpoint.json")
596+
require.Error(t, taskGoneErr, "precondition: task metadata should be gone")
597+
598+
// Rerun migration — transcripts are fine, but task metadata should be restored.
599+
var rerun bytes.Buffer
600+
_, rerunErr := migrateCheckpointsV2(context.Background(), repo, v1Store, v2Store, &rerun)
601+
require.NoError(t, rerunErr)
602+
603+
// Verify task metadata was restored.
604+
_, rootTreeHash2, refErr2 := v2Store.GetRefState(plumbing.ReferenceName(paths.V2FullCurrentRefName))
605+
require.NoError(t, refErr2)
606+
rootTree2, treeErr2 := repo.TreeObject(rootTreeHash2)
607+
require.NoError(t, treeErr2)
608+
_, taskFileErr := rootTree2.File(cpID.Path() + "/0/tasks/toolu_restore01/checkpoint.json")
609+
require.NoError(t, taskFileErr, "task metadata must be restored even when transcripts are intact")
610+
}
611+
612+
// removeV2TaskMetadata deletes the tasks/ subtree from a session in /full/current.
613+
func removeV2TaskMetadata(t *testing.T, repo *git.Repository, v2Store *checkpoint.V2GitStore, cpID id.CheckpointID, sessionIdx int) {
614+
t.Helper()
615+
616+
refName := plumbing.ReferenceName(paths.V2FullCurrentRefName)
617+
parentHash, rootTreeHash, err := v2Store.GetRefState(refName)
618+
require.NoError(t, err)
619+
620+
newRootHash, updateErr := checkpoint.UpdateSubtree(
621+
repo,
622+
rootTreeHash,
623+
[]string{string(cpID[:2]), string(cpID[2:]), strconv.Itoa(sessionIdx)},
624+
nil,
625+
checkpoint.UpdateSubtreeOptions{
626+
MergeMode: checkpoint.MergeKeepExisting,
627+
DeleteNames: []string{"tasks"},
628+
},
629+
)
630+
require.NoError(t, updateErr)
631+
632+
commitHash, commitErr := checkpoint.CreateCommit(repo, newRootHash, parentHash, "test: remove task metadata\n", "Test", "test@test.com")
633+
require.NoError(t, commitErr)
634+
require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(refName, commitHash)))
635+
}
636+
458637
func TestSpliceTasksTreeToV2_MergesTaskDirectories(t *testing.T) {
459638
t.Parallel()
460639

0 commit comments

Comments
 (0)