From fbc8394951d7d865f60fa1e986e47ad5aeb0e216 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Fri, 22 May 2026 14:41:52 -0700 Subject: [PATCH 01/35] Add a migration script Entire-Checkpoint: c9bc625237a6 --- scripts/migrate-v2-checkpoints-to-v1.sh | 484 ++++++++++++++++++++++++ 1 file changed, 484 insertions(+) create mode 100755 scripts/migrate-v2-checkpoints-to-v1.sh diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh new file mode 100755 index 000000000..77d7539e1 --- /dev/null +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -0,0 +1,484 @@ +#!/usr/bin/env bash +set -euo pipefail + +# +# migrate-v2-checkpoints-to-v1.sh - Inspect legacy v2 checkpoints for v1 migration. +# +# USAGE: +# ./scripts/migrate-v2-checkpoints-to-v1.sh [OPTIONS] [SINCE_COMMIT] +# +# OPTIONS: +# -h, --help Show this help message +# --list Print checkpoint IDs and associated commit IDs only +# --dry-run Print every v2 folder/file that would be migrated +# --apply Build migration commit support path (currently blocked before writing) +# --repo Local repository path to inspect +# --since Commit before the checkpoints to inspect +# --head Limit scan to one history tip (default: all branches/remotes) +# +# DESCRIPTION: +# Read-only first pass for converting legacy checkpoints v2 data back to the +# v1 checkpoint format. The script finds commits newer than SINCE_COMMIT on +# local branches/remotes (or on --head, when supplied), extracts +# Entire-Checkpoint trailers, and prints the v2 /full folders/files that +# contain raw transcripts: +# +# refs/entire/checkpoints/v2/full/*://raw_transcript* +# +# When refs/entire/checkpoints/v2/main is available, companion checkpoint and +# session metadata paths are printed after the full transcript folders. +# +# If --repo or SINCE_COMMIT is omitted, the script prompts for it. 
+# + +V1_REF="refs/heads/entire/checkpoints/v1" +V2_MAIN_REF="refs/entire/checkpoints/v2/main" +V2_FULL_REF_PREFIX="refs/entire/checkpoints/v2/full" +TRAILER_KEY="Entire-Checkpoint" + +since_commit="" +head_commitish="" +repo_path="" +dry_run=false +apply=false +list_mode=false +plan_entries_file="" + +show_help() { + sed -n '3,/^$/p' "$0" | sed -E 's/^# ?//' +} + +die() { + printf 'error: %s\n' "$*" >&2 + exit 1 +} + +warn() { + printf 'warning: %s\n' "$*" >&2 +} + +cleanup() { + if [[ -n "$plan_entries_file" && -f "$plan_entries_file" ]]; then + rm -f "$plan_entries_file" + fi +} +trap cleanup EXIT + +checkpoint_to_path() { + local checkpoint_id="$1" + printf '%s/%s' "${checkpoint_id:0:2}" "${checkpoint_id:2}" +} + +tree_path_exists() { + local ref_name="$1" + local path="$2" + git cat-file -e "${ref_name}:${path}" 2>/dev/null +} + +list_numeric_dirs() { + local ref_name="$1" + local path="$2" + local entries + entries=$(git ls-tree -d --name-only "${ref_name}:${path}" 2>/dev/null || true) + printf '%s\n' "$entries" | sed -nE '/^[0-9]+$/p' +} + +list_full_refs() { + git for-each-ref --format='%(refname)' "$V2_FULL_REF_PREFIX" | sort +} + +list_checkpoint_ids_between() { + local since="$1" + local head="$2" + git log --format=%B "${since}..${head}" | + sed -nE "s/^[[:space:]]*${TRAILER_KEY}:[[:space:]]*([0-9a-f]{12})[[:space:]]*$/\\1/p" | + awk '!seen[$0]++' +} + +list_checkpoint_ids_from_all_refs() { + local since="$1" + git log HEAD --branches --remotes --format=%B --not "$since" | + sed -nE "s/^[[:space:]]*${TRAILER_KEY}:[[:space:]]*([0-9a-f]{12})[[:space:]]*$/\\1/p" | + awk '!seen[$0]++' +} + +list_commit_ids_for_checkpoint() { + local checkpoint_id="$1" + local since="$2" + local head="$3" + local commits + + if [[ -n "$head" ]]; then + commits=$(git log --format=%H --extended-regexp \ + --grep="^${TRAILER_KEY}:[[:space:]]*${checkpoint_id}[[:space:]]*$" \ + "${since}..${head}") + else + commits=$(git log HEAD --branches --remotes --format=%H --extended-regexp 
\ + --grep="^${TRAILER_KEY}:[[:space:]]*${checkpoint_id}[[:space:]]*$" \ + --not "$since") + fi + + printf '%s\n' "$commits" | awk 'NF && !seen[$0]++' +} + +list_full_artifacts() { + local ref_name="$1" + local session_path="$2" + local entries + entries=$(git ls-tree --name-only "${ref_name}:${session_path}" 2>/dev/null || true) + printf '%s\n' "$entries" | + sed -nE '/^raw_transcript(\.[0-9]+)?$/p; /^raw_transcript_hash\.txt$/p' +} + +v1_raw_artifact_name() { + local artifact="$1" + case "$artifact" in + raw_transcript) + printf 'full.jsonl' + ;; + raw_transcript.[0-9][0-9][0-9]) + printf 'full.jsonl%s' "${artifact#raw_transcript}" + ;; + raw_transcript_hash.txt) + printf 'content_hash.txt' + ;; + *) + return 1 + ;; + esac +} + +append_tree_entry_from_source() { + local source_ref="$1" + local source_path="$2" + local target_path="$3" + local entry meta mode type hash + + entry=$(git ls-tree "$source_ref" -- "$source_path" || true) + [[ -n "$entry" ]] || return 1 + + meta=${entry%%$'\t'*} + read -r mode type hash <<< "$meta" + [[ "$type" == "blob" && -n "$hash" ]] || return 1 + + printf '%s %s %s\t%s\n' "$mode" "$type" "$hash" "$target_path" >> "$plan_entries_file" +} + +write_unique_mktree_input() { + local entries_file="$1" + awk -F '\t' 'NF >= 2 { line[$2] = $0 } END { for (path in line) print line[path] }' "$entries_file" | + sort -k2 +} + +build_v1_tree_from_plan() { + local entries_file="$1" + local combined + combined=$(mktemp "${TMPDIR:-/tmp}/v2-to-v1-tree.XXXXXX") + if git show-ref --verify --quiet "$V1_REF"; then + git ls-tree -r "$V1_REF" > "$combined" + else + : > "$combined" + fi + cat "$entries_file" >> "$combined" + write_unique_mktree_input "$combined" | git mktree + rm -f "$combined" +} + +create_v1_migration_commit() { + local entries_file="$1" + local tree_hash parent_hash commit_hash + + tree_hash=$(build_v1_tree_from_plan "$entries_file") + if git show-ref --verify --quiet "$V1_REF"; then + parent_hash=$(git rev-parse "$V1_REF^{commit}") + 
commit_hash=$(printf 'Migrate checkpoints v2 to v1\n\nSource refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + git commit-tree "$tree_hash" -p "$parent_hash") + else + commit_hash=$(printf 'Migrate checkpoints v2 to v1\n\nSource refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + git commit-tree "$tree_hash") + fi + + printf '%s\n' "$commit_hash" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + show_help + exit 0 + ;; + --list) + list_mode=true + shift + ;; + --dry-run) + dry_run=true + shift + ;; + --apply) + apply=true + shift + ;; + --since) + [[ $# -ge 2 ]] || die "--since requires a commit" + since_commit="$2" + shift 2 + ;; + --repo) + [[ $# -ge 2 ]] || die "--repo requires a path" + repo_path="$2" + shift 2 + ;; + --head) + [[ $# -ge 2 ]] || die "--head requires a commit" + head_commitish="$2" + shift 2 + ;; + -*) + die "unknown option: $1" + ;; + *) + [[ -z "$since_commit" ]] || die "too many commit arguments" + since_commit="$1" + shift + ;; + esac +done + +mode_count=0 +[[ "$list_mode" == "true" ]] && mode_count=$((mode_count + 1)) +[[ "$dry_run" == "true" ]] && mode_count=$((mode_count + 1)) +[[ "$apply" == "true" ]] && mode_count=$((mode_count + 1)) +if (( mode_count > 1 )); then + die "--list, --dry-run, and --apply are mutually exclusive" +fi + +plan_entries_file=$(mktemp "${TMPDIR:-/tmp}/v2-to-v1-plan.XXXXXX") + +if [[ -z "$repo_path" ]]; then + printf 'Local repo path: ' >&2 + IFS= read -r repo_path +fi + +[[ -n "$repo_path" ]] || die "a local repo path is required" +[[ -d "$repo_path" ]] || die "repo path does not exist or is not a directory: $repo_path" + +if ! repo_root=$(git -C "$repo_path" rev-parse --show-toplevel 2>/dev/null); then + die "not inside a git repository: $repo_path" +fi +cd "$repo_root" + +if [[ -z "$since_commit" ]]; then + printf 'Show v2 checkpoints newer than commit: ' >&2 + IFS= read -r since_commit +fi + +[[ -n "$since_commit" ]] || die "a base commit is required" + +if ! 
since_hash=$(git rev-parse --verify --quiet "${since_commit}^{commit}"); then + die "commit not found: $since_commit" +fi + +head_hash="" +if [[ -n "$head_commitish" ]]; then + if ! head_hash=$(git rev-parse --verify --quiet "${head_commitish}^{commit}"); then + die "history tip not found: $head_commitish" + fi + git merge-base --is-ancestor "$since_hash" "$head_hash" 2>/dev/null || + die "$since_commit is not an ancestor of $head_commitish" +fi + +main_ref_available=false +if git show-ref --verify --quiet "$V2_MAIN_REF"; then + main_ref_available=true +else + warn "missing $V2_MAIN_REF; companion metadata paths will not be shown" +fi + +full_refs=$(list_full_refs) +[[ -n "$full_refs" ]] || die "missing refs under $V2_FULL_REF_PREFIX; cannot locate raw transcripts" + +if [[ -n "$head_hash" ]]; then + checkpoint_ids=$(list_checkpoint_ids_between "$since_hash" "$head_hash") +else + checkpoint_ids=$(list_checkpoint_ids_from_all_refs "$since_hash") +fi +if [[ -z "$checkpoint_ids" ]]; then + if [[ -n "$head_hash" ]]; then + printf 'No %s trailers found in %s..%s\n' "$TRAILER_KEY" "$since_hash" "$head_hash" + else + printf 'No %s trailers found on local branches/remotes after %s\n' "$TRAILER_KEY" "$since_hash" + fi + exit 0 +fi + +if [[ "$list_mode" == "true" ]]; then + checkpoint_count=$(printf '%s\n' "$checkpoint_ids" | sed '/^$/d' | wc -l | tr -d '[:space:]') + printf 'Checkpoints: %s\n' "$checkpoint_count" + printf 'checkpoint_id\tcommit_ids\n' + while IFS= read -r checkpoint_id; do + [[ -n "$checkpoint_id" ]] || continue + commits=$(list_commit_ids_for_checkpoint "$checkpoint_id" "$since_hash" "$head_hash" | tr '\n' ' ' | sed 's/[[:space:]]*$//') + printf '%s\t' "$checkpoint_id" + if [[ -n "$commits" ]]; then + printf '%s' "$commits" + fi + printf '\n' + done <<< "$checkpoint_ids" + exit 0 +fi + +printf 'Repository: %s\n' "$repo_root" +if [[ -n "$head_hash" ]]; then + printf 'Scanning commits: %s..%s\n' "$since_hash" "$head_hash" +else + printf 'Scanning commits: 
local branches/remotes after %s\n' "$since_hash" +fi +if [[ "$main_ref_available" == "true" ]]; then + printf 'Companion metadata ref: %s\n' "$V2_MAIN_REF" +fi +printf 'Full refs:\n' +printf '%s\n' "$full_refs" | sed 's/^/ /' +printf '\n' + +planned_checkpoints=0 +planned_sessions=0 +planned_raw_transcripts=0 +missing_raw_checkpoints=0 +missing_metadata_checkpoints=0 +missing_metadata_sessions=0 + +while IFS= read -r checkpoint_id; do + [[ -n "$checkpoint_id" ]] || continue + + checkpoint_path=$(checkpoint_to_path "$checkpoint_id") + + found_full_artifact=false + found_sessions="" + checkpoint_output="" + while IFS= read -r full_ref; do + [[ -n "$full_ref" ]] || continue + + sessions=$(list_numeric_dirs "$full_ref" "$checkpoint_path") + printed_full_checkpoint=false + while IFS= read -r session_index; do + [[ -n "$session_index" ]] || continue + session_path="$checkpoint_path/$session_index" + artifacts=$(list_full_artifacts "$full_ref" "$session_path") + if [[ -z "$artifacts" ]]; then + continue + fi + + found_full_artifact=true + found_sessions="${found_sessions}${session_index}"$'\n' + if [[ "$printed_full_checkpoint" != "true" ]]; then + printed_full_checkpoint=true + checkpoint_output="${checkpoint_output} full checkpoint folder: ${full_ref}:${checkpoint_path}"$'\n' + fi + checkpoint_output="${checkpoint_output} full session folder: ${full_ref}:${session_path}"$'\n' + while IFS= read -r artifact; do + [[ -n "$artifact" ]] || continue + checkpoint_output="${checkpoint_output} raw artifact: ${full_ref}:${session_path}/${artifact}"$'\n' + done <<< "$artifacts" + done <<< "$sessions" + done <<< "$full_refs" + + if [[ "$found_full_artifact" != "true" ]]; then + warn "no raw_transcript artifacts found for checkpoint $checkpoint_id" + missing_raw_checkpoints=$((missing_raw_checkpoints + 1)) + continue + fi + + planned_checkpoints=$((planned_checkpoints + 1)) + + if [[ "$main_ref_available" == "true" ]] && tree_path_exists "$V2_MAIN_REF" "$checkpoint_path"; then + 
checkpoint_output="${checkpoint_output} companion metadata folder: ${V2_MAIN_REF}:${checkpoint_path}"$'\n' + if tree_path_exists "$V2_MAIN_REF" "$checkpoint_path/metadata.json"; then + checkpoint_output="${checkpoint_output} checkpoint metadata: ${V2_MAIN_REF}:${checkpoint_path}/metadata.json"$'\n' + append_tree_entry_from_source "$V2_MAIN_REF" "$checkpoint_path/metadata.json" "$checkpoint_path/metadata.json" || + warn "failed to plan checkpoint metadata for $checkpoint_id" + fi + + metadata_sessions=$(printf '%s' "$found_sessions" | sed '/^$/d' | sort -n | uniq) + while IFS= read -r session_index; do + [[ -n "$session_index" ]] || continue + session_path="$checkpoint_path/$session_index" + planned_sessions=$((planned_sessions + 1)) + if tree_path_exists "$V2_MAIN_REF" "$session_path/metadata.json"; then + checkpoint_output="${checkpoint_output} session metadata: ${V2_MAIN_REF}:${session_path}/metadata.json"$'\n' + append_tree_entry_from_source "$V2_MAIN_REF" "$session_path/metadata.json" "$session_path/metadata.json" || + warn "failed to plan session metadata for checkpoint $checkpoint_id session $session_index" + else + missing_metadata_sessions=$((missing_metadata_sessions + 1)) + warn "missing session metadata for checkpoint $checkpoint_id session $session_index on $V2_MAIN_REF" + fi + if tree_path_exists "$V2_MAIN_REF" "$session_path/prompt.txt"; then + checkpoint_output="${checkpoint_output} prompt: ${V2_MAIN_REF}:${session_path}/prompt.txt"$'\n' + append_tree_entry_from_source "$V2_MAIN_REF" "$session_path/prompt.txt" "$session_path/prompt.txt" || + warn "failed to plan prompt for checkpoint $checkpoint_id session $session_index" + fi + done <<< "$metadata_sessions" + elif [[ "$main_ref_available" == "true" ]]; then + missing_metadata_checkpoints=$((missing_metadata_checkpoints + 1)) + warn "checkpoint $checkpoint_id has raw transcript artifacts but no companion metadata on $V2_MAIN_REF" + fi + + while IFS=$'\t' read -r full_ref session_index artifact; do + 
[[ -n "${full_ref:-}" && -n "${session_index:-}" && -n "${artifact:-}" ]] || continue + session_path="$checkpoint_path/$session_index" + v1_artifact=$(v1_raw_artifact_name "$artifact") || continue + append_tree_entry_from_source "$full_ref" "$session_path/$artifact" "$session_path/$v1_artifact" || + warn "failed to plan raw artifact for checkpoint $checkpoint_id session $session_index: $artifact" + if [[ "$artifact" == "raw_transcript" ]]; then + planned_raw_transcripts=$((planned_raw_transcripts + 1)) + fi + done < <( + while IFS= read -r full_ref; do + [[ -n "$full_ref" ]] || continue + sessions=$(list_numeric_dirs "$full_ref" "$checkpoint_path") + while IFS= read -r session_index; do + [[ -n "$session_index" ]] || continue + session_path="$checkpoint_path/$session_index" + artifacts=$(list_full_artifacts "$full_ref" "$session_path") + while IFS= read -r artifact; do + [[ -n "$artifact" ]] || continue + printf '%s\t%s\t%s\n' "$full_ref" "$session_index" "$artifact" + done <<< "$artifacts" + done <<< "$sessions" + done <<< "$full_refs" + ) + + if [[ "$dry_run" == "true" ]]; then + printf 'checkpoint %s\n' "$checkpoint_id" + printf '%s' "$checkpoint_output" + printf '\n' + fi +done <<< "$checkpoint_ids" + +planned_entries=$(wc -l < "$plan_entries_file" | tr -d '[:space:]') +unique_planned_entries=$(write_unique_mktree_input "$plan_entries_file" | wc -l | tr -d '[:space:]') + +if [[ "$dry_run" != "true" ]]; then + printf 'Migration plan:\n' + printf ' target ref: %s\n' "$V1_REF" + printf ' checkpoints with raw transcripts: %s\n' "$planned_checkpoints" + printf ' sessions with raw transcripts: %s\n' "$planned_sessions" + printf ' raw transcript base files: %s\n' "$planned_raw_transcripts" + printf ' planned v1 tree entries: %s (%s unique target paths)\n' "$planned_entries" "$unique_planned_entries" + printf ' missing raw-transcript checkpoints: %s\n' "$missing_raw_checkpoints" + printf ' missing companion checkpoint metadata: %s\n' "$missing_metadata_checkpoints" + 
printf ' missing companion session metadata: %s\n' "$missing_metadata_sessions" + printf '\n' +fi + +if [[ "$apply" == "true" ]]; then + # This is the final write path, intentionally blocked until the migration + # behavior is reviewed with real repo output. It reuses v2 blob objects, + # builds one complete v1 tree, and would then update V1_REF to the commit. + die "--apply is scaffolded but intentionally blocked before creating a migration commit or updating $V1_REF" +fi + +if [[ "$dry_run" != "true" ]]; then + printf 'No refs were written. Use --dry-run to print every source and target artifact.\n' + printf 'The single-commit apply path is scaffolded but blocked before updating %s.\n' "$V1_REF" +fi From 1ea7f14cd7da4cffbd6badddf3968384681cf9d3 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Fri, 22 May 2026 15:43:02 -0700 Subject: [PATCH 02/35] Add v2 checkpoint migration apply mode Entire-Checkpoint: e25de9aab773 --- scripts/migrate-v2-checkpoints-to-v1.sh | 733 +++++++++++++++++------- 1 file changed, 525 insertions(+), 208 deletions(-) diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh index 77d7539e1..20c44506e 100755 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -2,7 +2,7 @@ set -euo pipefail # -# migrate-v2-checkpoints-to-v1.sh - Inspect legacy v2 checkpoints for v1 migration. +# migrate-v2-checkpoints-to-v1.sh - Migrate legacy v2 checkpoints to v1. 
# # USAGE: # ./scripts/migrate-v2-checkpoints-to-v1.sh [OPTIONS] [SINCE_COMMIT] @@ -11,22 +11,22 @@ set -euo pipefail # -h, --help Show this help message # --list Print checkpoint IDs and associated commit IDs only # --dry-run Print every v2 folder/file that would be migrated -# --apply Build migration commit support path (currently blocked before writing) +# --apply Write one local refs/heads/entire/checkpoints/v1 migration commit # --repo Local repository path to inspect # --since Commit before the checkpoints to inspect # --head Limit scan to one history tip (default: all branches/remotes) # # DESCRIPTION: -# Read-only first pass for converting legacy checkpoints v2 data back to the -# v1 checkpoint format. The script finds commits newer than SINCE_COMMIT on -# local branches/remotes (or on --head, when supplied), extracts -# Entire-Checkpoint trailers, and prints the v2 /full folders/files that -# contain raw transcripts: +# Standalone helper for converting legacy checkpoints v2 data back to the v1 +# checkpoint format. The script finds commits newer than SINCE_COMMIT on local +# branches/remotes (or on --head, when supplied), extracts Entire-Checkpoint +# trailers, and locates the v2 /full folders/files that contain raw transcripts: # # refs/entire/checkpoints/v2/full/*://raw_transcript* # -# When refs/entire/checkpoints/v2/main is available, companion checkpoint and -# session metadata paths are printed after the full transcript folders. +# The default mode prints a migration plan without writing refs. --dry-run +# prints every source folder/file. --apply writes one local migration commit to +# refs/heads/entire/checkpoints/v1. # # If --repo or SINCE_COMMIT is omitted, the script prompts for it. 
# @@ -42,7 +42,15 @@ repo_path="" dry_run=false apply=false list_mode=false +tmp_dir="" plan_entries_file="" +checkpoint_commits_file="" +checkpoint_ids_file="" +checkpoint_paths_file="" +full_artifacts_file="" +raw_sessions_file="" +raw_checkpoint_ids_file="" +main_metadata_file="" show_help() { sed -n '3,/^$/p' "$0" | sed -E 's/^# ?//' @@ -58,8 +66,8 @@ warn() { } cleanup() { - if [[ -n "$plan_entries_file" && -f "$plan_entries_file" ]]; then - rm -f "$plan_entries_file" + if [[ -n "$tmp_dir" && -d "$tmp_dir" ]]; then + rm -rf "$tmp_dir" fi } trap cleanup EXIT @@ -84,50 +92,69 @@ list_numeric_dirs() { } list_full_refs() { - git for-each-ref --format='%(refname)' "$V2_FULL_REF_PREFIX" | sort + git for-each-ref --format='%(refname)' "$V2_FULL_REF_PREFIX" | + sort | + awk -v current="${V2_FULL_REF_PREFIX}/current" ' + $0 == current { current_ref = $0; next } + { refs[++n] = $0 } + END { + if (current_ref != "") { + print current_ref + } + for (i = n; i >= 1; i--) { + print refs[i] + } + } + ' } -list_checkpoint_ids_between() { +write_checkpoint_commit_index_between() { local since="$1" local head="$2" - git log --format=%B "${since}..${head}" | - sed -nE "s/^[[:space:]]*${TRAILER_KEY}:[[:space:]]*([0-9a-f]{12})[[:space:]]*$/\\1/p" | - awk '!seen[$0]++' + local output_file="$3" + + git log --format='__ENTIRE_COMMIT__%H%n%B' "${since}..${head}" | + awk -v key="$TRAILER_KEY" ' + /^__ENTIRE_COMMIT__/ { + commit = substr($0, length("__ENTIRE_COMMIT__") + 1) + next + } + { + line = $0 + pattern = "^[[:space:]]*" key ":[[:space:]]*([0-9a-f]{12})[[:space:]]*$" + if (line ~ pattern) { + sub("^[[:space:]]*" key ":[[:space:]]*", "", line) + sub("[[:space:]]*$", "", line) + if (commit != "" && !seen[line SUBSEP commit]++) { + print line "\t" commit + } + } + } + ' > "$output_file" } -list_checkpoint_ids_from_all_refs() { +write_checkpoint_commit_index_from_all_refs() { local since="$1" - git log HEAD --branches --remotes --format=%B --not "$since" | - sed -nE 
"s/^[[:space:]]*${TRAILER_KEY}:[[:space:]]*([0-9a-f]{12})[[:space:]]*$/\\1/p" | - awk '!seen[$0]++' -} - -list_commit_ids_for_checkpoint() { - local checkpoint_id="$1" - local since="$2" - local head="$3" - local commits - - if [[ -n "$head" ]]; then - commits=$(git log --format=%H --extended-regexp \ - --grep="^${TRAILER_KEY}:[[:space:]]*${checkpoint_id}[[:space:]]*$" \ - "${since}..${head}") - else - commits=$(git log HEAD --branches --remotes --format=%H --extended-regexp \ - --grep="^${TRAILER_KEY}:[[:space:]]*${checkpoint_id}[[:space:]]*$" \ - --not "$since") - fi - - printf '%s\n' "$commits" | awk 'NF && !seen[$0]++' -} - -list_full_artifacts() { - local ref_name="$1" - local session_path="$2" - local entries - entries=$(git ls-tree --name-only "${ref_name}:${session_path}" 2>/dev/null || true) - printf '%s\n' "$entries" | - sed -nE '/^raw_transcript(\.[0-9]+)?$/p; /^raw_transcript_hash\.txt$/p' + local output_file="$2" + + git log HEAD --branches --remotes --format='__ENTIRE_COMMIT__%H%n%B' --not "$since" | + awk -v key="$TRAILER_KEY" ' + /^__ENTIRE_COMMIT__/ { + commit = substr($0, length("__ENTIRE_COMMIT__") + 1) + next + } + { + line = $0 + pattern = "^[[:space:]]*" key ":[[:space:]]*([0-9a-f]{12})[[:space:]]*$" + if (line ~ pattern) { + sub("^[[:space:]]*" key ":[[:space:]]*", "", line) + sub("[[:space:]]*$", "", line) + if (commit != "" && !seen[line SUBSEP commit]++) { + print line "\t" commit + } + } + } + ' > "$output_file" } v1_raw_artifact_name() { @@ -148,47 +175,35 @@ v1_raw_artifact_name() { esac } -append_tree_entry_from_source() { - local source_ref="$1" - local source_path="$2" - local target_path="$3" - local entry meta mode type hash - - entry=$(git ls-tree "$source_ref" -- "$source_path" || true) - [[ -n "$entry" ]] || return 1 - - meta=${entry%%$'\t'*} - read -r mode type hash <<< "$meta" - [[ "$type" == "blob" && -n "$hash" ]] || return 1 - - printf '%s %s %s\t%s\n' "$mode" "$type" "$hash" "$target_path" >> "$plan_entries_file" -} - 
write_unique_mktree_input() { local entries_file="$1" - awk -F '\t' 'NF >= 2 { line[$2] = $0 } END { for (path in line) print line[path] }' "$entries_file" | + awk -F '\t' 'NF >= 2 && !seen[$2]++ { print }' "$entries_file" | sort -k2 } build_v1_tree_from_plan() { local entries_file="$1" - local combined - combined=$(mktemp "${TMPDIR:-/tmp}/v2-to-v1-tree.XXXXXX") + local combined index_file + combined="$tmp_dir/combined_index_info" + index_file="$tmp_dir/migration.index" + + rm -f "$index_file" if git show-ref --verify --quiet "$V1_REF"; then - git ls-tree -r "$V1_REF" > "$combined" + cat "$entries_file" > "$combined" + git ls-tree -r "$V1_REF" >> "$combined" else - : > "$combined" + cat "$entries_file" > "$combined" fi - cat "$entries_file" >> "$combined" - write_unique_mktree_input "$combined" | git mktree - rm -f "$combined" + + write_unique_mktree_input "$combined" | + GIT_INDEX_FILE="$index_file" git update-index --index-info + GIT_INDEX_FILE="$index_file" git write-tree } create_v1_migration_commit() { - local entries_file="$1" - local tree_hash parent_hash commit_hash + local tree_hash="$1" + local parent_hash commit_hash - tree_hash=$(build_v1_tree_from_plan "$entries_file") if git show-ref --verify --quiet "$V1_REF"; then parent_hash=$(git rev-parse "$V1_REF^{commit}") commit_hash=$(printf 'Migrate checkpoints v2 to v1\n\nSource refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | @@ -201,6 +216,310 @@ create_v1_migration_commit() { printf '%s\n' "$commit_hash" } +write_checkpoint_id_files() { + awk -F '\t' 'NF >= 2 && !seen[$1]++ { print $1 }' "$checkpoint_commits_file" > "$checkpoint_ids_file" + awk 'NF { print $0 "\t" substr($0, 1, 2) "/" substr($0, 3) }' "$checkpoint_ids_file" > "$checkpoint_paths_file" +} + +write_full_artifact_index() { + local full_ref + : > "$full_artifacts_file" + : > "$raw_sessions_file" + : > "$raw_checkpoint_ids_file" + + while IFS= read -r full_ref; do + [[ -n "$full_ref" ]] || continue + git ls-tree -r "$full_ref" | + 
awk -F '\t' \ + -v ref="$full_ref" \ + -v checkpoint_paths_file="$checkpoint_paths_file" \ + -v full_artifacts_file="$full_artifacts_file" \ + -v raw_sessions_file="$raw_sessions_file" \ + -v raw_checkpoint_ids_file="$raw_checkpoint_ids_file" \ + -v plan_entries_file="$plan_entries_file" ' + BEGIN { + while ((getline line < checkpoint_paths_file) > 0) { + split(line, fields, "\t") + checkpoint_by_path[fields[2]] = fields[1] + } + } + NF >= 2 { + meta = $1 + path = $2 + n = split(path, parts, "/") + if (n != 4) { + next + } + checkpoint_path = parts[1] "/" parts[2] + if (!(checkpoint_path in checkpoint_by_path)) { + next + } + session_index = parts[3] + artifact = parts[4] + if (artifact == "raw_transcript") { + target = checkpoint_path "/" session_index "/full.jsonl" + } else if (artifact ~ /^raw_transcript\.[0-9][0-9][0-9]$/) { + suffix = artifact + sub(/^raw_transcript/, "", suffix) + target = checkpoint_path "/" session_index "/full.jsonl" suffix + } else if (artifact == "raw_transcript_hash.txt") { + target = checkpoint_path "/" session_index "/content_hash.txt" + } else { + next + } + checkpoint_id = checkpoint_by_path[checkpoint_path] + print checkpoint_id "\t" checkpoint_path "\t" session_index "\t" ref "\t" artifact "\t" path "\t" target "\t" meta >> full_artifacts_file + print checkpoint_id "\t" checkpoint_path "\t" session_index >> raw_sessions_file + print checkpoint_id >> raw_checkpoint_ids_file + print meta "\t" target >> plan_entries_file + } + ' + done <<< "$full_refs" + + sort -u "$raw_sessions_file" -o "$raw_sessions_file" + awk 'NF && !seen[$0]++' "$raw_checkpoint_ids_file" > "${raw_checkpoint_ids_file}.tmp" + mv "${raw_checkpoint_ids_file}.tmp" "$raw_checkpoint_ids_file" +} + +write_main_metadata_index() { + : > "$main_metadata_file" + if [[ "$main_ref_available" != "true" ]]; then + return + fi + + git ls-tree -r "$V2_MAIN_REF" | + awk -F '\t' \ + -v checkpoint_paths_file="$checkpoint_paths_file" \ + -v raw_sessions_file="$raw_sessions_file" \ + 
-v main_metadata_file="$main_metadata_file" \ + -v plan_entries_file="$plan_entries_file" ' + BEGIN { + while ((getline line < checkpoint_paths_file) > 0) { + split(line, fields, "\t") + checkpoint_by_path[fields[2]] = fields[1] + } + while ((getline line < raw_sessions_file) > 0) { + split(line, fields, "\t") + session_wanted[fields[2] "/" fields[3]] = 1 + checkpoint_has_raw[fields[2]] = 1 + } + } + NF >= 2 { + meta = $1 + path = $2 + n = split(path, parts, "/") + checkpoint_path = parts[1] "/" parts[2] + if (!(checkpoint_path in checkpoint_by_path)) { + next + } + checkpoint_id = checkpoint_by_path[checkpoint_path] + if (n == 3 && parts[3] == "metadata.json") { + if (!(checkpoint_path in checkpoint_has_raw)) { + next + } + print checkpoint_id "\t" checkpoint_path "\t-\tcheckpoint_metadata\t" path "\t" meta >> main_metadata_file + next + } + if (n == 4 && (parts[4] == "metadata.json" || parts[4] == "prompt.txt")) { + session_key = checkpoint_path "/" parts[3] + if (!(session_key in session_wanted)) { + next + } + kind = parts[4] == "metadata.json" ? "session_metadata" : "prompt" + print checkpoint_id "\t" checkpoint_path "\t" parts[3] "\t" kind "\t" path "\t" meta >> main_metadata_file + print meta "\t" path >> plan_entries_file + } + } + ' +} + +append_checkpoint_metadata_plan_entries() { + if ! awk -F '\t' '$4 == "checkpoint_metadata" { found = 1; exit } END { exit found ? 
0 : 1 }' "$main_metadata_file"; then + return + fi + + if [[ "$apply" == "true" ]]; then + rewrite_checkpoint_metadata_plan_entries + return + fi + + awk -F '\t' '$4 == "checkpoint_metadata" { print $6 "\t" $5 }' "$main_metadata_file" >> "$plan_entries_file" +} + +rewrite_checkpoint_metadata_plan_entries() { + command -v python3 >/dev/null 2>&1 || die "python3 is required for --apply metadata rewriting" + + local rewrite_dir rewrite_manifest rewrite_paths rewrite_hashes + rewrite_dir="$tmp_dir/rewritten-checkpoint-metadata" + rewrite_manifest="$tmp_dir/rewritten-checkpoint-metadata.tsv" + rewrite_paths="$tmp_dir/rewritten-checkpoint-metadata.paths" + rewrite_hashes="$tmp_dir/rewritten-checkpoint-metadata.hashes" + + mkdir -p "$rewrite_dir" + + python3 - "$main_metadata_file" "$raw_sessions_file" "$rewrite_dir" "$rewrite_manifest" "$rewrite_paths" <<'PY' +import json +import os +import subprocess +import sys + +main_metadata_file, raw_sessions_file, rewrite_dir, rewrite_manifest, rewrite_paths = sys.argv[1:] + +sessions_by_checkpoint = {} +with open(raw_sessions_file, "r", encoding="utf-8") as f: + for line in f: + line = line.rstrip("\n") + if not line: + continue + checkpoint_id, checkpoint_path, session_index = line.split("\t") + del checkpoint_id + sessions_by_checkpoint.setdefault(checkpoint_path, set()).add(session_index) + +records = [] +with open(main_metadata_file, "r", encoding="utf-8") as f: + for line in f: + line = line.rstrip("\n") + if not line: + continue + fields = line.split("\t") + if len(fields) < 6: + continue + checkpoint_id, checkpoint_path, session_index, kind, target_path, object_info = fields[:6] + del checkpoint_id, session_index + if kind == "checkpoint_metadata" and checkpoint_path in sessions_by_checkpoint: + object_parts = object_info.split() + if len(object_parts) != 3 or object_parts[1] != "blob": + sys.stderr.write(f"unexpected ls-tree metadata for {target_path}: {object_info}\n") + sys.exit(1) + records.append((object_parts[2], 
target_path, checkpoint_path)) + +if not records: + open(rewrite_manifest, "w", encoding="utf-8").close() + open(rewrite_paths, "w", encoding="utf-8").close() + sys.exit(0) + +batch_input = "".join(blob + "\n" for blob, _, _ in records).encode("ascii") +batch = subprocess.run( + ["git", "cat-file", "--batch"], + input=batch_input, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, +) +if batch.returncode != 0: + sys.stderr.write(batch.stderr.decode("utf-8", errors="replace")) + sys.exit(batch.returncode) + +out = batch.stdout +offset = 0 + +with open(rewrite_manifest, "w", encoding="utf-8") as manifest, open(rewrite_paths, "w", encoding="utf-8") as paths: + for blob, target_path, checkpoint_path in records: + header_end = out.find(b"\n", offset) + if header_end < 0: + sys.stderr.write(f"missing git cat-file header for {blob}\n") + sys.exit(1) + header = out[offset:header_end].decode("ascii", errors="replace") + offset = header_end + 1 + parts = header.split() + if len(parts) < 3 or parts[1] != "blob": + sys.stderr.write(f"unexpected git cat-file header for {blob}: {header}\n") + sys.exit(1) + size = int(parts[2]) + data = out[offset:offset + size] + offset += size + if offset >= len(out) or out[offset:offset + 1] != b"\n": + sys.stderr.write(f"missing git cat-file record separator for {blob}\n") + sys.exit(1) + offset += 1 + + metadata = json.loads(data.decode("utf-8")) + session_entries = [] + for session_index in sorted(sessions_by_checkpoint[checkpoint_path], key=lambda value: int(value)): + session_prefix = f"/{checkpoint_path}/{session_index}" + session_entries.append({ + "metadata": f"{session_prefix}/metadata.json", + "transcript": f"{session_prefix}/full.jsonl", + "content_hash": f"{session_prefix}/content_hash.txt", + "prompt": f"{session_prefix}/prompt.txt", + }) + metadata["sessions"] = session_entries + + rewritten_path = os.path.join(rewrite_dir, checkpoint_path, "metadata.json") + os.makedirs(os.path.dirname(rewritten_path), 
exist_ok=True) + with open(rewritten_path, "w", encoding="utf-8") as rewritten: + json.dump(metadata, rewritten, indent=2) + rewritten.write("\n") + + manifest.write(f"{rewritten_path}\t{target_path}\n") + paths.write(rewritten_path + "\n") +PY + + if [[ ! -s "$rewrite_paths" ]]; then + return + fi + + git hash-object -w --stdin-paths < "$rewrite_paths" > "$rewrite_hashes" + awk -F '\t' ' + NR == FNR { + target[FNR] = $2 + next + } + { + print "100644 blob " $1 "\t" target[FNR] + } + ' "$rewrite_manifest" "$rewrite_hashes" >> "$plan_entries_file" +} + +compute_plan_counts() { + planned_checkpoints=$(wc -l < "$raw_checkpoint_ids_file" | tr -d '[:space:]') + planned_sessions=$(wc -l < "$raw_sessions_file" | tr -d '[:space:]') + planned_raw_transcripts=$(awk -F '\t' '$5 == "raw_transcript" { count++ } END { print count + 0 }' "$full_artifacts_file") + missing_raw_checkpoints=$(awk ' + NR == FNR { + raw[$1] = 1 + next + } + NF && !($1 in raw) { + count++ + } + END { + print count + 0 + } + ' "$raw_checkpoint_ids_file" "$checkpoint_ids_file") + missing_metadata_checkpoints=$(awk -F '\t' ' + NR == FNR { + if ($4 == "checkpoint_metadata") { + have[$1] = 1 + } + next + } + NF && !($1 in have) { + count++ + } + END { + print count + 0 + } + ' "$main_metadata_file" "$raw_checkpoint_ids_file") + missing_metadata_sessions=$(awk -F '\t' ' + NR == FNR { + if ($4 == "session_metadata") { + have[$1 "\t" $3] = 1 + } + next + } + NF { + key = $1 "\t" $3 + if (!(key in have)) { + count++ + } + } + END { + print count + 0 + } + ' "$main_metadata_file" "$raw_sessions_file") +} + while [[ $# -gt 0 ]]; do case "$1" in -h|--help) @@ -253,7 +572,16 @@ if (( mode_count > 1 )); then die "--list, --dry-run, and --apply are mutually exclusive" fi -plan_entries_file=$(mktemp "${TMPDIR:-/tmp}/v2-to-v1-plan.XXXXXX") +tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/v2-to-v1.XXXXXX") +plan_entries_file="$tmp_dir/plan_entries" +checkpoint_commits_file="$tmp_dir/checkpoint_commits" 
+checkpoint_ids_file="$tmp_dir/checkpoint_ids" +checkpoint_paths_file="$tmp_dir/checkpoint_paths" +full_artifacts_file="$tmp_dir/full_artifacts" +raw_sessions_file="$tmp_dir/raw_sessions" +raw_checkpoint_ids_file="$tmp_dir/raw_checkpoint_ids" +main_metadata_file="$tmp_dir/main_metadata" +: > "$plan_entries_file" if [[ -z "$repo_path" ]]; then printf 'Local repo path: ' >&2 @@ -288,22 +616,12 @@ if [[ -n "$head_commitish" ]]; then die "$since_commit is not an ancestor of $head_commitish" fi -main_ref_available=false -if git show-ref --verify --quiet "$V2_MAIN_REF"; then - main_ref_available=true -else - warn "missing $V2_MAIN_REF; companion metadata paths will not be shown" -fi - -full_refs=$(list_full_refs) -[[ -n "$full_refs" ]] || die "missing refs under $V2_FULL_REF_PREFIX; cannot locate raw transcripts" - if [[ -n "$head_hash" ]]; then - checkpoint_ids=$(list_checkpoint_ids_between "$since_hash" "$head_hash") + write_checkpoint_commit_index_between "$since_hash" "$head_hash" "$checkpoint_commits_file" else - checkpoint_ids=$(list_checkpoint_ids_from_all_refs "$since_hash") + write_checkpoint_commit_index_from_all_refs "$since_hash" "$checkpoint_commits_file" fi -if [[ -z "$checkpoint_ids" ]]; then +if [[ ! 
-s "$checkpoint_commits_file" ]]; then if [[ -n "$head_hash" ]]; then printf 'No %s trailers found in %s..%s\n' "$TRAILER_KEY" "$since_hash" "$head_hash" else @@ -311,23 +629,41 @@ if [[ -z "$checkpoint_ids" ]]; then fi exit 0 fi +write_checkpoint_id_files if [[ "$list_mode" == "true" ]]; then - checkpoint_count=$(printf '%s\n' "$checkpoint_ids" | sed '/^$/d' | wc -l | tr -d '[:space:]') + checkpoint_count=$(wc -l < "$checkpoint_ids_file" | tr -d '[:space:]') printf 'Checkpoints: %s\n' "$checkpoint_count" printf 'checkpoint_id\tcommit_ids\n' - while IFS= read -r checkpoint_id; do - [[ -n "$checkpoint_id" ]] || continue - commits=$(list_commit_ids_for_checkpoint "$checkpoint_id" "$since_hash" "$head_hash" | tr '\n' ' ' | sed 's/[[:space:]]*$//') - printf '%s\t' "$checkpoint_id" - if [[ -n "$commits" ]]; then - printf '%s' "$commits" - fi - printf '\n' - done <<< "$checkpoint_ids" + awk -F '\t' ' + NF >= 2 { + if (!seen_checkpoint[$1]++) { + order[++n] = $1 + } + key = $1 SUBSEP $2 + if (!seen_pair[key]++) { + commits[$1] = commits[$1] == "" ? 
$2 : commits[$1] " " $2 + } + } + END { + for (i = 1; i <= n; i++) { + print order[i] "\t" commits[order[i]] + } + } + ' "$checkpoint_commits_file" exit 0 fi +main_ref_available=false +if git show-ref --verify --quiet "$V2_MAIN_REF"; then + main_ref_available=true +else + warn "missing $V2_MAIN_REF; companion metadata paths will not be shown" +fi + +full_refs=$(list_full_refs) +[[ -n "$full_refs" ]] || die "missing refs under $V2_FULL_REF_PREFIX; cannot locate raw transcripts" + printf 'Repository: %s\n' "$repo_root" if [[ -n "$head_hash" ]]; then printf 'Scanning commits: %s..%s\n' "$since_hash" "$head_hash" @@ -341,119 +677,74 @@ printf 'Full refs:\n' printf '%s\n' "$full_refs" | sed 's/^/ /' printf '\n' -planned_checkpoints=0 -planned_sessions=0 -planned_raw_transcripts=0 -missing_raw_checkpoints=0 -missing_metadata_checkpoints=0 -missing_metadata_sessions=0 - -while IFS= read -r checkpoint_id; do - [[ -n "$checkpoint_id" ]] || continue - - checkpoint_path=$(checkpoint_to_path "$checkpoint_id") +write_full_artifact_index +write_main_metadata_index +append_checkpoint_metadata_plan_entries +compute_plan_counts - found_full_artifact=false - found_sessions="" - checkpoint_output="" - while IFS= read -r full_ref; do - [[ -n "$full_ref" ]] || continue - - sessions=$(list_numeric_dirs "$full_ref" "$checkpoint_path") - printed_full_checkpoint=false - while IFS= read -r session_index; do - [[ -n "$session_index" ]] || continue - session_path="$checkpoint_path/$session_index" - artifacts=$(list_full_artifacts "$full_ref" "$session_path") - if [[ -z "$artifacts" ]]; then - continue - fi - - found_full_artifact=true - found_sessions="${found_sessions}${session_index}"$'\n' - if [[ "$printed_full_checkpoint" != "true" ]]; then - printed_full_checkpoint=true - checkpoint_output="${checkpoint_output} full checkpoint folder: ${full_ref}:${checkpoint_path}"$'\n' - fi - checkpoint_output="${checkpoint_output} full session folder: ${full_ref}:${session_path}"$'\n' - while IFS= 
read -r artifact; do - [[ -n "$artifact" ]] || continue - checkpoint_output="${checkpoint_output} raw artifact: ${full_ref}:${session_path}/${artifact}"$'\n' - done <<< "$artifacts" - done <<< "$sessions" - done <<< "$full_refs" - - if [[ "$found_full_artifact" != "true" ]]; then - warn "no raw_transcript artifacts found for checkpoint $checkpoint_id" - missing_raw_checkpoints=$((missing_raw_checkpoints + 1)) - continue +if [[ "$dry_run" != "true" ]]; then + if (( missing_raw_checkpoints > 0 )); then + warn "$missing_raw_checkpoints checkpoint trailer(s) do not have raw_transcript artifacts and will be skipped" fi + if (( missing_metadata_checkpoints > 0 )); then + warn "$missing_metadata_checkpoints checkpoint(s) with raw transcripts are missing companion checkpoint metadata" + fi + if (( missing_metadata_sessions > 0 )); then + warn "$missing_metadata_sessions session(s) with raw transcripts are missing companion session metadata" + fi +fi - planned_checkpoints=$((planned_checkpoints + 1)) +if [[ "$dry_run" == "true" ]]; then + while IFS= read -r checkpoint_id; do + [[ -n "$checkpoint_id" ]] || continue - if [[ "$main_ref_available" == "true" ]] && tree_path_exists "$V2_MAIN_REF" "$checkpoint_path"; then - checkpoint_output="${checkpoint_output} companion metadata folder: ${V2_MAIN_REF}:${checkpoint_path}"$'\n' - if tree_path_exists "$V2_MAIN_REF" "$checkpoint_path/metadata.json"; then - checkpoint_output="${checkpoint_output} checkpoint metadata: ${V2_MAIN_REF}:${checkpoint_path}/metadata.json"$'\n' - append_tree_entry_from_source "$V2_MAIN_REF" "$checkpoint_path/metadata.json" "$checkpoint_path/metadata.json" || - warn "failed to plan checkpoint metadata for $checkpoint_id" - fi + checkpoint_path=$(checkpoint_to_path "$checkpoint_id") - metadata_sessions=$(printf '%s' "$found_sessions" | sed '/^$/d' | sort -n | uniq) - while IFS= read -r session_index; do - [[ -n "$session_index" ]] || continue - session_path="$checkpoint_path/$session_index" - 
planned_sessions=$((planned_sessions + 1)) - if tree_path_exists "$V2_MAIN_REF" "$session_path/metadata.json"; then - checkpoint_output="${checkpoint_output} session metadata: ${V2_MAIN_REF}:${session_path}/metadata.json"$'\n' - append_tree_entry_from_source "$V2_MAIN_REF" "$session_path/metadata.json" "$session_path/metadata.json" || - warn "failed to plan session metadata for checkpoint $checkpoint_id session $session_index" - else - missing_metadata_sessions=$((missing_metadata_sessions + 1)) - warn "missing session metadata for checkpoint $checkpoint_id session $session_index on $V2_MAIN_REF" - fi - if tree_path_exists "$V2_MAIN_REF" "$session_path/prompt.txt"; then - checkpoint_output="${checkpoint_output} prompt: ${V2_MAIN_REF}:${session_path}/prompt.txt"$'\n' - append_tree_entry_from_source "$V2_MAIN_REF" "$session_path/prompt.txt" "$session_path/prompt.txt" || - warn "failed to plan prompt for checkpoint $checkpoint_id session $session_index" - fi - done <<< "$metadata_sessions" - elif [[ "$main_ref_available" == "true" ]]; then - missing_metadata_checkpoints=$((missing_metadata_checkpoints + 1)) - warn "checkpoint $checkpoint_id has raw transcript artifacts but no companion metadata on $V2_MAIN_REF" - fi + checkpoint_output="" - while IFS=$'\t' read -r full_ref session_index artifact; do - [[ -n "${full_ref:-}" && -n "${session_index:-}" && -n "${artifact:-}" ]] || continue - session_path="$checkpoint_path/$session_index" - v1_artifact=$(v1_raw_artifact_name "$artifact") || continue - append_tree_entry_from_source "$full_ref" "$session_path/$artifact" "$session_path/$v1_artifact" || - warn "failed to plan raw artifact for checkpoint $checkpoint_id session $session_index: $artifact" - if [[ "$artifact" == "raw_transcript" ]]; then - planned_raw_transcripts=$((planned_raw_transcripts + 1)) + if ! awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id { found = 1; exit } END { exit found ? 
0 : 1 }' "$full_artifacts_file"; then + warn "no raw_transcript artifacts found for checkpoint $checkpoint_id" + continue fi - done < <( - while IFS= read -r full_ref; do - [[ -n "$full_ref" ]] || continue - sessions=$(list_numeric_dirs "$full_ref" "$checkpoint_path") - while IFS= read -r session_index; do - [[ -n "$session_index" ]] || continue - session_path="$checkpoint_path/$session_index" - artifacts=$(list_full_artifacts "$full_ref" "$session_path") - while IFS= read -r artifact; do - [[ -n "$artifact" ]] || continue - printf '%s\t%s\t%s\n' "$full_ref" "$session_index" "$artifact" - done <<< "$artifacts" - done <<< "$sessions" - done <<< "$full_refs" - ) - - if [[ "$dry_run" == "true" ]]; then + checkpoint_output=$(awk -F '\t' -v checkpoint_id="$checkpoint_id" ' + $1 == checkpoint_id { + full_checkpoint_key = $4 SUBSEP $2 + if (!seen_full_checkpoint[full_checkpoint_key]++) { + print " full checkpoint folder: " $4 ":" $2 + } + session_key = $4 SUBSEP $2 SUBSEP $3 + if (!seen_session[session_key]++) { + print " full session folder: " $4 ":" $2 "/" $3 + } + print " raw artifact: " $4 ":" $6 + } + ' "$full_artifacts_file") + + if [[ "$main_ref_available" == "true" ]] && awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id && $4 == "checkpoint_metadata" { found = 1; exit } END { exit found ? 
0 : 1 }' "$main_metadata_file"; then + metadata_output=$(awk -F '\t' -v checkpoint_id="$checkpoint_id" -v main_ref="$V2_MAIN_REF" ' + $1 == checkpoint_id { + if (!printed_folder++) { + print " companion metadata folder: " main_ref ":" $2 + } + if ($4 == "checkpoint_metadata") { + print " checkpoint metadata: " main_ref ":" $5 + } else if ($4 == "session_metadata") { + print " session metadata: " main_ref ":" $5 + } else if ($4 == "prompt") { + print " prompt: " main_ref ":" $5 + } + } + ' "$main_metadata_file") + checkpoint_output="${checkpoint_output}"$'\n'"${metadata_output}" + elif [[ "$main_ref_available" == "true" ]]; then + warn "checkpoint $checkpoint_id has raw transcript artifacts but no companion metadata on $V2_MAIN_REF" + fi + printf 'checkpoint %s\n' "$checkpoint_id" printf '%s' "$checkpoint_output" printf '\n' - fi -done <<< "$checkpoint_ids" + done < "$checkpoint_ids_file" +fi planned_entries=$(wc -l < "$plan_entries_file" | tr -d '[:space:]') unique_planned_entries=$(write_unique_mktree_input "$plan_entries_file" | wc -l | tr -d '[:space:]') @@ -472,13 +763,39 @@ if [[ "$dry_run" != "true" ]]; then fi if [[ "$apply" == "true" ]]; then - # This is the final write path, intentionally blocked until the migration - # behavior is reviewed with real repo output. It reuses v2 blob objects, - # builds one complete v1 tree, and would then update V1_REF to the commit. 
- die "--apply is scaffolded but intentionally blocked before creating a migration commit or updating $V1_REF" + if [[ "$unique_planned_entries" == "0" ]]; then + die "nothing to migrate: no v1 tree entries were planned" + fi + + old_ref_hash="" + if git show-ref --verify --quiet "$V1_REF"; then + old_ref_hash=$(git rev-parse "$V1_REF^{commit}") + fi + + if [[ -n "$old_ref_hash" ]]; then + old_tree_hash=$(git rev-parse "$old_ref_hash^{tree}") + else + old_tree_hash="" + fi + + tree_hash=$(build_v1_tree_from_plan "$plan_entries_file") + if [[ -n "$old_tree_hash" && "$old_tree_hash" == "$tree_hash" ]]; then + printf '%s is already up to date; no migration commit created.\n' "$V1_REF" + exit 0 + fi + + commit_hash=$(create_v1_migration_commit "$tree_hash") + if [[ -n "$old_ref_hash" ]]; then + git update-ref "$V1_REF" "$commit_hash" "$old_ref_hash" + else + git update-ref "$V1_REF" "$commit_hash" + fi + + printf 'Wrote migration commit: %s\n' "$commit_hash" + printf 'Updated %s\n' "$V1_REF" + exit 0 fi if [[ "$dry_run" != "true" ]]; then - printf 'No refs were written. Use --dry-run to print every source and target artifact.\n' - printf 'The single-commit apply path is scaffolded but blocked before updating %s.\n' "$V1_REF" + printf 'Plan only: no refs were written. 
Use --dry-run to print every source and target artifact, or --apply to write the migration commit.\n' fi From 5ef0e37545bcdd65084e6200d86df68acbaa5194 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Tue, 26 May 2026 11:21:40 -0700 Subject: [PATCH 03/35] Address v2 migration script review comments Entire-Checkpoint: 0951efc5bce0 --- scripts/migrate-v2-checkpoints-to-v1.sh | 97 +++++++++++-------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh index 20c44506e..065c4bdb8 100755 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -53,7 +53,7 @@ raw_checkpoint_ids_file="" main_metadata_file="" show_help() { - sed -n '3,/^$/p' "$0" | sed -E 's/^# ?//' + sed -n '5,/^$/p' "$0" | sed -E 's/^# ?//' } die() { @@ -77,20 +77,6 @@ checkpoint_to_path() { printf '%s/%s' "${checkpoint_id:0:2}" "${checkpoint_id:2}" } -tree_path_exists() { - local ref_name="$1" - local path="$2" - git cat-file -e "${ref_name}:${path}" 2>/dev/null -} - -list_numeric_dirs() { - local ref_name="$1" - local path="$2" - local entries - entries=$(git ls-tree -d --name-only "${ref_name}:${path}" 2>/dev/null || true) - printf '%s\n' "$entries" | sed -nE '/^[0-9]+$/p' -} - list_full_refs() { git for-each-ref --format='%(refname)' "$V2_FULL_REF_PREFIX" | sort | @@ -136,8 +122,21 @@ write_checkpoint_commit_index_between() { write_checkpoint_commit_index_from_all_refs() { local since="$1" local output_file="$2" + local refs_file="$tmp_dir/refs_containing_since" + + : > "$refs_file" + git for-each-ref --contains "$since" --format='%(refname)' refs/heads refs/remotes > "$refs_file" + if git merge-base --is-ancestor "$since" HEAD 2>/dev/null; then + printf 'HEAD\n' >> "$refs_file" + fi + sort -u "$refs_file" -o "$refs_file" + + if [[ ! 
-s "$refs_file" ]]; then + : > "$output_file" + return + fi - git log HEAD --branches --remotes --format='__ENTIRE_COMMIT__%H%n%B' --not "$since" | + xargs git log --format='__ENTIRE_COMMIT__%H%n%B' "$since".. < "$refs_file" | awk -v key="$TRAILER_KEY" ' /^__ENTIRE_COMMIT__/ { commit = substr($0, length("__ENTIRE_COMMIT__") + 1) @@ -157,24 +156,6 @@ write_checkpoint_commit_index_from_all_refs() { ' > "$output_file" } -v1_raw_artifact_name() { - local artifact="$1" - case "$artifact" in - raw_transcript) - printf 'full.jsonl' - ;; - raw_transcript.[0-9][0-9][0-9]) - printf 'full.jsonl%s' "${artifact#raw_transcript}" - ;; - raw_transcript_hash.txt) - printf 'content_hash.txt' - ;; - *) - return 1 - ;; - esac -} - write_unique_mktree_input() { local entries_file="$1" awk -F '\t' 'NF >= 2 && !seen[$2]++ { print }' "$entries_file" | @@ -475,10 +456,14 @@ compute_plan_counts() { planned_checkpoints=$(wc -l < "$raw_checkpoint_ids_file" | tr -d '[:space:]') planned_sessions=$(wc -l < "$raw_sessions_file" | tr -d '[:space:]') planned_raw_transcripts=$(awk -F '\t' '$5 == "raw_transcript" { count++ } END { print count + 0 }' "$full_artifacts_file") - missing_raw_checkpoints=$(awk ' - NR == FNR { - raw[$1] = 1 - next + missing_raw_checkpoints=$(awk -v raw_file="$raw_checkpoint_ids_file" ' + BEGIN { + while ((getline line < raw_file) > 0) { + if (line != "") { + raw[line] = 1 + } + } + close(raw_file) } NF && !($1 in raw) { count++ @@ -486,13 +471,16 @@ compute_plan_counts() { END { print count + 0 } - ' "$raw_checkpoint_ids_file" "$checkpoint_ids_file") - missing_metadata_checkpoints=$(awk -F '\t' ' - NR == FNR { - if ($4 == "checkpoint_metadata") { - have[$1] = 1 + ' "$checkpoint_ids_file") + missing_metadata_checkpoints=$(awk -F '\t' -v metadata_file="$main_metadata_file" ' + BEGIN { + while ((getline line < metadata_file) > 0) { + split(line, fields, "\t") + if (fields[4] == "checkpoint_metadata") { + have[fields[1]] = 1 + } } - next + close(metadata_file) } NF && !($1 in 
have) { count++ @@ -500,13 +488,16 @@ compute_plan_counts() { END { print count + 0 } - ' "$main_metadata_file" "$raw_checkpoint_ids_file") - missing_metadata_sessions=$(awk -F '\t' ' - NR == FNR { - if ($4 == "session_metadata") { - have[$1 "\t" $3] = 1 + ' "$raw_checkpoint_ids_file") + missing_metadata_sessions=$(awk -F '\t' -v metadata_file="$main_metadata_file" ' + BEGIN { + while ((getline line < metadata_file) > 0) { + split(line, fields, "\t") + if (fields[4] == "session_metadata") { + have[fields[1] "\t" fields[3]] = 1 + } } - next + close(metadata_file) } NF { key = $1 "\t" $3 @@ -517,7 +508,7 @@ compute_plan_counts() { END { print count + 0 } - ' "$main_metadata_file" "$raw_sessions_file") + ' "$raw_sessions_file") } while [[ $# -gt 0 ]]; do @@ -625,7 +616,7 @@ if [[ ! -s "$checkpoint_commits_file" ]]; then if [[ -n "$head_hash" ]]; then printf 'No %s trailers found in %s..%s\n' "$TRAILER_KEY" "$since_hash" "$head_hash" else - printf 'No %s trailers found on local branches/remotes after %s\n' "$TRAILER_KEY" "$since_hash" + printf 'No %s trailers found on local branches/remotes containing %s\n' "$TRAILER_KEY" "$since_hash" fi exit 0 fi @@ -668,7 +659,7 @@ printf 'Repository: %s\n' "$repo_root" if [[ -n "$head_hash" ]]; then printf 'Scanning commits: %s..%s\n' "$since_hash" "$head_hash" else - printf 'Scanning commits: local branches/remotes after %s\n' "$since_hash" + printf 'Scanning commits: local branches/remotes containing %s\n' "$since_hash" fi if [[ "$main_ref_available" == "true" ]]; then printf 'Companion metadata ref: %s\n' "$V2_MAIN_REF" From 09dd22e8b2387df91485f1df6631e7633c5db297 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Tue, 26 May 2026 12:21:53 -0700 Subject: [PATCH 04/35] Write migrated checkpoints as dated v1 commits Entire-Checkpoint: 80c40cdeb702 --- scripts/migrate-v2-checkpoints-to-v1.sh | 299 +++++++++++++++++++++--- 1 file changed, 269 insertions(+), 30 deletions(-) diff --git 
a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh index 065c4bdb8..07f8fee25 100755 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -11,7 +11,7 @@ set -euo pipefail # -h, --help Show this help message # --list Print checkpoint IDs and associated commit IDs only # --dry-run Print every v2 folder/file that would be migrated -# --apply Write one local refs/heads/entire/checkpoints/v1 migration commit +# --apply Write local refs/heads/entire/checkpoints/v1 migration commits # --repo Local repository path to inspect # --since Commit before the checkpoints to inspect # --head Limit scan to one history tip (default: all branches/remotes) @@ -25,8 +25,8 @@ set -euo pipefail # refs/entire/checkpoints/v2/full/*://raw_transcript* # # The default mode prints a migration plan without writing refs. --dry-run -# prints every source folder/file. --apply writes one local migration commit to -# refs/heads/entire/checkpoints/v1. +# prints every source folder/file. --apply writes one local migration commit +# per checkpoint to refs/heads/entire/checkpoints/v1. # # If --repo or SINCE_COMMIT is omitted, the script prompts for it. 
# @@ -162,18 +162,17 @@ write_unique_mktree_input() { sort -k2 } -build_v1_tree_from_plan() { - local entries_file="$1" +build_v1_tree_from_entries() { + local base_tree_hash="$1" + local entries_file="$2" local combined index_file combined="$tmp_dir/combined_index_info" index_file="$tmp_dir/migration.index" rm -f "$index_file" - if git show-ref --verify --quiet "$V1_REF"; then - cat "$entries_file" > "$combined" - git ls-tree -r "$V1_REF" >> "$combined" - else - cat "$entries_file" > "$combined" + cat "$entries_file" > "$combined" + if [[ -n "$base_tree_hash" ]]; then + git ls-tree -r "$base_tree_hash" >> "$combined" fi write_unique_mktree_input "$combined" | @@ -181,17 +180,29 @@ build_v1_tree_from_plan() { GIT_INDEX_FILE="$index_file" git write-tree } -create_v1_migration_commit() { - local tree_hash="$1" - local parent_hash commit_hash - - if git show-ref --verify --quiet "$V1_REF"; then - parent_hash=$(git rev-parse "$V1_REF^{commit}") - commit_hash=$(printf 'Migrate checkpoints v2 to v1\n\nSource refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | - git commit-tree "$tree_hash" -p "$parent_hash") +create_v1_checkpoint_migration_commit() { + local checkpoint_id="$1" + local tree_hash="$2" + local parent_hash="$3" + local commit_date="$4" + local commit_hash + + if [[ -n "$parent_hash" ]]; then + if [[ -n "$commit_date" ]]; then + commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" -p "$parent_hash") + else + commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + git commit-tree "$tree_hash" -p "$parent_hash") + fi else - commit_hash=$(printf 'Migrate checkpoints v2 to v1\n\nSource refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | - git commit-tree 
"$tree_hash") + if [[ -n "$commit_date" ]]; then + commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash") + else + commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | + git commit-tree "$tree_hash") + fi fi printf '%s\n' "$commit_hash" @@ -202,6 +213,188 @@ write_checkpoint_id_files() { awk 'NF { print $0 "\t" substr($0, 1, 2) "/" substr($0, 3) }' "$checkpoint_ids_file" > "$checkpoint_paths_file" } +write_apply_checkpoint_ids() { + local output_file="$1" + + awk ' + NR == FNR { + raw[$1] = 1 + next + } + NF && ($1 in raw) { + ids[++n] = $1 + } + END { + for (i = n; i >= 1; i--) { + print ids[i] + } + } + ' "$raw_checkpoint_ids_file" "$checkpoint_ids_file" > "$output_file" +} + +write_bulk_migration_entries() { + local bulk_commit="$1" + local parent_commit="$2" + local output_entries_file="$3" + local output_ids_file="$4" + local changed_paths_file checkpoint_dirs_file + + changed_paths_file="$tmp_dir/bulk_changed_paths" + checkpoint_dirs_file="$tmp_dir/bulk_checkpoint_dirs" + + if [[ -n "$parent_commit" ]]; then + git diff --name-only "$parent_commit" "$bulk_commit" > "$changed_paths_file" + else + git ls-tree -r --name-only "$bulk_commit" > "$changed_paths_file" + fi + + awk -F '/' ' + NF >= 3 && $1 ~ /^[0-9a-f][0-9a-f]$/ && $2 ~ /^[0-9a-f]{10}$/ { + dir = $1 "/" $2 + if (!seen[dir]++) { + print dir + } + } + ' "$changed_paths_file" > "$checkpoint_dirs_file" + + awk -F '/' 'NF >= 2 { print $1 $2 }' "$checkpoint_dirs_file" > "$output_ids_file" + + git ls-tree -r "$bulk_commit" | + awk -F '\t' -v checkpoint_dirs_file="$checkpoint_dirs_file" ' + BEGIN { + while ((getline dir < checkpoint_dirs_file) > 0) { + prefixes[dir "/"] = 1 + } + close(checkpoint_dirs_file) + } + NF 
>= 2 { + for (prefix in prefixes) { + if (index($2, prefix) == 1) { + print + next + } + } + } + ' > "$output_entries_file" +} + +write_checkpoint_entries() { + local checkpoint_id="$1" + local output_file="$2" + local source_entries_file="$3" + local checkpoint_path + + checkpoint_path=$(checkpoint_to_path "$checkpoint_id") + awk -F '\t' -v prefix="${checkpoint_path}/" 'NF >= 2 && index($2, prefix) == 1 { print }' "$source_entries_file" > "$output_file" +} + +write_checkpoint_commit_dates() { + local source_entries_file="$1" + local output_file="$2" + + command -v python3 >/dev/null 2>&1 || die "python3 is required for --apply metadata date extraction" + + python3 - "$source_entries_file" "$output_file" <<'PY' +import json +import re +import subprocess +import sys + +source_entries_file, output_file = sys.argv[1:] +metadata_re = re.compile(r"^([0-9a-f]{2})/([0-9a-f]{10})/[0-9]+/metadata\.json$") + +records = [] +with open(source_entries_file, "r", encoding="utf-8") as f: + for line in f: + line = line.rstrip("\n") + if not line or "\t" not in line: + continue + object_info, path = line.split("\t", 1) + match = metadata_re.match(path) + if not match: + continue + object_parts = object_info.split() + if len(object_parts) != 3 or object_parts[1] != "blob": + continue + checkpoint_id = match.group(1) + match.group(2) + records.append((checkpoint_id, object_parts[2])) + +if not records: + open(output_file, "w", encoding="utf-8").close() + sys.exit(0) + +batch_input = "".join(blob + "\n" for _, blob in records).encode("ascii") +batch = subprocess.run( + ["git", "cat-file", "--batch"], + input=batch_input, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, +) +if batch.returncode != 0: + sys.stderr.write(batch.stderr.decode("utf-8", errors="replace")) + sys.exit(batch.returncode) + +dates = {} +out = batch.stdout +offset = 0 +for checkpoint_id, blob in records: + header_end = out.find(b"\n", offset) + if header_end < 0: + sys.stderr.write(f"missing git 
cat-file header for {blob}\n") + sys.exit(1) + header = out[offset:header_end].decode("ascii", errors="replace") + offset = header_end + 1 + parts = header.split() + if len(parts) < 3 or parts[1] != "blob": + sys.stderr.write(f"unexpected git cat-file header for {blob}: {header}\n") + sys.exit(1) + size = int(parts[2]) + data = out[offset:offset + size] + offset += size + if offset >= len(out) or out[offset:offset + 1] != b"\n": + sys.stderr.write(f"missing git cat-file record separator for {blob}\n") + sys.exit(1) + offset += 1 + + metadata = json.loads(data.decode("utf-8")) + created_at = metadata.get("created_at") + if created_at and (checkpoint_id not in dates or created_at < dates[checkpoint_id]): + dates[checkpoint_id] = created_at + +with open(output_file, "w", encoding="utf-8") as f: + for checkpoint_id in sorted(dates): + f.write(f"{checkpoint_id}\t{dates[checkpoint_id]}\n") +PY +} + +order_checkpoint_ids_by_date() { + local checkpoint_ids_source_file="$1" + local checkpoint_dates_file="$2" + local output_file="$3" + + awk -F '\t' ' + NR == FNR { + date[$1] = $2 + next + } + NF { + order++ + sort_date = ($1 in date && date[$1] != "") ? 
date[$1] : sprintf("9999-12-31T23:59:59Z-%09d", order) + print sort_date "\t" sprintf("%09d", order) "\t" $1 + } + ' "$checkpoint_dates_file" "$checkpoint_ids_source_file" | + sort -t $'\t' -k1,1 -k2,2 | + cut -f3 > "$output_file" +} + +checkpoint_commit_date() { + local checkpoint_id="$1" + local checkpoint_dates_file="$2" + + awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id { print $2; exit }' "$checkpoint_dates_file" +} + write_full_artifact_index() { local full_ref : > "$full_artifacts_file" @@ -759,34 +952,80 @@ if [[ "$apply" == "true" ]]; then fi old_ref_hash="" + base_tree_hash="" + parent_hash="" + rewriting_bulk_migration=false if git show-ref --verify --quiet "$V1_REF"; then old_ref_hash=$(git rev-parse "$V1_REF^{commit}") + parent_hash="$old_ref_hash" + base_tree_hash=$(git rev-parse "$old_ref_hash^{tree}") + + if [[ "$(git log -1 --format=%s "$old_ref_hash")" == "Migrate checkpoints v2 to v1" ]]; then + rewriting_bulk_migration=true + if parent_hash=$(git rev-parse --verify --quiet "$old_ref_hash^"); then + base_tree_hash=$(git rev-parse "$parent_hash^{tree}") + else + parent_hash="" + base_tree_hash="" + fi + fi fi - if [[ -n "$old_ref_hash" ]]; then - old_tree_hash=$(git rev-parse "$old_ref_hash^{tree}") + source_plan_entries_file="$plan_entries_file" + apply_checkpoint_ids_source_file="$tmp_dir/apply_checkpoint_ids_source" + if [[ "$rewriting_bulk_migration" == "true" ]]; then + source_plan_entries_file="$tmp_dir/bulk_plan_entries" + write_bulk_migration_entries "$old_ref_hash" "$parent_hash" "$source_plan_entries_file" "$apply_checkpoint_ids_source_file" else - old_tree_hash="" + write_apply_checkpoint_ids "$apply_checkpoint_ids_source_file" fi - tree_hash=$(build_v1_tree_from_plan "$plan_entries_file") - if [[ -n "$old_tree_hash" && "$old_tree_hash" == "$tree_hash" ]]; then + checkpoint_dates_file="$tmp_dir/checkpoint_dates" + apply_checkpoint_ids_file="$tmp_dir/apply_checkpoint_ids" + write_checkpoint_commit_dates 
"$source_plan_entries_file" "$checkpoint_dates_file" + order_checkpoint_ids_by_date "$apply_checkpoint_ids_source_file" "$checkpoint_dates_file" "$apply_checkpoint_ids_file" + + commit_count=0 + current_tree_hash="$base_tree_hash" + final_commit_hash="$parent_hash" + while IFS= read -r checkpoint_id; do + [[ -n "$checkpoint_id" ]] || continue + checkpoint_entries_file="$tmp_dir/checkpoint-${checkpoint_id}.entries" + write_checkpoint_entries "$checkpoint_id" "$checkpoint_entries_file" "$source_plan_entries_file" + [[ -s "$checkpoint_entries_file" ]] || continue + + new_tree_hash=$(build_v1_tree_from_entries "$current_tree_hash" "$checkpoint_entries_file") + if [[ -n "$current_tree_hash" && "$new_tree_hash" == "$current_tree_hash" ]]; then + continue + fi + + commit_date=$(checkpoint_commit_date "$checkpoint_id" "$checkpoint_dates_file") + final_commit_hash=$(create_v1_checkpoint_migration_commit "$checkpoint_id" "$new_tree_hash" "$final_commit_hash" "$commit_date") + current_tree_hash="$new_tree_hash" + commit_count=$((commit_count + 1)) + done < "$apply_checkpoint_ids_file" + + if (( commit_count == 0 )); then printf '%s is already up to date; no migration commit created.\n' "$V1_REF" exit 0 fi - commit_hash=$(create_v1_migration_commit "$tree_hash") if [[ -n "$old_ref_hash" ]]; then - git update-ref "$V1_REF" "$commit_hash" "$old_ref_hash" + git update-ref "$V1_REF" "$final_commit_hash" "$old_ref_hash" else - git update-ref "$V1_REF" "$commit_hash" + git update-ref "$V1_REF" "$final_commit_hash" fi - printf 'Wrote migration commit: %s\n' "$commit_hash" + if [[ "$rewriting_bulk_migration" == "true" ]]; then + printf 'Rewrote previous bulk migration into %s per-checkpoint commit(s).\n' "$commit_count" + else + printf 'Wrote %s per-checkpoint migration commit(s).\n' "$commit_count" + fi + printf 'Latest migration commit: %s\n' "$final_commit_hash" printf 'Updated %s\n' "$V1_REF" exit 0 fi if [[ "$dry_run" != "true" ]]; then - printf 'Plan only: no refs were written. 
Use --dry-run to print every source and target artifact, or --apply to write the migration commit.\n' + printf 'Plan only: no refs were written. Use --dry-run to print every source and target artifact, or --apply to write migration commits.\n' fi From 1cb2ec4119ae1b0deabe4c38cd83d8262127f1f4 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Tue, 26 May 2026 12:27:53 -0700 Subject: [PATCH 05/35] Qualify checkpoint push refspec Entire-Checkpoint: 2134122b88bc --- cmd/entire/cli/strategy/push_common.go | 7 +++- cmd/entire/cli/strategy/push_common_test.go | 43 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index a5f2e91b7..7ed7e0aeb 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -211,7 +211,7 @@ func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) ( ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) defer cancel() - result, err := remote.Push(ctx, remoteName, branchName) + result, err := remote.Push(ctx, remoteName, branchPushRefSpec(branchName)) outputStr := result.Output if err != nil { return pushResult{}, classifyPushFailure(ctx, outputStr, err) @@ -220,6 +220,11 @@ func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) ( return parsePushResult(outputStr), nil } +func branchPushRefSpec(branchName string) string { + branchRef := plumbing.NewBranchReferenceName(branchName).String() + return branchRef + ":" + branchRef +} + // protectedRefError means the remote is blocking writes to the ref itself. 
type protectedRefError struct { output string diff --git a/cmd/entire/cli/strategy/push_common_test.go b/cmd/entire/cli/strategy/push_common_test.go index 3dde109d0..859869f83 100644 --- a/cmd/entire/cli/strategy/push_common_test.go +++ b/cmd/entire/cli/strategy/push_common_test.go @@ -1336,6 +1336,49 @@ func TestDoPushBranch_NewContent_SaysDone(t *testing.T) { assert.NotContains(t, output, "already up-to-date", "should not say 'already up-to-date' when content was pushed") } +// TestDoPushBranch_AmbiguousLocalRefs verifies that checkpoint pushes qualify +// the branch refspec. A stale refs/entire/checkpoints/v1 ref can otherwise make +// git reject the unqualified source ref as ambiguous. +// +// Not parallel: uses t.Chdir() and os.Stderr redirection. +func TestDoPushBranch_AmbiguousLocalRefs(t *testing.T) { + workDir := setupRepoWithCheckpointBranch(t) + + headCmd := exec.CommandContext(context.Background(), "git", "rev-parse", "HEAD") + headCmd.Dir = workDir + headCmd.Env = testutil.GitIsolatedEnv() + headOut, err := headCmd.Output() + require.NoError(t, err) + + staleRefCmd := exec.CommandContext( + context.Background(), + "git", + "update-ref", + "refs/entire/checkpoints/v1", + strings.TrimSpace(string(headOut)), + ) + staleRefCmd.Dir = workDir + staleRefCmd.Env = testutil.GitIsolatedEnv() + out, err := staleRefCmd.CombinedOutput() + require.NoError(t, err, "stale ref setup failed: %s", out) + + bareDir := t.TempDir() + initCmd := exec.CommandContext(context.Background(), "git", "init", "--bare") + initCmd.Dir = bareDir + initCmd.Env = testutil.GitIsolatedEnv() + out, err = initCmd.CombinedOutput() + require.NoError(t, err, "git init --bare failed: %s", out) + + t.Chdir(workDir) + + restore := captureStderr(t) + err = doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err) + assert.Contains(t, output, " done", "should push despite ambiguous local refs") +} + func TestIsProtectedRefRejection(t 
*testing.T) { t.Parallel() From acbd55841674f4132ec6587ffbd04fead0c4c617 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Tue, 26 May 2026 12:39:16 -0700 Subject: [PATCH 06/35] Repair split checkpoint migration rewrites Entire-Checkpoint: c006f720c485 --- scripts/migrate-v2-checkpoints-to-v1.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh index 07f8fee25..4ac4bc662 100755 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -94,6 +94,13 @@ list_full_refs() { ' } +find_bulk_migration_ancestor() { + local head_hash="$1" + + git log --first-parent --format='%H%x09%s' "$head_hash" | + awk -F '\t' '$2 == "Migrate checkpoints v2 to v1" { print $1; exit }' +} + write_checkpoint_commit_index_between() { local since="$1" local head="$2" @@ -954,15 +961,17 @@ if [[ "$apply" == "true" ]]; then old_ref_hash="" base_tree_hash="" parent_hash="" + bulk_migration_hash="" rewriting_bulk_migration=false if git show-ref --verify --quiet "$V1_REF"; then old_ref_hash=$(git rev-parse "$V1_REF^{commit}") parent_hash="$old_ref_hash" base_tree_hash=$(git rev-parse "$old_ref_hash^{tree}") - if [[ "$(git log -1 --format=%s "$old_ref_hash")" == "Migrate checkpoints v2 to v1" ]]; then + bulk_migration_hash=$(find_bulk_migration_ancestor "$old_ref_hash") + if [[ -n "$bulk_migration_hash" ]]; then rewriting_bulk_migration=true - if parent_hash=$(git rev-parse --verify --quiet "$old_ref_hash^"); then + if parent_hash=$(git rev-parse --verify --quiet "${bulk_migration_hash}^"); then base_tree_hash=$(git rev-parse "$parent_hash^{tree}") else parent_hash="" From 49dede5df8c6ae2912ff20832939c56e458f6fd1 Mon Sep 17 00:00:00 2001 From: computermode <2917645+computermode@users.noreply.github.com> Date: Tue, 26 May 2026 13:03:29 -0700 Subject: [PATCH 07/35] Record associated commits in v2 migration 
Entire-Checkpoint: d4f6517be448 --- scripts/migrate-v2-checkpoints-to-v1.sh | 36 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh index 4ac4bc662..94e82f735 100755 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ b/scripts/migrate-v2-checkpoints-to-v1.sh @@ -187,28 +187,46 @@ build_v1_tree_from_entries() { GIT_INDEX_FILE="$index_file" git write-tree } +write_original_associated_commit_trailers() { + local checkpoint_id="$1" + + awk -F '\t' -v checkpoint_id="$checkpoint_id" ' + NF >= 2 && $1 == checkpoint_id && !seen[$2]++ { + printf "Original-Associated-Commit: %s\n", $2 + } + ' "$checkpoint_commits_file" +} + +write_v1_checkpoint_migration_message() { + local checkpoint_id="$1" + + printf 'Checkpoint: %s\n\n' "$checkpoint_id" + printf 'Migrated from checkpoints v2.\n' + write_original_associated_commit_trailers "$checkpoint_id" + printf 'Source refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" +} + create_v1_checkpoint_migration_commit() { local checkpoint_id="$1" local tree_hash="$2" local parent_hash="$3" local commit_date="$4" - local commit_hash + local commit_hash message_file + + message_file="$tmp_dir/commit-message-${checkpoint_id}" + write_v1_checkpoint_migration_message "$checkpoint_id" > "$message_file" if [[ -n "$parent_hash" ]]; then if [[ -n "$commit_date" ]]; then - commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | - GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" -p "$parent_hash") + commit_hash=$(GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" -p "$parent_hash" < "$message_file") else - commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" 
"$V2_FULL_REF_PREFIX" | - git commit-tree "$tree_hash" -p "$parent_hash") + commit_hash=$(git commit-tree "$tree_hash" -p "$parent_hash" < "$message_file") fi else if [[ -n "$commit_date" ]]; then - commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | - GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash") + commit_hash=$(GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" < "$message_file") else - commit_hash=$(printf 'Checkpoint: %s\n\nMigrated from checkpoints v2.\nSource refs: %s and %s/*\n' "$checkpoint_id" "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" | - git commit-tree "$tree_hash") + commit_hash=$(git commit-tree "$tree_hash" < "$message_file") fi fi From d54434bb80d3ac07aff92eeb9fd23dc723ace36b Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 14:51:31 -0700 Subject: [PATCH 08/35] Preserve checkpoint commit timestamps Entire-Checkpoint: 7d613a7f649a --- cmd/entire/cli/checkpoint/checkpoint.go | 4 + cmd/entire/cli/checkpoint/committed.go | 13 ++- .../checkpoint/committed_commit_time_test.go | 100 ++++++++++++++++++ cmd/entire/cli/checkpoint/temporary.go | 6 +- 4 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 cmd/entire/cli/checkpoint/committed_commit_time_test.go diff --git a/cmd/entire/cli/checkpoint/checkpoint.go b/cmd/entire/cli/checkpoint/checkpoint.go index 35f3aaf07..9e17ee0b0 100644 --- a/cmd/entire/cli/checkpoint/checkpoint.go +++ b/cmd/entire/cli/checkpoint/checkpoint.go @@ -212,6 +212,10 @@ type WriteCommittedOptions struct { // the original v1 checkpoint time in v2 metadata and retention decisions. CreatedAt time.Time + // CommitTime is the optional git author/committer timestamp for the + // metadata-branch commit. When zero, writers use the current time. 
+ CommitTime time.Time + // Strategy is the name of the strategy that created this checkpoint Strategy string diff --git a/cmd/entire/cli/checkpoint/committed.go b/cmd/entire/cli/checkpoint/committed.go index 998e90595..28468940e 100644 --- a/cmd/entire/cli/checkpoint/committed.go +++ b/cmd/entire/cli/checkpoint/committed.go @@ -117,7 +117,11 @@ func (s *GitStore) WriteCommitted(ctx context.Context, opts WriteCommittedOption } commitMsg := s.buildCommitMessage(opts, taskMetadataPath) - newCommitHash, err := s.createCommit(ctx, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail) + commitTime := opts.CommitTime + if commitTime.IsZero() { + commitTime = time.Now() + } + newCommitHash, err := s.createCommitAt(ctx, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail, commitTime) if err != nil { return err } @@ -1932,11 +1936,14 @@ func GetGitAuthorFromRepo(repo *git.Repository) (name, email string) { // CreateCommit creates a git commit object with the given tree, parent, message, and author. // If parentHash is ZeroHash, the commit is created without a parent (orphan commit). 
func CreateCommit(ctx context.Context, repo *git.Repository, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string) (plumbing.Hash, error) { - now := time.Now() + return createCommitObject(ctx, repo, treeHash, parentHash, message, authorName, authorEmail, time.Now()) +} + +func createCommitObject(ctx context.Context, repo *git.Repository, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) (plumbing.Hash, error) { sig := object.Signature{ Name: authorName, Email: authorEmail, - When: now, + When: commitTime, } commit := &object.Commit{ diff --git a/cmd/entire/cli/checkpoint/committed_commit_time_test.go b/cmd/entire/cli/checkpoint/committed_commit_time_test.go new file mode 100644 index 000000000..91c5546c2 --- /dev/null +++ b/cmd/entire/cli/checkpoint/committed_commit_time_test.go @@ -0,0 +1,100 @@ +package checkpoint + +import ( + "context" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" + "github.com/stretchr/testify/require" +) + +const ( + commitTimeStrategy = "manual-commit" + commitTimeTestAuthor = "Test" + commitTimeTestEmail = "test@example.com" +) + +func TestWriteCommitted_CommitTime(t *testing.T) { + t.Parallel() + + repo, store := setupCommittedCommitTimeRepo(t) + commitTime := time.Date(2024, 3, 2, 1, 2, 3, 0, time.UTC) + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: id.MustCheckpointID("a1b2c3d4e5f6"), + SessionID: "session-commit-time", + CreatedAt: time.Date(2024, 3, 1, 1, 2, 3, 0, time.UTC), + CommitTime: commitTime, + Strategy: commitTimeStrategy, + Transcript: redact.AlreadyRedacted([]byte("transcript line\n")), + AuthorName: "Migration", + 
AuthorEmail: "migration@example.com", + }) + require.NoError(t, err) + + commit := metadataHeadCommit(t, repo) + require.True(t, commit.Author.When.Equal(commitTime), "author time = %s, want %s", commit.Author.When, commitTime) + require.True(t, commit.Committer.When.Equal(commitTime), "committer time = %s, want %s", commit.Committer.When, commitTime) +} + +func TestWriteCommitted_ZeroCommitTimeUsesCurrentTime(t *testing.T) { + t.Parallel() + + repo, store := setupCommittedCommitTimeRepo(t) + createdAt := time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC) + before := time.Now().Add(-time.Second) + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: id.MustCheckpointID("b2c3d4e5f6a1"), + SessionID: "session-current-time", + CreatedAt: createdAt, + Strategy: commitTimeStrategy, + Transcript: redact.AlreadyRedacted([]byte("transcript line\n")), + AuthorName: commitTimeTestAuthor, + AuthorEmail: commitTimeTestEmail, + }) + require.NoError(t, err) + after := time.Now().Add(time.Second) + + commit := metadataHeadCommit(t, repo) + require.False(t, commit.Author.When.Equal(createdAt), "zero CommitTime should not reuse CreatedAt as the commit timestamp") + require.False(t, commit.Author.When.Before(before), "author time = %s, want no earlier than %s", commit.Author.When, before) + require.False(t, commit.Author.When.After(after), "author time = %s, want no later than %s", commit.Author.When, after) + require.True(t, commit.Committer.When.Equal(commit.Author.When), "committer time = %s, want author time %s", commit.Committer.When, commit.Author.When) +} + +func setupCommittedCommitTimeRepo(t *testing.T) (*git.Repository, *GitStore) { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + + testutil.WriteFile(t, dir, "README.md", "# Test\n") + testutil.GitAdd(t, dir, "README.md") + testutil.GitCommit(t, dir, "initial commit") + + return repo, NewGitStore(repo) +} + +func 
metadataHeadCommit(t *testing.T, repo *git.Repository) *object.Commit { + t.Helper() + + ref, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + + commit, err := repo.CommitObject(ref.Hash()) + require.NoError(t, err) + + return commit +} diff --git a/cmd/entire/cli/checkpoint/temporary.go b/cmd/entire/cli/checkpoint/temporary.go index ec0cd6a8a..5800c645f 100644 --- a/cmd/entire/cli/checkpoint/temporary.go +++ b/cmd/entire/cli/checkpoint/temporary.go @@ -846,7 +846,11 @@ func (s *GitStore) buildTreeWithChanges( // createCommit creates a commit object. func (s *GitStore) createCommit(ctx context.Context, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string) (plumbing.Hash, error) { - return CreateCommit(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail) + return s.createCommitAt(ctx, treeHash, parentHash, message, authorName, authorEmail, time.Now()) +} + +func (s *GitStore) createCommitAt(ctx context.Context, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) (plumbing.Hash, error) { + return createCommitObject(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail, commitTime) } // Helper functions extracted from strategy/common.go From f513458c1b27f84111d71d9ae64d35cfbce23b01 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 15:09:30 -0700 Subject: [PATCH 09/35] Add v2 checkpoint migration command skeleton Entire-Checkpoint: 6e2dd4694dd4 --- cmd/migrate-v2-checkpoints/history.go | 248 ++++++++++++++++++++++++ cmd/migrate-v2-checkpoints/main.go | 162 ++++++++++++++++ cmd/migrate-v2-checkpoints/main_test.go | 180 +++++++++++++++++ 3 files changed, 590 insertions(+) create mode 100644 cmd/migrate-v2-checkpoints/history.go create mode 100644 cmd/migrate-v2-checkpoints/main.go create mode 100644 cmd/migrate-v2-checkpoints/main_test.go diff --git a/cmd/migrate-v2-checkpoints/history.go 
b/cmd/migrate-v2-checkpoints/history.go new file mode 100644 index 000000000..81871e5d1 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/history.go @@ -0,0 +1,248 @@ +package main + +import ( + "context" + "fmt" + "io" + "sort" + "strings" + "time" + + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/trailers" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" +) + +type discoveryOptions struct { + since string + head string +} + +type discoveredCheckpoint struct { + ID checkpointID.CheckpointID + Commits []discoveredCommit +} + +type discoveredCommit struct { + Hash plumbing.Hash + ShortSHA string + Date time.Time +} + +type historyTip struct { + name string + hash plumbing.Hash +} + +func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, error) { + excluded, err := excludedCommits(ctx, repo, opts.since) + if err != nil { + return nil, err + } + + tips, err := historyTips(repo, opts.head) + if err != nil { + return nil, err + } + + seenCommits := make(map[plumbing.Hash]bool) + checkpointIndexes := make(map[string]int) + checkpoints := make([]discoveredCheckpoint, 0) + + for _, tip := range tips { + if err := scanTip(ctx, repo, tip, excluded, seenCommits, checkpointIndexes, &checkpoints); err != nil { + return nil, err + } + } + + sortDiscoveredCheckpoints(checkpoints) + return checkpoints, nil +} + +func excludedCommits(ctx context.Context, repo *git.Repository, since string) (map[plumbing.Hash]bool, error) { + if since == "" { + return make(map[plumbing.Hash]bool), nil + } + + sinceHash, err := resolveRevision(repo, since) + if err != nil { + return nil, fmt.Errorf("resolve --since %q: %w", since, err) + } + return reachableCommits(ctx, repo, sinceHash) +} + +func historyTips(repo *git.Repository, head string) ([]historyTip, error) { + if head != "" { + hash, err := 
resolveRevision(repo, head) + if err != nil { + return nil, fmt.Errorf("resolve --head %q: %w", head, err) + } + return []historyTip{{name: head, hash: hash}}, nil + } + + iter, err := repo.References() + if err != nil { + return nil, fmt.Errorf("list refs: %w", err) + } + defer iter.Close() + + var tips []historyTip + seenHashes := make(map[plumbing.Hash]bool) + err = iter.ForEach(func(ref *plumbing.Reference) error { + if !isHistoryRef(ref) { + return nil + } + + hash := ref.Hash() + if seenHashes[hash] { + return nil + } + seenHashes[hash] = true + tips = append(tips, historyTip{name: ref.Name().String(), hash: hash}) + return nil + }) + if err != nil { + return nil, fmt.Errorf("iterate refs: %w", err) + } + + if len(tips) == 0 { + headRef, headErr := repo.Head() + if headErr != nil { + return nil, fmt.Errorf("find HEAD: %w", headErr) + } + tips = append(tips, historyTip{name: headRef.Name().String(), hash: headRef.Hash()}) + } + + sort.Slice(tips, func(i, j int) bool { + return tips[i].name < tips[j].name + }) + return tips, nil +} + +func isHistoryRef(ref *plumbing.Reference) bool { + if ref.Type() != plumbing.HashReference { + return false + } + name := ref.Name() + if !name.IsBranch() && !name.IsRemote() { + return false + } + return !strings.HasSuffix(name.String(), "/HEAD") +} + +func resolveRevision(repo *git.Repository, revision string) (plumbing.Hash, error) { + hash, err := repo.ResolveRevision(plumbing.Revision(revision)) + if err != nil { + return plumbing.ZeroHash, err //nolint:wrapcheck // callers add flag-specific context + } + if hash == nil { + return plumbing.ZeroHash, fmt.Errorf("revision %q resolved to no commit", revision) + } + return *hash, nil +} + +func reachableCommits(ctx context.Context, repo *git.Repository, from plumbing.Hash) (map[plumbing.Hash]bool, error) { + iter, err := repo.Log(&git.LogOptions{From: from, Order: git.LogOrderCommitterTime}) + if err != nil { + return nil, fmt.Errorf("get log from %s: %w", from, err) + } + defer 
iter.Close() + + commits := make(map[plumbing.Hash]bool) + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while excluding commits: %w", err) + } + commits[commit.Hash] = true + return nil + }) + if err != nil { + return nil, fmt.Errorf("iterate commits reachable from %s: %w", from, err) + } + return commits, nil +} + +func scanTip(ctx context.Context, repo *git.Repository, tip historyTip, excluded, seenCommits map[plumbing.Hash]bool, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) error { + iter, err := repo.Log(&git.LogOptions{From: tip.hash, Order: git.LogOrderCommitterTime}) + if err != nil { + return fmt.Errorf("get log from %s: %w", tip.name, err) + } + defer iter.Close() + + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while scanning commits: %w", err) + } + if excluded[commit.Hash] || seenCommits[commit.Hash] { + return nil + } + seenCommits[commit.Hash] = true + addCheckpointCommit(commit, checkpointIndexes, checkpoints) + return nil + }) + if err != nil { + return fmt.Errorf("iterate commits from %s: %w", tip.name, err) + } + return nil +} + +func addCheckpointCommit(commit *object.Commit, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) { + ids := trailers.ParseAllCheckpoints(commit.Message) + if len(ids) == 0 { + return + } + + discovered := discoveredCommit{ + Hash: commit.Hash, + ShortSHA: shortHash(commit.Hash), + Date: commit.Author.When, + } + + for _, id := range ids { + key := id.String() + index, ok := checkpointIndexes[key] + if !ok { + index = len(*checkpoints) + checkpointIndexes[key] = index + *checkpoints = append(*checkpoints, discoveredCheckpoint{ID: id}) + } + (*checkpoints)[index].Commits = append((*checkpoints)[index].Commits, discovered) + } +} + +func sortDiscoveredCheckpoints(checkpoints []discoveredCheckpoint) { + 
sort.Slice(checkpoints, func(i, j int) bool { + return checkpoints[i].ID.String() < checkpoints[j].ID.String() + }) + for i := range checkpoints { + sort.Slice(checkpoints[i].Commits, func(j, k int) bool { + left := checkpoints[i].Commits[j] + right := checkpoints[i].Commits[k] + if !left.Date.Equal(right.Date) { + return left.Date.After(right.Date) + } + return left.Hash.String() < right.Hash.String() + }) + } +} + +func writeCheckpointList(w io.Writer, checkpoints []discoveredCheckpoint) { + for _, checkpoint := range checkpoints { + fmt.Fprint(w, checkpoint.ID) + for _, commit := range checkpoint.Commits { + fmt.Fprintf(w, " %s", commit.ShortSHA) + } + fmt.Fprintln(w) + } +} + +func shortHash(hash plumbing.Hash) string { + full := hash.String() + if len(full) <= checkpointID.ShortIDLength { + return full + } + return full[:checkpointID.ShortIDLength] +} diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go new file mode 100644 index 000000000..6ca28c581 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/main.go @@ -0,0 +1,162 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "os" + + "github.com/entireio/cli/cmd/entire/cli/paths" + + "github.com/go-git/go-git/v6" + "github.com/spf13/pflag" +) + +type runMode string + +const ( + modePlan runMode = "plan" + modeList runMode = "list" + modeDryRun runMode = "dry-run" + modeApply runMode = "apply" +) + +type options struct { + repoPath string + since string + head string + mode runMode + help bool +} + +func main() { + if err := run(context.Background(), os.Args[1:], os.Stdout); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run(ctx context.Context, args []string, stdout io.Writer) error { + opts, err := parseOptions(args) + if err != nil { + return err + } + if opts.help { + printUsage(stdout) + return nil + } + + _, repo, err := openRepository(ctx, opts.repoPath) + if err != nil { + return err + } + + checkpoints, err := 
discoverCheckpointHistory(ctx, repo, discoveryOptions{ + since: opts.since, + head: opts.head, + }) + if err != nil { + return err + } + + switch opts.mode { + case modeList: + writeCheckpointList(stdout, checkpoints) + return nil + case modePlan, modeDryRun: + fmt.Fprintf(stdout, "Discovered %d checkpoint(s) with Entire-Checkpoint trailers.\n", len(checkpoints)) + fmt.Fprintln(stdout, "V2-to-v1 migration planning will be added in the next implementation step.") + return nil + case modeApply: + return errors.New("--apply migration is not implemented yet") + default: + return fmt.Errorf("unknown mode %q", opts.mode) + } +} + +func parseOptions(args []string) (options, error) { + var opts options + opts.mode = modePlan + + flags := pflag.NewFlagSet("migrate-v2-checkpoints", pflag.ContinueOnError) + flags.SetOutput(io.Discard) + + var listMode bool + var dryRun bool + var apply bool + flags.BoolVarP(&opts.help, "help", "h", false, "show help") + flags.BoolVar(&listMode, "list", false, "print checkpoint IDs and associated commit IDs only") + flags.BoolVar(&dryRun, "dry-run", false, "print the migration plan without writing refs") + flags.BoolVar(&apply, "apply", false, "write migration commits") + flags.StringVar(&opts.repoPath, "repo", "", "local repository path to inspect") + flags.StringVar(&opts.since, "since", "", "commit before the checkpoints to inspect") + flags.StringVar(&opts.head, "head", "", "limit scan to one history tip") + + if err := flags.Parse(args); err != nil { + return opts, fmt.Errorf("parse options: %w", err) + } + + positionals := flags.Args() + if len(positionals) > 1 { + return opts, fmt.Errorf("expected at most one since commit argument, got %d", len(positionals)) + } + if len(positionals) == 1 { + if opts.since != "" { + return opts, errors.New("use either --since or positional since commit, not both") + } + opts.since = positionals[0] + } + + modeCount := 0 + if listMode { + opts.mode = modeList + modeCount++ + } + if dryRun { + opts.mode = 
modeDryRun + modeCount++ + } + if apply { + opts.mode = modeApply + modeCount++ + } + if modeCount > 1 { + return opts, errors.New("use only one of --list, --dry-run, or --apply") + } + + return opts, nil +} + +func printUsage(w io.Writer) { + fmt.Fprint(w, `migrate-v2-checkpoints migrates legacy v2 checkpoint data back to v1. + +Usage: + migrate-v2-checkpoints [OPTIONS] [SINCE_COMMIT] + +Options: + -h, --help Show this help message + --list Print checkpoint IDs and associated commit IDs only + --dry-run Print the migration plan without writing refs + --apply Write migration commits + --repo Local repository path to inspect + --since Commit before the checkpoints to inspect + --head Limit scan to one history tip +`) +} + +func openRepository(ctx context.Context, repoPath string) (string, *git.Repository, error) { + if repoPath == "" { + root, err := paths.WorktreeRoot(ctx) + if err != nil { + return "", nil, fmt.Errorf("find git worktree root: %w", err) + } + repoPath = root + } + + repo, err := git.PlainOpenWithOptions(repoPath, &git.PlainOpenOptions{DetectDotGit: true}) + if err != nil { + return "", nil, fmt.Errorf("open repository %q: %w", repoPath, err) + } + return repoPath, repo, nil +} diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go new file mode 100644 index 000000000..fa331d149 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -0,0 +1,180 @@ +package main + +import ( + "bytes" + "context" + "os" + "path/filepath" + "testing" + + "github.com/entireio/cli/cmd/entire/cli/testutil" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" + "github.com/stretchr/testify/require" +) + +const ( + oldCheckpointID = "000000000001" + mainCheckpointID = "111111111111" + featureCheckpointID = "222222222222" + featureCheckpointID2 = "333333333333" + testSinceRevision = "abc123" + testHeadRevision = "HEAD" + testRepoFlag = "--repo" + testSinceFlag = "--since" + testHeadFlag = "--head" + 
testListFlag = "--list" + testDryRunFlag = "--dry-run" + testApplyFlag = "--apply" + testRepoPath = "/tmp/repo" + testBaseFilename = "base.txt" + testMainFilename = "main.txt" + testFeatureFilename = "feature.txt" + testFeatureBranchName = "feature" +) + +func TestParseOptions(t *testing.T) { + t.Parallel() + + opts, err := parseOptions([]string{ + testRepoFlag, testRepoPath, + testSinceFlag, testSinceRevision, + testHeadFlag, testHeadRevision, + testListFlag, + }) + require.NoError(t, err) + require.Equal(t, testRepoPath, opts.repoPath) + require.Equal(t, testSinceRevision, opts.since) + require.Equal(t, testHeadRevision, opts.head) + require.Equal(t, modeList, opts.mode) + + opts, err = parseOptions([]string{testDryRunFlag, testSinceRevision}) + require.NoError(t, err) + require.Equal(t, testSinceRevision, opts.since) + require.Equal(t, modeDryRun, opts.mode) + + _, err = parseOptions([]string{testSinceFlag, testSinceRevision, "def456"}) + require.ErrorContains(t, err, "use either --since or positional since commit") + + _, err = parseOptions([]string{testListFlag, testApplyFlag}) + require.ErrorContains(t, err, "use only one") +} + +func TestDiscoverCheckpointHistory_AllRefsNewerThanSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) + require.Equal(t, []string{shortHash(fixture.mainHash)}, discoveredCommitShortSHAs(t, checkpoints, mainCheckpointID)) + require.Equal(t, []string{shortHash(fixture.featureHash)}, discoveredCommitShortSHAs(t, checkpoints, featureCheckpointID)) + require.Equal(t, []string{shortHash(fixture.featureHash)}, discoveredCommitShortSHAs(t, checkpoints, featureCheckpointID2)) +} + +func 
TestDiscoverCheckpointHistory_HeadLimitsScan(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + head: fixture.mainHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID}, discoveredCheckpointIDs(checkpoints)) +} + +func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + subdir := filepath.Join(fixture.dir, "nested") + require.NoError(t, os.MkdirAll(subdir, 0o755)) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + testRepoFlag, subdir, + testSinceFlag, fixture.baseHash.String(), + testHeadFlag, fixture.mainHash.String(), + testListFlag, + }, &stdout) + require.NoError(t, err) + + require.Equal(t, mainCheckpointID+" "+shortHash(fixture.mainHash)+"\n", stdout.String()) +} + +type migrationHistoryFixture struct { + dir string + repo *git.Repository + baseHash plumbing.Hash + mainHash plumbing.Hash + featureHash plumbing.Hash +} + +func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + + baseHash := commitMigrationTestFile(t, dir, testBaseFilename, "base\n", + "base checkpoint\n\nEntire-Checkpoint: "+oldCheckpointID) + mainHash := commitMigrationTestFile(t, dir, testMainFilename, "main\n", + "main checkpoint\n\nEntire-Checkpoint: "+mainCheckpointID) + + testutil.GitCheckoutNewBranch(t, dir, testFeatureBranchName) + featureHash := commitMigrationTestFile(t, dir, testFeatureFilename, "feature\n", + "feature checkpoint\n\nEntire-Checkpoint: "+featureCheckpointID+"\nEntire-Checkpoint: "+featureCheckpointID2) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + + return migrationHistoryFixture{ + dir: dir, + repo: repo, + baseHash: baseHash, + mainHash: mainHash, + featureHash: featureHash, + 
} +} + +func commitMigrationTestFile(t *testing.T, dir, name, content, message string) plumbing.Hash { + t.Helper() + + testutil.WriteFile(t, dir, name, content) + testutil.GitAdd(t, dir, name) + testutil.GitCommit(t, dir, message) + return plumbing.NewHash(testutil.GetHeadHash(t, dir)) +} + +func discoveredCheckpointIDs(checkpoints []discoveredCheckpoint) []string { + ids := make([]string, len(checkpoints)) + for i, checkpoint := range checkpoints { + ids[i] = checkpoint.ID.String() + } + return ids +} + +func discoveredCommitShortSHAs(t *testing.T, checkpoints []discoveredCheckpoint, checkpointID string) []string { + t.Helper() + + for _, checkpoint := range checkpoints { + if checkpoint.ID.String() != checkpointID { + continue + } + commits := make([]string, len(checkpoint.Commits)) + for i, commit := range checkpoint.Commits { + commits[i] = commit.ShortSHA + } + return commits + } + t.Fatalf("checkpoint %s not found in %#v", checkpointID, checkpoints) + return nil +} From 399c9093bfa4bb94c941ba7890240dd5db671235 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 15:38:37 -0700 Subject: [PATCH 10/35] Migrate v2 checkpoint data through Go stores Entire-Checkpoint: ae815b84d867 --- cmd/migrate-v2-checkpoints/main.go | 24 +++- cmd/migrate-v2-checkpoints/main_test.go | 82 ++++++++++++ cmd/migrate-v2-checkpoints/migration.go | 167 ++++++++++++++++++++++++ 3 files changed, 268 insertions(+), 5 deletions(-) create mode 100644 cmd/migrate-v2-checkpoints/migration.go diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index 6ca28c581..d785146a4 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -8,6 +8,7 @@ import ( "os" "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/settings" "github.com/go-git/go-git/v6" "github.com/spf13/pflag" @@ -47,10 +48,11 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { return nil } - _, 
repo, err := openRepository(ctx, opts.repoPath) + repoRoot, repo, err := openRepository(ctx, opts.repoPath) if err != nil { return err } + ctx = settings.WithWorktreeRoot(ctx, repoRoot) checkpoints, err := discoverCheckpointHistory(ctx, repo, discoveryOptions{ since: opts.since, @@ -65,11 +67,19 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { writeCheckpointList(stdout, checkpoints) return nil case modePlan, modeDryRun: - fmt.Fprintf(stdout, "Discovered %d checkpoint(s) with Entire-Checkpoint trailers.\n", len(checkpoints)) - fmt.Fprintln(stdout, "V2-to-v1 migration planning will be added in the next implementation step.") + report, err := migrateDiscoveredCheckpoints(ctx, repo, checkpoints, migrationOptions{apply: false}) + if err != nil { + return err + } + writeMigrationReport(stdout, report, false) return nil case modeApply: - return errors.New("--apply migration is not implemented yet") + report, err := migrateDiscoveredCheckpoints(ctx, repo, checkpoints, migrationOptions{apply: true}) + if err != nil { + return err + } + writeMigrationReport(stdout, report, true) + return nil default: return fmt.Errorf("unknown mode %q", opts.mode) } @@ -158,5 +168,9 @@ func openRepository(ctx context.Context, repoPath string) (string, *git.Reposito if err != nil { return "", nil, fmt.Errorf("open repository %q: %w", repoPath, err) } - return repoPath, repo, nil + repoRoot := repoPath + if worktree, worktreeErr := repo.Worktree(); worktreeErr == nil { + repoRoot = worktree.Filesystem().Root() + } + return repoRoot, repo, nil } diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index fa331d149..bccb642ea 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -5,9 +5,17 @@ import ( "context" "os" "path/filepath" + "strings" "testing" + "time" + "github.com/entireio/cli/cmd/entire/cli/agent" + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + 
"github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/session" "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" @@ -110,6 +118,80 @@ func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { require.Equal(t, mainCheckpointID+" "+shortHash(fixture.mainHash)+"\n", stdout.String()) } +func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + createdAt := time.Date(2024, 5, 6, 7, 8, 9, 0, time.UTC) + transcript := []byte("{\"type\":\"assistant\",\"message\":\"migrated\"}\n") + + err := checkpoint.NewV2GitStore(fixture.repo).WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-to-migrate", + CreatedAt: createdAt, + Strategy: "manual-commit", + Branch: "main", + Transcript: redact.AlreadyRedacted(transcript), + Prompts: []string{"first prompt", "second prompt"}, + FilesTouched: []string{"main.go"}, + CheckpointsCount: 2, + AuthorName: "Test", + AuthorEmail: "test@example.com", + Agent: agent.AgentTypeClaudeCode, + Model: "claude-test-model", + TurnID: "turn-1", + CheckpointTranscriptStart: 42, + CompactTranscriptStart: 9, + Kind: string(session.KindAgentReview), + ReviewSkills: []string{"review-skill"}, + ReviewPrompt: "review this", + HasReview: true, + }) + require.NoError(t, err) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + testRepoFlag, fixture.dir, + testSinceFlag, fixture.baseHash.String(), + testHeadFlag, fixture.mainHash.String(), + testApplyFlag, + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + 
summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) + require.Equal(t, 2, summary.CheckpointsCount) + require.Equal(t, []string{"main.go"}, summary.FilesTouched) + require.True(t, summary.HasReview) + + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, transcript, content.Transcript) + require.Equal(t, strings.Join([]string{"first prompt", "second prompt"}, checkpoint.PromptSeparator), content.Prompts) + require.Equal(t, createdAt, content.Metadata.CreatedAt) + require.Equal(t, "manual-commit", content.Metadata.Strategy) + require.Equal(t, "main", content.Metadata.Branch) + require.Equal(t, agent.AgentTypeClaudeCode, content.Metadata.Agent) + require.Equal(t, "claude-test-model", content.Metadata.Model) + require.Equal(t, "turn-1", content.Metadata.TurnID) + require.Equal(t, 0, content.Metadata.CheckpointTranscriptStart) + require.Equal(t, string(session.KindAgentReview), content.Metadata.Kind) + require.Equal(t, []string{"review-skill"}, content.Metadata.ReviewSkills) + require.Equal(t, "review this", content.Metadata.ReviewPrompt) + + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.True(t, commit.Author.When.Equal(createdAt), "author time = %s, want %s", commit.Author.When, createdAt) +} + type migrationHistoryFixture struct { dir string repo *git.Repository diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go new file mode 100644 index 000000000..63c45b038 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -0,0 +1,167 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + 
"github.com/entireio/cli/cmd/entire/cli/session" + "github.com/entireio/cli/redact" + + "github.com/go-git/go-git/v6" +) + +type migrationOptions struct { + apply bool +} + +type migrationReport struct { + DiscoveredChecks int + ExistingV1Checkpoints int + MissingV2Metadata int + MissingRawTranscripts int + PlannedCheckpoints int + PlannedSessions int + MigratedCheckpoints int + MigratedSessions int +} + +type checkpointMigrator struct { + v1Store *checkpoint.GitStore + v2Store *checkpoint.V2GitStore + opts migrationOptions + authorName string + authorEmail string + report *migrationReport +} + +func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, discovered []discoveredCheckpoint, opts migrationOptions) (migrationReport, error) { + authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) + report := migrationReport{DiscoveredChecks: len(discovered)} + migrator := checkpointMigrator{ + v1Store: checkpoint.NewGitStore(repo), + v2Store: checkpoint.NewV2GitStore(repo), + opts: opts, + authorName: authorName, + authorEmail: authorEmail, + report: &report, + } + + for _, discoveredCheckpoint := range discovered { + migratedSessions, err := migrator.migrateCheckpoint(ctx, discoveredCheckpoint) + if err != nil { + return report, err + } + if migratedSessions > 0 { + report.PlannedCheckpoints++ + if opts.apply { + report.MigratedCheckpoints++ + } + } + } + return report, nil +} + +func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered discoveredCheckpoint) (int, error) { + existing, err := m.v1Store.ReadCommitted(ctx, discovered.ID) + if err != nil { + return 0, fmt.Errorf("read v1 checkpoint %s: %w", discovered.ID, err) + } + if existing != nil { + m.report.ExistingV1Checkpoints++ + return 0, nil + } + + summary, err := m.v2Store.ReadCommitted(ctx, discovered.ID) + if err != nil { + return 0, fmt.Errorf("read v2 checkpoint %s: %w", discovered.ID, err) + } + if summary == nil || len(summary.Sessions) == 0 { + 
m.report.MissingV2Metadata++ + return 0, nil + } + + migratedSessions := 0 + for sessionIndex := range summary.Sessions { + content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) + if err != nil { + if errors.Is(err, checkpoint.ErrNoTranscript) { + m.report.MissingRawTranscripts++ + continue + } + return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + } + if !hasRequiredV2Metadata(content) { + m.report.MissingV2Metadata++ + continue + } + + m.report.PlannedSessions++ + if m.opts.apply { + writeOpts := writeOptionsFromV2Content(content, summary, m.authorName, m.authorEmail) + if err := m.v1Store.WriteCommitted(ctx, writeOpts); err != nil { + return migratedSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + } + m.report.MigratedSessions++ + } + migratedSessions++ + } + return migratedSessions, nil +} + +func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { + return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" +} + +func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *checkpoint.CheckpointSummary, authorName, authorEmail string) checkpoint.WriteCommittedOptions { + meta := content.Metadata + return checkpoint.WriteCommittedOptions{ + CheckpointID: meta.CheckpointID, + SessionID: meta.SessionID, + CreatedAt: meta.CreatedAt, + CommitTime: meta.CreatedAt, + Strategy: meta.Strategy, + Branch: meta.Branch, + Transcript: redact.AlreadyRedacted(content.Transcript), + Prompts: checkpoint.SplitPromptContent(content.Prompts), + FilesTouched: meta.FilesTouched, + CheckpointsCount: meta.CheckpointsCount, + AuthorName: authorName, + AuthorEmail: authorEmail, + Agent: meta.Agent, + Model: meta.Model, + TurnID: meta.TurnID, + TranscriptIdentifierAtStart: meta.TranscriptIdentifierAtStart, + CheckpointTranscriptStart: 0, + TokenUsage: meta.TokenUsage, + SessionMetrics: meta.SessionMetrics, 
+ InitialAttribution: meta.InitialAttribution, + PromptAttributionsJSON: meta.PromptAttributions, + CombinedAttribution: summary.CombinedAttribution, + Summary: meta.Summary, + Kind: meta.Kind, + ReviewSkills: meta.ReviewSkills, + ReviewPrompt: meta.ReviewPrompt, + HasReview: summary.HasReview || session.Kind(meta.Kind).IsReview(), + } +} + +func writeMigrationReport(w io.Writer, report migrationReport, applied bool) { + if applied { + fmt.Fprintln(w, "Migration result:") + } else { + fmt.Fprintln(w, "Migration plan:") + } + fmt.Fprintf(w, " discovered checkpoint trailers: %d\n", report.DiscoveredChecks) + fmt.Fprintf(w, " already present in v1: %d\n", report.ExistingV1Checkpoints) + fmt.Fprintf(w, " missing v2 metadata: %d\n", report.MissingV2Metadata) + fmt.Fprintf(w, " missing raw transcripts: %d\n", report.MissingRawTranscripts) + fmt.Fprintf(w, " checkpoints with raw transcripts: %d\n", report.PlannedCheckpoints) + fmt.Fprintf(w, " sessions with raw transcripts: %d\n", report.PlannedSessions) + if applied { + fmt.Fprintf(w, " migrated checkpoints: %d\n", report.MigratedCheckpoints) + fmt.Fprintf(w, " migrated sessions: %d\n", report.MigratedSessions) + } +} From 3a20ce9d76d6e9f8b50fba41d9e51fd02292fa6d Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 15:44:58 -0700 Subject: [PATCH 11/35] Expand v2 checkpoint migration tests Entire-Checkpoint: f5e4e2ae22f3 --- cmd/migrate-v2-checkpoints/main_test.go | 133 ++++++++++++++++++++++-- 1 file changed, 127 insertions(+), 6 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index bccb642ea..baa540e95 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -40,6 +40,10 @@ const ( testMainFilename = "main.txt" testFeatureFilename = "feature.txt" testFeatureBranchName = "feature" + testStrategy = "manual-commit" + testAuthorName = "Test" + testAuthorEmail = "test@example.com" + testBranchName = 
"main" ) func TestParseOptions(t *testing.T) { @@ -130,14 +134,14 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { CheckpointID: cpID, SessionID: "session-to-migrate", CreatedAt: createdAt, - Strategy: "manual-commit", - Branch: "main", + Strategy: testStrategy, + Branch: testBranchName, Transcript: redact.AlreadyRedacted(transcript), Prompts: []string{"first prompt", "second prompt"}, FilesTouched: []string{"main.go"}, CheckpointsCount: 2, - AuthorName: "Test", - AuthorEmail: "test@example.com", + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, Agent: agent.AgentTypeClaudeCode, Model: "claude-test-model", TurnID: "turn-1", @@ -175,8 +179,8 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.Equal(t, transcript, content.Transcript) require.Equal(t, strings.Join([]string{"first prompt", "second prompt"}, checkpoint.PromptSeparator), content.Prompts) require.Equal(t, createdAt, content.Metadata.CreatedAt) - require.Equal(t, "manual-commit", content.Metadata.Strategy) - require.Equal(t, "main", content.Metadata.Branch) + require.Equal(t, testStrategy, content.Metadata.Strategy) + require.Equal(t, testBranchName, content.Metadata.Branch) require.Equal(t, agent.AgentTypeClaudeCode, content.Metadata.Agent) require.Equal(t, "claude-test-model", content.Metadata.Model) require.Equal(t, "turn-1", content.Metadata.TurnID) @@ -192,6 +196,85 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.True(t, commit.Author.When.Equal(createdAt), "author time = %s, want %s", commit.Author.When, createdAt) } +func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-dry-run", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"dry run\"}\n")), + }) + + stdout := runMigrationCommand(t, fixture, 
fixture.mainHash, testDryRunFlag) + require.Contains(t, stdout, "Migration plan:") + require.Contains(t, stdout, "checkpoints with raw transcripts: 1") + require.Contains(t, stdout, "sessions with raw transcripts: 1") + + summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Nil(t, summary) +} + +func TestRunApplySkipsExistingV1Checkpoint(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-v2", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2\"}\n")), + }) + + existingTranscript := []byte("{\"message\":\"already v1\"}\n") + err := checkpoint.NewGitStore(fixture.repo).WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-v1", + CreatedAt: time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC), + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted(existingTranscript), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + }) + require.NoError(t, err) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + require.Contains(t, stdout, "already present in v1: 1") + require.Contains(t, stdout, "migrated checkpoints: 0") + require.Contains(t, stdout, "migrated sessions: 0") + + content, err := checkpoint.NewGitStore(fixture.repo).ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, existingTranscript, content.Transcript) + require.Equal(t, "session-existing-v1", content.Metadata.SessionID) +} + +func TestRunDryRunReportsMissingV2MetadataAndRawTranscripts(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: 
id.MustCheckpointID(featureCheckpointID), + SessionID: "session-missing-raw", + }) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + testRepoFlag, fixture.dir, + testSinceFlag, fixture.baseHash.String(), + testDryRunFlag, + }, &stdout) + require.NoError(t, err) + + require.Contains(t, stdout.String(), "missing v2 metadata: 2") + require.Contains(t, stdout.String(), "missing raw transcripts: 1") + require.Contains(t, stdout.String(), "checkpoints with raw transcripts: 0") + require.Contains(t, stdout.String(), "sessions with raw transcripts: 0") +} + type migrationHistoryFixture struct { dir string repo *git.Repository @@ -236,6 +319,44 @@ func commitMigrationTestFile(t *testing.T, dir, name, content, message string) p return plumbing.NewHash(testutil.GetHeadHash(t, dir)) } +func writeTestV2Checkpoint(t *testing.T, repo *git.Repository, opts checkpoint.WriteCommittedOptions) { + t.Helper() + + if opts.CreatedAt.IsZero() { + opts.CreatedAt = time.Date(2024, 5, 6, 7, 8, 9, 0, time.UTC) + } + if opts.Strategy == "" { + opts.Strategy = testStrategy + } + if opts.Branch == "" { + opts.Branch = testBranchName + } + if opts.AuthorName == "" { + opts.AuthorName = testAuthorName + } + if opts.AuthorEmail == "" { + opts.AuthorEmail = testAuthorEmail + } + + err := checkpoint.NewV2GitStore(repo).WriteCommitted(context.Background(), opts) + require.NoError(t, err) +} + +func runMigrationCommand(t *testing.T, fixture migrationHistoryFixture, head plumbing.Hash, mode string) string { + t.Helper() + + args := []string{ + testRepoFlag, fixture.dir, + testSinceFlag, fixture.baseHash.String(), + testHeadFlag, head.String(), + mode, + } + var stdout bytes.Buffer + err := run(context.Background(), args, &stdout) + require.NoError(t, err) + return stdout.String() +} + func discoveredCheckpointIDs(checkpoints []discoveredCheckpoint) []string { ids := make([]string, len(checkpoints)) for i, checkpoint := range checkpoints { From 
81cddb92e8a1e6d9d39b7d1bde25b2c9241a1b15 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 16:06:41 -0700 Subject: [PATCH 12/35] Delegate temporary commit creation Entire-Checkpoint: 0fc8bb5eb2ef --- cmd/entire/cli/checkpoint/temporary.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/entire/cli/checkpoint/temporary.go b/cmd/entire/cli/checkpoint/temporary.go index 5800c645f..bbe5c7624 100644 --- a/cmd/entire/cli/checkpoint/temporary.go +++ b/cmd/entire/cli/checkpoint/temporary.go @@ -846,7 +846,7 @@ func (s *GitStore) buildTreeWithChanges( // createCommit creates a commit object. func (s *GitStore) createCommit(ctx context.Context, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string) (plumbing.Hash, error) { - return s.createCommitAt(ctx, treeHash, parentHash, message, authorName, authorEmail, time.Now()) + return CreateCommit(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail) } func (s *GitStore) createCommitAt(ctx context.Context, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) (plumbing.Hash, error) { From e68ee795e372d4dfb7a0d8947ed5d5b22bb7cbff Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 16:27:41 -0700 Subject: [PATCH 13/35] Limit v2 migration discovery to since histories Entire-Checkpoint: ec521d6d285c --- cmd/migrate-v2-checkpoints/history.go | 89 ++++++++++++++++++++++--- cmd/migrate-v2-checkpoints/main_test.go | 48 +++++++++++++ 2 files changed, 128 insertions(+), 9 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go index 81871e5d1..82be73e7b 100644 --- a/cmd/migrate-v2-checkpoints/history.go +++ b/cmd/migrate-v2-checkpoints/history.go @@ -37,13 +37,19 @@ type historyTip struct { hash plumbing.Hash } +type discoveryScope struct { + excluded map[plumbing.Hash]bool + sinceHash plumbing.Hash + hasSince bool +} + func discoverCheckpointHistory(ctx 
context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, error) { - excluded, err := excludedCommits(ctx, repo, opts.since) + scope, err := newDiscoveryScope(ctx, repo, opts.since) if err != nil { return nil, err } - tips, err := historyTips(repo, opts.head) + tips, err := historyTips(ctx, repo, opts.head, scope) if err != nil { return nil, err } @@ -53,7 +59,7 @@ func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts d checkpoints := make([]discoveredCheckpoint, 0) for _, tip := range tips { - if err := scanTip(ctx, repo, tip, excluded, seenCommits, checkpointIndexes, &checkpoints); err != nil { + if err := scanTip(ctx, repo, tip, scope.excluded, seenCommits, checkpointIndexes, &checkpoints); err != nil { return nil, err } } @@ -62,24 +68,35 @@ func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts d return checkpoints, nil } -func excludedCommits(ctx context.Context, repo *git.Repository, since string) (map[plumbing.Hash]bool, error) { +func newDiscoveryScope(ctx context.Context, repo *git.Repository, since string) (discoveryScope, error) { if since == "" { - return make(map[plumbing.Hash]bool), nil + return discoveryScope{excluded: make(map[plumbing.Hash]bool)}, nil } sinceHash, err := resolveRevision(repo, since) if err != nil { - return nil, fmt.Errorf("resolve --since %q: %w", since, err) + return discoveryScope{}, fmt.Errorf("resolve --since %q: %w", since, err) } - return reachableCommits(ctx, repo, sinceHash) + excluded, err := reachableCommits(ctx, repo, sinceHash) + if err != nil { + return discoveryScope{}, err + } + return discoveryScope{ + excluded: excluded, + sinceHash: sinceHash, + hasSince: true, + }, nil } -func historyTips(repo *git.Repository, head string) ([]historyTip, error) { +func historyTips(ctx context.Context, repo *git.Repository, head string, scope discoveryScope) ([]historyTip, error) { if head != "" { hash, err := resolveRevision(repo, head) if err != nil 
{ return nil, fmt.Errorf("resolve --head %q: %w", head, err) } + if err := requireTipContainsSince(ctx, repo, hash, head, scope); err != nil { + return nil, err + } return []historyTip{{name: head, hash: hash}}, nil } @@ -100,6 +117,13 @@ func historyTips(repo *git.Repository, head string) ([]historyTip, error) { if seenHashes[hash] { return nil } + include, includeErr := tipContainsSince(ctx, repo, hash, scope) + if includeErr != nil { + return fmt.Errorf("check whether %s contains --since: %w", ref.Name(), includeErr) + } + if !include { + return nil + } seenHashes[hash] = true tips = append(tips, historyTip{name: ref.Name().String(), hash: hash}) return nil @@ -113,7 +137,13 @@ func historyTips(repo *git.Repository, head string) ([]historyTip, error) { if headErr != nil { return nil, fmt.Errorf("find HEAD: %w", headErr) } - tips = append(tips, historyTip{name: headRef.Name().String(), hash: headRef.Hash()}) + include, includeErr := tipContainsSince(ctx, repo, headRef.Hash(), scope) + if includeErr != nil { + return nil, fmt.Errorf("check whether HEAD contains --since: %w", includeErr) + } + if include { + tips = append(tips, historyTip{name: headRef.Name().String(), hash: headRef.Hash()}) + } } sort.Slice(tips, func(i, j int) bool { @@ -122,6 +152,24 @@ func historyTips(repo *git.Repository, head string) ([]historyTip, error) { return tips, nil } +func requireTipContainsSince(ctx context.Context, repo *git.Repository, tipHash plumbing.Hash, tipName string, scope discoveryScope) error { + contains, err := tipContainsSince(ctx, repo, tipHash, scope) + if err != nil { + return fmt.Errorf("check whether --head %q contains --since: %w", tipName, err) + } + if !contains { + return fmt.Errorf("%s is not an ancestor of --head %q", scope.sinceHash, tipName) + } + return nil +} + +func tipContainsSince(ctx context.Context, repo *git.Repository, tipHash plumbing.Hash, scope discoveryScope) (bool, error) { + if !scope.hasSince { + return true, nil + } + return 
commitReachableFrom(ctx, repo, tipHash, scope.sinceHash) +} + func isHistoryRef(ref *plumbing.Reference) bool { if ref.Type() != plumbing.HashReference { return false @@ -165,6 +213,29 @@ func reachableCommits(ctx context.Context, repo *git.Repository, from plumbing.H return commits, nil } +func commitReachableFrom(ctx context.Context, repo *git.Repository, from, target plumbing.Hash) (bool, error) { + iter, err := repo.Log(&git.LogOptions{From: from, Order: git.LogOrderCommitterTime}) + if err != nil { + return false, fmt.Errorf("get log from %s: %w", from, err) + } + defer iter.Close() + + found := false + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return fmt.Errorf("context canceled while checking ancestry: %w", err) + } + if commit.Hash == target { + found = true + } + return nil + }) + if err != nil { + return false, fmt.Errorf("iterate commits from %s: %w", from, err) + } + return found, nil +} + func scanTip(ctx context.Context, repo *git.Repository, tip historyTip, excluded, seenCommits map[plumbing.Hash]bool, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) error { iter, err := repo.Log(&git.LogOptions{From: tip.hash, Order: git.LogOrderCommitterTime}) if err != nil { diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index baa540e95..39cfdb17c 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "os" + "os/exec" "path/filepath" "strings" "testing" @@ -27,6 +28,7 @@ const ( mainCheckpointID = "111111111111" featureCheckpointID = "222222222222" featureCheckpointID2 = "333333333333" + unrelatedCheckpointID = "444444444444" testSinceRevision = "abc123" testHeadRevision = "HEAD" testRepoFlag = "--repo" @@ -103,6 +105,33 @@ func TestDiscoverCheckpointHistory_HeadLimitsScan(t *testing.T) { require.Equal(t, []string{mainCheckpointID}, 
discoveredCheckpointIDs(checkpoints)) } +func TestDiscoverCheckpointHistory_SkipsRefsThatDoNotContainSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + commitUnrelatedMigrationTestFile(t, fixture.dir) + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) +} + +func TestDiscoverCheckpointHistory_HeadMustContainSince(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + unrelatedHash := commitUnrelatedMigrationTestFile(t, fixture.dir) + + _, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{ + since: fixture.baseHash.String(), + head: unrelatedHash.String(), + }) + require.ErrorContains(t, err, "is not an ancestor of --head") +} + func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { t.Parallel() @@ -319,6 +348,25 @@ func commitMigrationTestFile(t *testing.T, dir, name, content, message string) p return plumbing.NewHash(testutil.GetHeadHash(t, dir)) } +func commitUnrelatedMigrationTestFile(t *testing.T, dir string) plumbing.Hash { + t.Helper() + + runMigrationGit(t, dir, "checkout", "--orphan", "unrelated") + runMigrationGit(t, dir, "rm", "-rf", ".") + return commitMigrationTestFile(t, dir, "unrelated.txt", "unrelated\n", + "unrelated checkpoint\n\nEntire-Checkpoint: "+unrelatedCheckpointID) +} + +func runMigrationGit(t *testing.T, dir string, args ...string) { + t.Helper() + + cmd := exec.CommandContext(context.Background(), "git", args...) 
+ cmd.Dir = dir + cmd.Env = testutil.GitIsolatedEnv() + output, err := cmd.CombinedOutput() + require.NoError(t, err, "git %s failed: %s", strings.Join(args, " "), output) +} + func writeTestV2Checkpoint(t *testing.T, repo *git.Repository, opts checkpoint.WriteCommittedOptions) { t.Helper() From f49344e847aa72366f5c20c33db7932c06b21962 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 17:06:48 -0700 Subject: [PATCH 14/35] Scope migrated review status to restored sessions Entire-Checkpoint: 592345236abe --- cmd/migrate-v2-checkpoints/main_test.go | 37 +++++++++++++++++++++++-- cmd/migrate-v2-checkpoints/migration.go | 2 +- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 39cfdb17c..39cc65fb3 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -46,6 +46,7 @@ const ( testAuthorName = "Test" testAuthorEmail = "test@example.com" testBranchName = "main" + testReviewSkill = "review-skill" ) func TestParseOptions(t *testing.T) { @@ -177,7 +178,7 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { CheckpointTranscriptStart: 42, CompactTranscriptStart: 9, Kind: string(session.KindAgentReview), - ReviewSkills: []string{"review-skill"}, + ReviewSkills: []string{testReviewSkill}, ReviewPrompt: "review this", HasReview: true, }) @@ -215,7 +216,7 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.Equal(t, "turn-1", content.Metadata.TurnID) require.Equal(t, 0, content.Metadata.CheckpointTranscriptStart) require.Equal(t, string(session.KindAgentReview), content.Metadata.Kind) - require.Equal(t, []string{"review-skill"}, content.Metadata.ReviewSkills) + require.Equal(t, []string{testReviewSkill}, content.Metadata.ReviewSkills) require.Equal(t, "review this", content.Metadata.ReviewPrompt) ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) 
@@ -281,6 +282,38 @@ func TestRunApplySkipsExistingV1Checkpoint(t *testing.T) { require.Equal(t, "session-existing-v1", content.Metadata.SessionID) } +func TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "normal-session", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"normal\"}\n")), + }) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "review-session-without-raw-transcript", + Kind: string(session.KindAgentReview), + ReviewSkills: []string{testReviewSkill}, + ReviewPrompt: "review this", + HasReview: true, + CompactTranscript: []byte("{\"message\":\"compact review only\"}\n"), + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + require.Contains(t, stdout, "missing raw transcripts: 1") + require.Contains(t, stdout, "migrated checkpoints: 1") + require.Contains(t, stdout, "migrated sessions: 1") + + summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.False(t, summary.HasReview) + require.Len(t, summary.Sessions, 1) +} + func TestRunDryRunReportsMissingV2MetadataAndRawTranscripts(t *testing.T) { t.Parallel() diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 63c45b038..82c60ec20 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -144,7 +144,7 @@ func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *chec Kind: meta.Kind, ReviewSkills: meta.ReviewSkills, ReviewPrompt: meta.ReviewPrompt, - HasReview: summary.HasReview || session.Kind(meta.Kind).IsReview(), + HasReview: session.Kind(meta.Kind).IsReview(), } 
} From 6641537d1b09716a3f94d38141d01088510452e6 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Tue, 26 May 2026 17:41:03 -0700 Subject: [PATCH 15/35] Address v2 migration feedback Entire-Checkpoint: 962df06c99d3 --- .../checkpoint/committed_commit_time_test.go | 35 +- cmd/entire/cli/checkpoint/v2_committed.go | 50 +- cmd/entire/cli/checkpoint/v2_read.go | 2 +- cmd/entire/cli/checkpoint/v2_store.go | 7 +- cmd/entire/cli/checkpoint/v2_store_test.go | 79 +- cmd/migrate-v2-checkpoints/history.go | 73 +- cmd/migrate-v2-checkpoints/main_test.go | 136 ++- cmd/migrate-v2-checkpoints/migration.go | 126 +- scripts/migrate-v2-checkpoints-to-v1.sh | 1058 ----------------- 9 files changed, 436 insertions(+), 1130 deletions(-) delete mode 100755 scripts/migrate-v2-checkpoints-to-v1.sh diff --git a/cmd/entire/cli/checkpoint/committed_commit_time_test.go b/cmd/entire/cli/checkpoint/committed_commit_time_test.go index 91c5546c2..ddf3eac4e 100644 --- a/cmd/entire/cli/checkpoint/committed_commit_time_test.go +++ b/cmd/entire/cli/checkpoint/committed_commit_time_test.go @@ -71,6 +71,33 @@ func TestWriteCommitted_ZeroCommitTimeUsesCurrentTime(t *testing.T) { require.True(t, commit.Committer.When.Equal(commit.Author.When), "committer time = %s, want author time %s", commit.Committer.When, commit.Author.When) } +func TestV2WriteCommitted_CommitTime(t *testing.T) { + t.Parallel() + + repo, _ := setupCommittedCommitTimeRepo(t) + store := NewV2GitStore(repo) + commitTime := time.Date(2024, 4, 5, 6, 7, 8, 0, time.UTC) + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: id.MustCheckpointID("c3d4e5f6a1b2"), + SessionID: "session-v2-commit-time", + CreatedAt: time.Date(2024, 4, 1, 2, 3, 4, 0, time.UTC), + CommitTime: commitTime, + Strategy: commitTimeStrategy, + Transcript: redact.AlreadyRedacted([]byte("transcript line\n")), + AuthorName: "Migration", + AuthorEmail: "migration@example.com", + }) + require.NoError(t, err) + + mainCommit := 
refHeadCommit(t, repo, plumbing.ReferenceName(paths.V2MainRefName)) + fullCommit := refHeadCommit(t, repo, plumbing.ReferenceName(paths.V2FullCurrentRefName)) + for _, commit := range []*object.Commit{mainCommit, fullCommit} { + require.True(t, commit.Author.When.Equal(commitTime), "author time = %s, want %s", commit.Author.When, commitTime) + require.True(t, commit.Committer.When.Equal(commitTime), "committer time = %s, want %s", commit.Committer.When, commitTime) + } +} + func setupCommittedCommitTimeRepo(t *testing.T) (*git.Repository, *GitStore) { t.Helper() @@ -90,7 +117,13 @@ func setupCommittedCommitTimeRepo(t *testing.T) (*git.Repository, *GitStore) { func metadataHeadCommit(t *testing.T, repo *git.Repository) *object.Commit { t.Helper() - ref, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + return refHeadCommit(t, repo, plumbing.NewBranchReferenceName(paths.MetadataBranchName)) +} + +func refHeadCommit(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) *object.Commit { + t.Helper() + + ref, err := repo.Reference(refName, true) require.NoError(t, err) commit, err := repo.CommitObject(ref.Hash()) diff --git a/cmd/entire/cli/checkpoint/v2_committed.go b/cmd/entire/cli/checkpoint/v2_committed.go index 1295c2120..a122e42fa 100644 --- a/cmd/entire/cli/checkpoint/v2_committed.go +++ b/cmd/entire/cli/checkpoint/v2_committed.go @@ -10,6 +10,7 @@ import ( "os" "strconv" "strings" + "time" "github.com/entireio/cli/cmd/entire/cli/agent" "github.com/entireio/cli/cmd/entire/cli/agent/types" @@ -141,7 +142,14 @@ func (s *V2GitStore) WriteCommittedMainBatch(ctx context.Context, batch []WriteC authorEmail = fallbackEmail } } - return s.updateRef(ctx, refName, rootTreeHash, parentHash, commitMsg, authorName, authorEmail) + return s.updateRefAt(ctx, refName, rootTreeHash, parentHash, commitMsg, authorName, authorEmail, commitTimeForWrite(last)) +} + +func commitTimeForWrite(opts WriteCommittedOptions) time.Time { + if 
opts.CommitTime.IsZero() { + return time.Now() + } + return opts.CommitTime } func (s *V2GitStore) existingMainCheckpointIDs(ctx context.Context, rootTreeHash plumbing.Hash) (map[id.CheckpointID]struct{}, error) { @@ -353,7 +361,7 @@ func (s *V2GitStore) UpdateCommitted(ctx context.Context, opts UpdateCommittedOp } // fullSessionArtifacts describes where a checkpoint session's raw transcript -// artifacts live on the v2 /full/current ref. +// artifacts live on a v2 /full ref. type fullSessionArtifacts struct { RefName plumbing.ReferenceName Found bool @@ -362,7 +370,7 @@ type fullSessionArtifacts struct { } // HasFullSessionArtifacts reports whether the raw transcript and content hash -// for a checkpoint session exist in the local v2 /full/current ref. +// for a checkpoint session exist in the local v2 /full refs. func (s *V2GitStore) HasFullSessionArtifacts(checkpointID id.CheckpointID, sessionIndex int) (bool, error) { artifacts, err := s.findFullSessionArtifacts(checkpointID, sessionIndex) if err != nil { @@ -401,13 +409,13 @@ func (s *V2GitStore) findFullSessionArtifacts(checkpointID id.CheckpointID, sess return fullSessionArtifacts{}, nil } -// FullSessionArtifactsIndex answers "does this session have complete /full/current +// FullSessionArtifactsIndex answers "does this session have complete /full // artifacts?" with an O(1) map lookup. Build it once via // BuildFullSessionArtifactsIndex. type FullSessionArtifactsIndex map[string]struct{} // Has reports whether the given session has a complete pair of -// raw_transcript and raw_transcript_hash.txt entries in /full/current. +// raw_transcript and raw_transcript_hash.txt entries in a /full ref. 
func (idx FullSessionArtifactsIndex) Has(checkpointID id.CheckpointID, sessionIndex int) bool { if idx == nil { return false @@ -420,7 +428,7 @@ func fullArtifactsIndexKey(checkpointID id.CheckpointID, sessionIndex int) strin return string(checkpointID) + "/" + strconv.Itoa(sessionIndex) } -// BuildFullSessionArtifactsIndex walks the /full/current ref's tree once and +// BuildFullSessionArtifactsIndex walks the local /full refs once and // records sessions whose subtree contains both raw_transcript[/.NNN] and // raw_transcript_hash.txt. Amortizes per-session HasFullSessionArtifacts // calls across the rest of the run. @@ -511,7 +519,17 @@ func sessionHasCompleteFullArtifacts(entries []object.TreeEntry) bool { } func (s *V2GitStore) fullRefSearchOrder() ([]plumbing.ReferenceName, error) { - return []plumbing.ReferenceName{plumbing.ReferenceName(paths.V2FullCurrentRefName)}, nil + archived, err := s.listArchivedFullRefs() + if err != nil { + return nil, err + } + + refNames := make([]plumbing.ReferenceName, 0, len(archived)+1) + refNames = append(refNames, plumbing.ReferenceName(paths.V2FullCurrentRefName)) + for i := len(archived) - 1; i >= 0; i-- { + refNames = append(refNames, archived[i]) + } + return refNames, nil } func (s *V2GitStore) inspectFullSessionArtifacts(refName plumbing.ReferenceName, checkpointID id.CheckpointID, sessionIndex int) (fullSessionArtifacts, error) { @@ -668,18 +686,12 @@ func (s *V2GitStore) updateCommittedMain(ctx context.Context, opts UpdateCommitt func (s *V2GitStore) updateCommittedFullTranscript(ctx context.Context, opts UpdateCommittedOptions, sessionIndex int) error { refName := plumbing.ReferenceName(paths.V2FullCurrentRefName) - existing, findErr := s.findFullSessionArtifacts(opts.CheckpointID, sessionIndex) - if findErr != nil { - return findErr - } - if existing.Found { - refName = existing.RefName + if _, err := s.inspectFullSessionArtifacts(refName, opts.CheckpointID, sessionIndex); err != nil { + return err } - if refName 
== plumbing.ReferenceName(paths.V2FullCurrentRefName) { - if err := s.ensureRef(ctx, refName); err != nil { - return fmt.Errorf("failed to ensure /full/current ref: %w", err) - } + if err := s.ensureRef(ctx, refName); err != nil { + return fmt.Errorf("failed to ensure /full/current ref: %w", err) } parentHash, rootTreeHash, err := s.GetRefState(refName) @@ -802,7 +814,7 @@ func (s *V2GitStore) writeCommittedMain(ctx context.Context, opts WriteCommitted } commitMsg := fmt.Sprintf("Checkpoint: %s\n", opts.CheckpointID) - if err := s.updateRef(ctx, refName, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail); err != nil { + if err := s.updateRefAt(ctx, refName, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail, commitTimeForWrite(opts)); err != nil { return 0, err } return sessionIndex, nil @@ -1053,7 +1065,7 @@ func (s *V2GitStore) writeCommittedFullTranscript(ctx context.Context, opts Writ } commitMsg := fmt.Sprintf("Checkpoint: %s\n", opts.CheckpointID) - if err := s.updateRef(ctx, refName, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail); err != nil { + if err := s.updateRefAt(ctx, refName, newTreeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail, commitTimeForWrite(opts)); err != nil { return err } diff --git a/cmd/entire/cli/checkpoint/v2_read.go b/cmd/entire/cli/checkpoint/v2_read.go index 5dd2b042f..5118ff0eb 100644 --- a/cmd/entire/cli/checkpoint/v2_read.go +++ b/cmd/entire/cli/checkpoint/v2_read.go @@ -260,7 +260,7 @@ func (s *V2GitStore) ReadSessionPrompts(ctx context.Context, checkpointID id.Che } // ReadSessionContent reads a session's metadata and prompts from the v2 /main ref, -// and the raw transcript (raw_transcript) from /full/current. +// and the raw transcript (raw_transcript) from local or remote /full refs. // This is the v2 equivalent of GitStore.ReadSessionContent — it reads the raw agent // transcript, not the compact transcript.jsonl. 
Used by resume and RestoreLogsOnly. // Returns ErrNoTranscript if the session exists but no raw transcript is available. diff --git a/cmd/entire/cli/checkpoint/v2_store.go b/cmd/entire/cli/checkpoint/v2_store.go index 513599730..a8be15efc 100644 --- a/cmd/entire/cli/checkpoint/v2_store.go +++ b/cmd/entire/cli/checkpoint/v2_store.go @@ -7,6 +7,7 @@ import ( "log/slog" "os/exec" "strings" + "time" "github.com/entireio/cli/cmd/entire/cli/logging" @@ -130,7 +131,11 @@ func commitTreeHashViaCLI(ctx context.Context, commitHash plumbing.Hash) (plumbi // updateRef creates a new commit on a ref with the given tree, updating the ref to point to it. func (s *V2GitStore) updateRef(ctx context.Context, refName plumbing.ReferenceName, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string) error { - commitHash, err := CreateCommit(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail) + return s.updateRefAt(ctx, refName, treeHash, parentHash, message, authorName, authorEmail, time.Now()) +} + +func (s *V2GitStore) updateRefAt(ctx context.Context, refName plumbing.ReferenceName, treeHash, parentHash plumbing.Hash, message, authorName, authorEmail string, commitTime time.Time) error { + commitHash, err := createCommitObject(ctx, s.repo, treeHash, parentHash, message, authorName, authorEmail, commitTime) if err != nil { return fmt.Errorf("failed to create commit: %w", err) } diff --git a/cmd/entire/cli/checkpoint/v2_store_test.go b/cmd/entire/cli/checkpoint/v2_store_test.go index b174f39e4..450f7fce0 100644 --- a/cmd/entire/cli/checkpoint/v2_store_test.go +++ b/cmd/entire/cli/checkpoint/v2_store_test.go @@ -174,7 +174,12 @@ func TestV2GitStore_UpdateRef_CreatesCommit(t *testing.T) { // v2MainTree returns the root tree from the /main ref for test assertions. 
func v2MainTree(t *testing.T, repo *git.Repository) *object.Tree { t.Helper() - ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + return v2TreeFromRef(t, repo, plumbing.ReferenceName(paths.V2MainRefName)) +} + +func v2TreeFromRef(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) *object.Tree { + t.Helper() + ref, err := repo.Reference(refName, true) require.NoError(t, err) commit, err := repo.CommitObject(ref.Hash()) require.NoError(t, err) @@ -507,13 +512,7 @@ func TestV2GitStore_WriteCommittedMain_MultiSession(t *testing.T) { // v2FullTree returns the root tree from the /full/current ref for test assertions. func v2FullTree(t *testing.T, repo *git.Repository) *object.Tree { t.Helper() - ref, err := repo.Reference(plumbing.ReferenceName(paths.V2FullCurrentRefName), true) - require.NoError(t, err) - commit, err := repo.CommitObject(ref.Hash()) - require.NoError(t, err) - tree, err := commit.Tree() - require.NoError(t, err) - return tree + return v2TreeFromRef(t, repo, plumbing.ReferenceName(paths.V2FullCurrentRefName)) } func TestV2GitStore_WriteCommittedFull_WritesTranscript(t *testing.T) { @@ -852,6 +851,42 @@ func TestV2GitStore_UpdateCommitted_NoTranscript_OnlyUpdatesMain(t *testing.T) { assert.Contains(t, content, "original") } +func TestV2GitStore_UpdateCommitted_WritesCurrentWhenOnlyArchiveHasArtifacts(t *testing.T) { + t.Parallel() + repo := initTestRepo(t) + store := NewV2GitStore(repo) + ctx := context.Background() + + cpID := id.MustCheckpointID("aa44bb55cc66") + require.NoError(t, store.WriteCommitted(ctx, WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "test-session-archive-update", + Strategy: commitTimeStrategy, + Agent: agent.AgentTypeClaudeCode, + Transcript: redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"archived"}`)), + AuthorName: commitTimeTestAuthor, + AuthorEmail: commitTimeTestEmail, + })) + archiveRefName := archiveV2FullCurrentRef(t, repo, "0000000000001") + 
resetV2FullCurrentRef(ctx, t, repo) + + require.NoError(t, store.UpdateCommitted(ctx, UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "test-session-archive-update", + Transcript: redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"current"}`)), + Agent: agent.AgentTypeClaudeCode, + })) + + currentTree := v2FullTree(t, repo) + currentContent := v2ReadFile(t, currentTree, cpID.Path()+"/0/"+paths.V2RawTranscriptFileName) + assert.Contains(t, currentContent, "current") + + archiveTree := v2TreeFromRef(t, repo, archiveRefName) + archivedContent := v2ReadFile(t, archiveTree, cpID.Path()+"/0/"+paths.V2RawTranscriptFileName) + assert.Contains(t, archivedContent, "archived") + assert.NotContains(t, archivedContent, "current") +} + func TestV2GitStore_UpdateCommitted_CheckpointNotFound(t *testing.T) { t.Parallel() repo := initTestRepo(t) @@ -976,6 +1011,34 @@ func TestV2GitStore_BuildFullSessionArtifactsIndex_AgreesWithHasFullSessionArtif assert.False(t, index.Has(missing, 0)) } +func TestV2GitStore_BuildFullSessionArtifactsIndex_IncludesArchivedGenerations(t *testing.T) { + t.Parallel() + repo := initTestRepo(t) + store := NewV2GitStore(repo) + ctx := context.Background() + + cpID := id.MustCheckpointID("ab12cd34ef56") + require.NoError(t, store.WriteCommitted(ctx, WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-archived-index", + Strategy: commitTimeStrategy, + Agent: agent.AgentTypeClaudeCode, + Transcript: redact.AlreadyRedacted([]byte(`{"archived":true}`)), + AuthorName: commitTimeTestAuthor, + AuthorEmail: commitTimeTestEmail, + })) + archiveV2FullCurrentRef(t, repo, "0000000000001") + resetV2FullCurrentRef(ctx, t, repo) + + index, err := store.BuildFullSessionArtifactsIndex() + require.NoError(t, err) + require.True(t, index.Has(cpID, 0)) + + hasArtifacts, err := store.HasFullSessionArtifacts(cpID, 0) + require.NoError(t, err) + require.True(t, hasArtifacts) +} + // A nil index — the documented test-only fallback — must not 
panic on Has. func TestV2GitStore_BuildFullSessionArtifactsIndex_NilSafe(t *testing.T) { t.Parallel() diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go index 82be73e7b..e053aa2a5 100644 --- a/cmd/migrate-v2-checkpoints/history.go +++ b/cmd/migrate-v2-checkpoints/history.go @@ -9,6 +9,7 @@ import ( "time" checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/cmd/entire/cli/trailers" "github.com/go-git/go-git/v6" @@ -178,10 +179,38 @@ func isHistoryRef(ref *plumbing.Reference) bool { if !name.IsBranch() && !name.IsRemote() { return false } + if isInternalHistoryRefName(name) { + return false + } return !strings.HasSuffix(name.String(), "/HEAD") } +func isInternalHistoryRefName(name plumbing.ReferenceName) bool { + if name == plumbing.NewBranchReferenceName(paths.MetadataBranchName) || + name == plumbing.NewBranchReferenceName(paths.TrailsBranchName) { + return true + } + + remotePrefix := "refs/remotes/" + nameString := name.String() + if !strings.HasPrefix(nameString, remotePrefix) { + return false + } + remoteAndBranch := strings.TrimPrefix(nameString, remotePrefix) + _, branchName, ok := strings.Cut(remoteAndBranch, "/") + if !ok { + return false + } + return branchName == paths.MetadataBranchName || branchName == paths.TrailsBranchName +} + func resolveRevision(repo *git.Repository, revision string) (plumbing.Hash, error) { + if isShortHexRevision(revision) { + if err := rejectAmbiguousCommitPrefix(repo, revision); err != nil { + return plumbing.ZeroHash, err + } + } + hash, err := repo.ResolveRevision(plumbing.Revision(revision)) if err != nil { return plumbing.ZeroHash, err //nolint:wrapcheck // callers add flag-specific context @@ -192,6 +221,48 @@ func resolveRevision(repo *git.Repository, revision string) (plumbing.Hash, erro return *hash, nil } +func isShortHexRevision(revision string) bool { + if revision == "" || len(revision) 
>= len(plumbing.ZeroHash.String()) { + return false + } + for _, r := range revision { + switch { + case r >= '0' && r <= '9': + case r >= 'a' && r <= 'f': + case r >= 'A' && r <= 'F': + default: + return false + } + } + return true +} + +func rejectAmbiguousCommitPrefix(repo *git.Repository, revision string) error { + prefix := strings.ToLower(revision) + iter, err := repo.CommitObjects() + if err != nil { + return fmt.Errorf("list commit objects for revision %q: %w", revision, err) + } + defer iter.Close() + + var matches []plumbing.Hash + if err := iter.ForEach(func(commit *object.Commit) error { + if strings.HasPrefix(commit.Hash.String(), prefix) { + matches = append(matches, commit.Hash) + } + return nil + }); err != nil { + return fmt.Errorf("scan commit objects for revision %q: %w", revision, err) + } + if len(matches) < 2 { + return nil + } + sort.Slice(matches, func(i, j int) bool { + return matches[i].String() < matches[j].String() + }) + return fmt.Errorf("ambiguous revision %q matches commit prefixes %s and %s", revision, matches[0], matches[1]) +} + func reachableCommits(ctx context.Context, repo *git.Repository, from plumbing.Hash) (map[plumbing.Hash]bool, error) { iter, err := repo.Log(&git.LogOptions{From: from, Order: git.LogOrderCommitterTime}) if err != nil { @@ -269,7 +340,7 @@ func addCheckpointCommit(commit *object.Commit, checkpointIndexes map[string]int discovered := discoveredCommit{ Hash: commit.Hash, ShortSHA: shortHash(commit.Hash), - Date: commit.Author.When, + Date: commit.Committer.When, } for _, id := range ids { diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 39cc65fb3..8b44674ae 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -3,6 +3,7 @@ package main import ( "bytes" "context" + "fmt" "os" "os/exec" "path/filepath" @@ -20,6 +21,7 @@ import ( "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" + 
"github.com/go-git/go-git/v6/plumbing/object" "github.com/stretchr/testify/require" ) @@ -47,6 +49,7 @@ const ( testAuthorEmail = "test@example.com" testBranchName = "main" testReviewSkill = "review-skill" + testToolUseID = "toolu_test123" ) func TestParseOptions(t *testing.T) { @@ -133,6 +136,68 @@ func TestDiscoverCheckpointHistory_HeadMustContainSince(t *testing.T) { require.ErrorContains(t, err, "is not an ancestor of --head") } +func TestDiscoverCheckpointHistory_ExcludesInternalRefs(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + runMigrationGit(t, fixture.dir, "checkout", "-b", paths.MetadataBranchName, fixture.mainHash.String()) + commitMigrationTestFile(t, fixture.dir, "internal.txt", "internal\n", + "internal checkpoint\n\nEntire-Checkpoint: "+unrelatedCheckpointID) + + repo, err := git.PlainOpen(fixture.dir) + require.NoError(t, err) + checkpoints, err := discoverCheckpointHistory(context.Background(), repo, discoveryOptions{ + since: fixture.baseHash.String(), + }) + require.NoError(t, err) + + require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) +} + +func TestResolveRevisionRejectsAmbiguousShortCommitPrefix(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + prefixes := map[string]struct{}{} + ambiguousPrefix := "" + for i := range 17 { + hash := commitMigrationTestFile(t, fixture.dir, fmt.Sprintf("ambiguous-%02d.txt", i), fmt.Sprintf("%d\n", i), fmt.Sprintf("ambiguous %d", i)) + prefix := hash.String()[:1] + if _, exists := prefixes[prefix]; exists { + ambiguousPrefix = prefix + break + } + prefixes[prefix] = struct{}{} + } + require.NotEmpty(t, ambiguousPrefix) + + repo, err := git.PlainOpen(fixture.dir) + require.NoError(t, err) + _, err = resolveRevision(repo, ambiguousPrefix) + require.ErrorContains(t, err, "ambiguous revision") +} + +func TestAddCheckpointCommitUsesCommitterTime(t *testing.T) { + t.Parallel() + + 
authorTime := time.Date(2024, 1, 2, 3, 4, 5, 0, time.UTC) + committerTime := time.Date(2024, 2, 3, 4, 5, 6, 0, time.UTC) + commit := &object.Commit{ + Hash: plumbing.NewHash("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), + Author: object.Signature{When: authorTime}, + Committer: object.Signature{When: committerTime}, + Message: "commit\n\nEntire-Checkpoint: " + mainCheckpointID, + } + checkpointIndexes := map[string]int{} + checkpoints := []discoveredCheckpoint{} + + addCheckpointCommit(commit, checkpointIndexes, &checkpoints) + + require.Len(t, checkpoints, 1) + require.Len(t, checkpoints[0].Commits, 1) + require.Equal(t, committerTime, checkpoints[0].Commits[0].Date) +} + func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { t.Parallel() @@ -214,7 +279,7 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.Equal(t, agent.AgentTypeClaudeCode, content.Metadata.Agent) require.Equal(t, "claude-test-model", content.Metadata.Model) require.Equal(t, "turn-1", content.Metadata.TurnID) - require.Equal(t, 0, content.Metadata.CheckpointTranscriptStart) + require.Equal(t, 9, content.Metadata.CheckpointTranscriptStart) require.Equal(t, string(session.KindAgentReview), content.Metadata.Kind) require.Equal(t, []string{testReviewSkill}, content.Metadata.ReviewSkills) require.Equal(t, "review this", content.Metadata.ReviewPrompt) @@ -239,23 +304,28 @@ func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { stdout := runMigrationCommand(t, fixture, fixture.mainHash, testDryRunFlag) require.Contains(t, stdout, "Migration plan:") - require.Contains(t, stdout, "checkpoints with raw transcripts: 1") - require.Contains(t, stdout, "sessions with raw transcripts: 1") + require.Contains(t, stdout, "checkpoints eligible for migration: 1") + require.Contains(t, stdout, "sessions eligible for migration: 1") summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) require.NoError(t, err) require.Nil(t, summary) } -func 
TestRunApplySkipsExistingV1Checkpoint(t *testing.T) { +func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) { t.Parallel() fixture := setupMigrationHistoryRepo(t) cpID := id.MustCheckpointID(mainCheckpointID) writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ CheckpointID: cpID, - SessionID: "session-v2", - Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2\"}\n")), + SessionID: "session-existing-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-v2-missing-from-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 new\"}\n")), }) existingTranscript := []byte("{\"message\":\"already v1\"}\n") @@ -272,14 +342,53 @@ func TestRunApplySkipsExistingV1Checkpoint(t *testing.T) { require.NoError(t, err) stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) - require.Contains(t, stdout, "already present in v1: 1") - require.Contains(t, stdout, "migrated checkpoints: 0") - require.Contains(t, stdout, "migrated sessions: 0") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "migrated checkpoints: 1") + require.Contains(t, stdout, "migrated sessions: 1") - content, err := checkpoint.NewGitStore(fixture.repo).ReadSessionContent(context.Background(), cpID, 0) + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Len(t, summary.Sessions, 2) + content, err := v1Store.ReadSessionContentByID(context.Background(), cpID, "session-existing-v1") require.NoError(t, err) require.Equal(t, existingTranscript, content.Transcript) require.Equal(t, "session-existing-v1", content.Metadata.SessionID) + content, err = v1Store.ReadSessionContentByID(context.Background(), cpID, 
"session-v2-missing-from-v1") + require.NoError(t, err) + require.JSONEq(t, `{"message":"from v2 new"}`, string(content.Transcript)) +} + +func TestRunApplyMigratesTaskMetadata(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "task-session", + IsTask: true, + ToolUseID: testToolUseID, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"task\"}\n")), + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + require.Contains(t, stdout, "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.True(t, content.Metadata.IsTask) + require.Equal(t, testToolUseID, content.Metadata.ToolUseID) + + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + tree, err := commit.Tree() + require.NoError(t, err) + _, err = tree.File(cpID.Path() + "/tasks/" + testToolUseID + "/checkpoint.json") + require.NoError(t, err) } func TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { @@ -331,10 +440,11 @@ func TestRunDryRunReportsMissingV2MetadataAndRawTranscripts(t *testing.T) { }, &stdout) require.NoError(t, err) - require.Contains(t, stdout.String(), "missing v2 metadata: 2") + require.Contains(t, stdout.String(), "missing v2 checkpoint metadata: 2") + require.Contains(t, stdout.String(), "missing required v2 session metadata: 0") require.Contains(t, stdout.String(), "missing raw transcripts: 1") - require.Contains(t, stdout.String(), "checkpoints with raw transcripts: 0") - require.Contains(t, stdout.String(), "sessions with raw transcripts: 0") + require.Contains(t, 
stdout.String(), "checkpoints eligible for migration: 0") + require.Contains(t, stdout.String(), "sessions eligible for migration: 0") } type migrationHistoryFixture struct { diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 82c60ec20..3094511fa 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -18,20 +18,22 @@ type migrationOptions struct { } type migrationReport struct { - DiscoveredChecks int - ExistingV1Checkpoints int - MissingV2Metadata int - MissingRawTranscripts int - PlannedCheckpoints int - PlannedSessions int - MigratedCheckpoints int - MigratedSessions int + DiscoveredCheckpoints int + ExistingV1Sessions int + MissingV2CheckpointMetadata int + MissingV2SessionMetadata int + MissingRawTranscripts int + EligibleCheckpoints int + EligibleSessions int + MigratedCheckpoints int + MigratedSessions int } type checkpointMigrator struct { v1Store *checkpoint.GitStore v2Store *checkpoint.V2GitStore opts migrationOptions + fullIndex checkpoint.FullSessionArtifactsIndex authorName string authorEmail string report *migrationReport @@ -39,11 +41,21 @@ type checkpointMigrator struct { func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, discovered []discoveredCheckpoint, opts migrationOptions) (migrationReport, error) { authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) - report := migrationReport{DiscoveredChecks: len(discovered)} + v2Store := checkpoint.NewV2GitStore(repo) + report := migrationReport{DiscoveredCheckpoints: len(discovered)} + var fullIndex checkpoint.FullSessionArtifactsIndex + if !opts.apply { + var err error + fullIndex, err = v2Store.BuildFullSessionArtifactsIndex() + if err != nil { + return report, fmt.Errorf("build v2 full artifact index: %w", err) + } + } migrator := checkpointMigrator{ v1Store: checkpoint.NewGitStore(repo), - v2Store: checkpoint.NewV2GitStore(repo), + v2Store: v2Store, opts: opts, + 
fullIndex: fullIndex, authorName: authorName, authorEmail: authorEmail, report: &report, @@ -54,11 +66,12 @@ func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, dis if err != nil { return report, err } - if migratedSessions > 0 { - report.PlannedCheckpoints++ - if opts.apply { - report.MigratedCheckpoints++ - } + if migratedSessions == 0 { + continue + } + report.EligibleCheckpoints++ + if opts.apply { + report.MigratedCheckpoints++ } } return report, nil @@ -69,9 +82,9 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di if err != nil { return 0, fmt.Errorf("read v1 checkpoint %s: %w", discovered.ID, err) } - if existing != nil { - m.report.ExistingV1Checkpoints++ - return 0, nil + existingSessionIDs, err := m.existingV1SessionIDs(ctx, discovered, existing) + if err != nil { + return 0, err } summary, err := m.v2Store.ReadCommitted(ctx, discovered.ID) @@ -79,13 +92,30 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return 0, fmt.Errorf("read v2 checkpoint %s: %w", discovered.ID, err) } if summary == nil || len(summary.Sessions) == 0 { - m.report.MissingV2Metadata++ + m.report.MissingV2CheckpointMetadata++ return 0, nil } migratedSessions := 0 for sessionIndex := range summary.Sessions { - content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) + metadataContent, err := m.readV2SessionMetadata(ctx, discovered, sessionIndex) + if err != nil { + if errors.Is(err, checkpoint.ErrCheckpointNotFound) { + m.report.MissingV2SessionMetadata++ + continue + } + return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) + } + if !hasRequiredV2Metadata(metadataContent) { + m.report.MissingV2SessionMetadata++ + continue + } + if _, exists := existingSessionIDs[metadataContent.Metadata.SessionID]; exists { + m.report.ExistingV1Sessions++ + continue + } + + content, err := m.readV2SessionContent(ctx, 
discovered, sessionIndex, metadataContent) if err != nil { if errors.Is(err, checkpoint.ErrNoTranscript) { m.report.MissingRawTranscripts++ @@ -94,11 +124,11 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } if !hasRequiredV2Metadata(content) { - m.report.MissingV2Metadata++ + m.report.MissingV2SessionMetadata++ continue } - m.report.PlannedSessions++ + m.report.EligibleSessions++ if m.opts.apply { writeOpts := writeOptionsFromV2Content(content, summary, m.authorName, m.authorEmail) if err := m.v1Store.WriteCommitted(ctx, writeOpts); err != nil { @@ -111,6 +141,43 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return migratedSessions, nil } +func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { + existing := make(map[string]struct{}) + if summary == nil { + return existing, nil + } + for sessionIndex := range summary.Sessions { + content, err := m.v1Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) + if err != nil { + return nil, fmt.Errorf("read v1 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) + } + if content.Metadata.SessionID == "" { + continue + } + existing[content.Metadata.SessionID] = struct{}{} + } + return existing, nil +} + +func (m checkpointMigrator) readV2SessionMetadata(ctx context.Context, discovered discoveredCheckpoint, sessionIndex int) (*checkpoint.SessionContent, error) { + content, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) + if err != nil { + return nil, fmt.Errorf("read v2 session metadata and prompts: %w", err) + } + return content, nil +} + +func (m checkpointMigrator) readV2SessionContent(ctx context.Context, discovered discoveredCheckpoint, sessionIndex 
int, metadataContent *checkpoint.SessionContent) (*checkpoint.SessionContent, error) { + if m.opts.apply || !m.fullIndex.Has(discovered.ID, sessionIndex) { + content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) + if err != nil { + return nil, fmt.Errorf("read full v2 session content: %w", err) + } + return content, nil + } + return metadataContent, nil +} + func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" } @@ -133,8 +200,10 @@ func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *chec Agent: meta.Agent, Model: meta.Model, TurnID: meta.TurnID, + IsTask: meta.IsTask, + ToolUseID: meta.ToolUseID, TranscriptIdentifierAtStart: meta.TranscriptIdentifierAtStart, - CheckpointTranscriptStart: 0, + CheckpointTranscriptStart: meta.GetTranscriptStart(), TokenUsage: meta.TokenUsage, SessionMetrics: meta.SessionMetrics, InitialAttribution: meta.InitialAttribution, @@ -154,12 +223,13 @@ func writeMigrationReport(w io.Writer, report migrationReport, applied bool) { } else { fmt.Fprintln(w, "Migration plan:") } - fmt.Fprintf(w, " discovered checkpoint trailers: %d\n", report.DiscoveredChecks) - fmt.Fprintf(w, " already present in v1: %d\n", report.ExistingV1Checkpoints) - fmt.Fprintf(w, " missing v2 metadata: %d\n", report.MissingV2Metadata) + fmt.Fprintf(w, " discovered checkpoints: %d\n", report.DiscoveredCheckpoints) + fmt.Fprintf(w, " already present v1 sessions: %d\n", report.ExistingV1Sessions) + fmt.Fprintf(w, " missing v2 checkpoint metadata: %d\n", report.MissingV2CheckpointMetadata) + fmt.Fprintf(w, " missing required v2 session metadata: %d\n", report.MissingV2SessionMetadata) fmt.Fprintf(w, " missing raw transcripts: %d\n", report.MissingRawTranscripts) - fmt.Fprintf(w, " checkpoints with raw transcripts: %d\n", report.PlannedCheckpoints) - fmt.Fprintf(w, " sessions with raw transcripts: %d\n", report.PlannedSessions) 
+ fmt.Fprintf(w, " checkpoints eligible for migration: %d\n", report.EligibleCheckpoints) + fmt.Fprintf(w, " sessions eligible for migration: %d\n", report.EligibleSessions) if applied { fmt.Fprintf(w, " migrated checkpoints: %d\n", report.MigratedCheckpoints) fmt.Fprintf(w, " migrated sessions: %d\n", report.MigratedSessions) diff --git a/scripts/migrate-v2-checkpoints-to-v1.sh b/scripts/migrate-v2-checkpoints-to-v1.sh deleted file mode 100755 index 94e82f735..000000000 --- a/scripts/migrate-v2-checkpoints-to-v1.sh +++ /dev/null @@ -1,1058 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# -# migrate-v2-checkpoints-to-v1.sh - Migrate legacy v2 checkpoints to v1. -# -# USAGE: -# ./scripts/migrate-v2-checkpoints-to-v1.sh [OPTIONS] [SINCE_COMMIT] -# -# OPTIONS: -# -h, --help Show this help message -# --list Print checkpoint IDs and associated commit IDs only -# --dry-run Print every v2 folder/file that would be migrated -# --apply Write local refs/heads/entire/checkpoints/v1 migration commits -# --repo Local repository path to inspect -# --since Commit before the checkpoints to inspect -# --head Limit scan to one history tip (default: all branches/remotes) -# -# DESCRIPTION: -# Standalone helper for converting legacy checkpoints v2 data back to the v1 -# checkpoint format. The script finds commits newer than SINCE_COMMIT on local -# branches/remotes (or on --head, when supplied), extracts Entire-Checkpoint -# trailers, and locates the v2 /full folders/files that contain raw transcripts: -# -# refs/entire/checkpoints/v2/full/*://raw_transcript* -# -# The default mode prints a migration plan without writing refs. --dry-run -# prints every source folder/file. --apply writes one local migration commit -# per checkpoint to refs/heads/entire/checkpoints/v1. -# -# If --repo or SINCE_COMMIT is omitted, the script prompts for it. 
-# - -V1_REF="refs/heads/entire/checkpoints/v1" -V2_MAIN_REF="refs/entire/checkpoints/v2/main" -V2_FULL_REF_PREFIX="refs/entire/checkpoints/v2/full" -TRAILER_KEY="Entire-Checkpoint" - -since_commit="" -head_commitish="" -repo_path="" -dry_run=false -apply=false -list_mode=false -tmp_dir="" -plan_entries_file="" -checkpoint_commits_file="" -checkpoint_ids_file="" -checkpoint_paths_file="" -full_artifacts_file="" -raw_sessions_file="" -raw_checkpoint_ids_file="" -main_metadata_file="" - -show_help() { - sed -n '5,/^$/p' "$0" | sed -E 's/^# ?//' -} - -die() { - printf 'error: %s\n' "$*" >&2 - exit 1 -} - -warn() { - printf 'warning: %s\n' "$*" >&2 -} - -cleanup() { - if [[ -n "$tmp_dir" && -d "$tmp_dir" ]]; then - rm -rf "$tmp_dir" - fi -} -trap cleanup EXIT - -checkpoint_to_path() { - local checkpoint_id="$1" - printf '%s/%s' "${checkpoint_id:0:2}" "${checkpoint_id:2}" -} - -list_full_refs() { - git for-each-ref --format='%(refname)' "$V2_FULL_REF_PREFIX" | - sort | - awk -v current="${V2_FULL_REF_PREFIX}/current" ' - $0 == current { current_ref = $0; next } - { refs[++n] = $0 } - END { - if (current_ref != "") { - print current_ref - } - for (i = n; i >= 1; i--) { - print refs[i] - } - } - ' -} - -find_bulk_migration_ancestor() { - local head_hash="$1" - - git log --first-parent --format='%H%x09%s' "$head_hash" | - awk -F '\t' '$2 == "Migrate checkpoints v2 to v1" { print $1; exit }' -} - -write_checkpoint_commit_index_between() { - local since="$1" - local head="$2" - local output_file="$3" - - git log --format='__ENTIRE_COMMIT__%H%n%B' "${since}..${head}" | - awk -v key="$TRAILER_KEY" ' - /^__ENTIRE_COMMIT__/ { - commit = substr($0, length("__ENTIRE_COMMIT__") + 1) - next - } - { - line = $0 - pattern = "^[[:space:]]*" key ":[[:space:]]*([0-9a-f]{12})[[:space:]]*$" - if (line ~ pattern) { - sub("^[[:space:]]*" key ":[[:space:]]*", "", line) - sub("[[:space:]]*$", "", line) - if (commit != "" && !seen[line SUBSEP commit]++) { - print line "\t" commit - } - } - } - 
' > "$output_file" -} - -write_checkpoint_commit_index_from_all_refs() { - local since="$1" - local output_file="$2" - local refs_file="$tmp_dir/refs_containing_since" - - : > "$refs_file" - git for-each-ref --contains "$since" --format='%(refname)' refs/heads refs/remotes > "$refs_file" - if git merge-base --is-ancestor "$since" HEAD 2>/dev/null; then - printf 'HEAD\n' >> "$refs_file" - fi - sort -u "$refs_file" -o "$refs_file" - - if [[ ! -s "$refs_file" ]]; then - : > "$output_file" - return - fi - - xargs git log --format='__ENTIRE_COMMIT__%H%n%B' "$since".. < "$refs_file" | - awk -v key="$TRAILER_KEY" ' - /^__ENTIRE_COMMIT__/ { - commit = substr($0, length("__ENTIRE_COMMIT__") + 1) - next - } - { - line = $0 - pattern = "^[[:space:]]*" key ":[[:space:]]*([0-9a-f]{12})[[:space:]]*$" - if (line ~ pattern) { - sub("^[[:space:]]*" key ":[[:space:]]*", "", line) - sub("[[:space:]]*$", "", line) - if (commit != "" && !seen[line SUBSEP commit]++) { - print line "\t" commit - } - } - } - ' > "$output_file" -} - -write_unique_mktree_input() { - local entries_file="$1" - awk -F '\t' 'NF >= 2 && !seen[$2]++ { print }' "$entries_file" | - sort -k2 -} - -build_v1_tree_from_entries() { - local base_tree_hash="$1" - local entries_file="$2" - local combined index_file - combined="$tmp_dir/combined_index_info" - index_file="$tmp_dir/migration.index" - - rm -f "$index_file" - cat "$entries_file" > "$combined" - if [[ -n "$base_tree_hash" ]]; then - git ls-tree -r "$base_tree_hash" >> "$combined" - fi - - write_unique_mktree_input "$combined" | - GIT_INDEX_FILE="$index_file" git update-index --index-info - GIT_INDEX_FILE="$index_file" git write-tree -} - -write_original_associated_commit_trailers() { - local checkpoint_id="$1" - - awk -F '\t' -v checkpoint_id="$checkpoint_id" ' - NF >= 2 && $1 == checkpoint_id && !seen[$2]++ { - printf "Original-Associated-Commit: %s\n", $2 - } - ' "$checkpoint_commits_file" -} - -write_v1_checkpoint_migration_message() { - local 
checkpoint_id="$1" - - printf 'Checkpoint: %s\n\n' "$checkpoint_id" - printf 'Migrated from checkpoints v2.\n' - write_original_associated_commit_trailers "$checkpoint_id" - printf 'Source refs: %s and %s/*\n' "$V2_MAIN_REF" "$V2_FULL_REF_PREFIX" -} - -create_v1_checkpoint_migration_commit() { - local checkpoint_id="$1" - local tree_hash="$2" - local parent_hash="$3" - local commit_date="$4" - local commit_hash message_file - - message_file="$tmp_dir/commit-message-${checkpoint_id}" - write_v1_checkpoint_migration_message "$checkpoint_id" > "$message_file" - - if [[ -n "$parent_hash" ]]; then - if [[ -n "$commit_date" ]]; then - commit_hash=$(GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" -p "$parent_hash" < "$message_file") - else - commit_hash=$(git commit-tree "$tree_hash" -p "$parent_hash" < "$message_file") - fi - else - if [[ -n "$commit_date" ]]; then - commit_hash=$(GIT_AUTHOR_DATE="$commit_date" GIT_COMMITTER_DATE="$commit_date" git commit-tree "$tree_hash" < "$message_file") - else - commit_hash=$(git commit-tree "$tree_hash" < "$message_file") - fi - fi - - printf '%s\n' "$commit_hash" -} - -write_checkpoint_id_files() { - awk -F '\t' 'NF >= 2 && !seen[$1]++ { print $1 }' "$checkpoint_commits_file" > "$checkpoint_ids_file" - awk 'NF { print $0 "\t" substr($0, 1, 2) "/" substr($0, 3) }' "$checkpoint_ids_file" > "$checkpoint_paths_file" -} - -write_apply_checkpoint_ids() { - local output_file="$1" - - awk ' - NR == FNR { - raw[$1] = 1 - next - } - NF && ($1 in raw) { - ids[++n] = $1 - } - END { - for (i = n; i >= 1; i--) { - print ids[i] - } - } - ' "$raw_checkpoint_ids_file" "$checkpoint_ids_file" > "$output_file" -} - -write_bulk_migration_entries() { - local bulk_commit="$1" - local parent_commit="$2" - local output_entries_file="$3" - local output_ids_file="$4" - local changed_paths_file checkpoint_dirs_file - - changed_paths_file="$tmp_dir/bulk_changed_paths" - 
checkpoint_dirs_file="$tmp_dir/bulk_checkpoint_dirs" - - if [[ -n "$parent_commit" ]]; then - git diff --name-only "$parent_commit" "$bulk_commit" > "$changed_paths_file" - else - git ls-tree -r --name-only "$bulk_commit" > "$changed_paths_file" - fi - - awk -F '/' ' - NF >= 3 && $1 ~ /^[0-9a-f][0-9a-f]$/ && $2 ~ /^[0-9a-f]{10}$/ { - dir = $1 "/" $2 - if (!seen[dir]++) { - print dir - } - } - ' "$changed_paths_file" > "$checkpoint_dirs_file" - - awk -F '/' 'NF >= 2 { print $1 $2 }' "$checkpoint_dirs_file" > "$output_ids_file" - - git ls-tree -r "$bulk_commit" | - awk -F '\t' -v checkpoint_dirs_file="$checkpoint_dirs_file" ' - BEGIN { - while ((getline dir < checkpoint_dirs_file) > 0) { - prefixes[dir "/"] = 1 - } - close(checkpoint_dirs_file) - } - NF >= 2 { - for (prefix in prefixes) { - if (index($2, prefix) == 1) { - print - next - } - } - } - ' > "$output_entries_file" -} - -write_checkpoint_entries() { - local checkpoint_id="$1" - local output_file="$2" - local source_entries_file="$3" - local checkpoint_path - - checkpoint_path=$(checkpoint_to_path "$checkpoint_id") - awk -F '\t' -v prefix="${checkpoint_path}/" 'NF >= 2 && index($2, prefix) == 1 { print }' "$source_entries_file" > "$output_file" -} - -write_checkpoint_commit_dates() { - local source_entries_file="$1" - local output_file="$2" - - command -v python3 >/dev/null 2>&1 || die "python3 is required for --apply metadata date extraction" - - python3 - "$source_entries_file" "$output_file" <<'PY' -import json -import re -import subprocess -import sys - -source_entries_file, output_file = sys.argv[1:] -metadata_re = re.compile(r"^([0-9a-f]{2})/([0-9a-f]{10})/[0-9]+/metadata\.json$") - -records = [] -with open(source_entries_file, "r", encoding="utf-8") as f: - for line in f: - line = line.rstrip("\n") - if not line or "\t" not in line: - continue - object_info, path = line.split("\t", 1) - match = metadata_re.match(path) - if not match: - continue - object_parts = object_info.split() - if 
len(object_parts) != 3 or object_parts[1] != "blob": - continue - checkpoint_id = match.group(1) + match.group(2) - records.append((checkpoint_id, object_parts[2])) - -if not records: - open(output_file, "w", encoding="utf-8").close() - sys.exit(0) - -batch_input = "".join(blob + "\n" for _, blob in records).encode("ascii") -batch = subprocess.run( - ["git", "cat-file", "--batch"], - input=batch_input, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, -) -if batch.returncode != 0: - sys.stderr.write(batch.stderr.decode("utf-8", errors="replace")) - sys.exit(batch.returncode) - -dates = {} -out = batch.stdout -offset = 0 -for checkpoint_id, blob in records: - header_end = out.find(b"\n", offset) - if header_end < 0: - sys.stderr.write(f"missing git cat-file header for {blob}\n") - sys.exit(1) - header = out[offset:header_end].decode("ascii", errors="replace") - offset = header_end + 1 - parts = header.split() - if len(parts) < 3 or parts[1] != "blob": - sys.stderr.write(f"unexpected git cat-file header for {blob}: {header}\n") - sys.exit(1) - size = int(parts[2]) - data = out[offset:offset + size] - offset += size - if offset >= len(out) or out[offset:offset + 1] != b"\n": - sys.stderr.write(f"missing git cat-file record separator for {blob}\n") - sys.exit(1) - offset += 1 - - metadata = json.loads(data.decode("utf-8")) - created_at = metadata.get("created_at") - if created_at and (checkpoint_id not in dates or created_at < dates[checkpoint_id]): - dates[checkpoint_id] = created_at - -with open(output_file, "w", encoding="utf-8") as f: - for checkpoint_id in sorted(dates): - f.write(f"{checkpoint_id}\t{dates[checkpoint_id]}\n") -PY -} - -order_checkpoint_ids_by_date() { - local checkpoint_ids_source_file="$1" - local checkpoint_dates_file="$2" - local output_file="$3" - - awk -F '\t' ' - NR == FNR { - date[$1] = $2 - next - } - NF { - order++ - sort_date = ($1 in date && date[$1] != "") ? 
date[$1] : sprintf("9999-12-31T23:59:59Z-%09d", order) - print sort_date "\t" sprintf("%09d", order) "\t" $1 - } - ' "$checkpoint_dates_file" "$checkpoint_ids_source_file" | - sort -t $'\t' -k1,1 -k2,2 | - cut -f3 > "$output_file" -} - -checkpoint_commit_date() { - local checkpoint_id="$1" - local checkpoint_dates_file="$2" - - awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id { print $2; exit }' "$checkpoint_dates_file" -} - -write_full_artifact_index() { - local full_ref - : > "$full_artifacts_file" - : > "$raw_sessions_file" - : > "$raw_checkpoint_ids_file" - - while IFS= read -r full_ref; do - [[ -n "$full_ref" ]] || continue - git ls-tree -r "$full_ref" | - awk -F '\t' \ - -v ref="$full_ref" \ - -v checkpoint_paths_file="$checkpoint_paths_file" \ - -v full_artifacts_file="$full_artifacts_file" \ - -v raw_sessions_file="$raw_sessions_file" \ - -v raw_checkpoint_ids_file="$raw_checkpoint_ids_file" \ - -v plan_entries_file="$plan_entries_file" ' - BEGIN { - while ((getline line < checkpoint_paths_file) > 0) { - split(line, fields, "\t") - checkpoint_by_path[fields[2]] = fields[1] - } - } - NF >= 2 { - meta = $1 - path = $2 - n = split(path, parts, "/") - if (n != 4) { - next - } - checkpoint_path = parts[1] "/" parts[2] - if (!(checkpoint_path in checkpoint_by_path)) { - next - } - session_index = parts[3] - artifact = parts[4] - if (artifact == "raw_transcript") { - target = checkpoint_path "/" session_index "/full.jsonl" - } else if (artifact ~ /^raw_transcript\.[0-9][0-9][0-9]$/) { - suffix = artifact - sub(/^raw_transcript/, "", suffix) - target = checkpoint_path "/" session_index "/full.jsonl" suffix - } else if (artifact == "raw_transcript_hash.txt") { - target = checkpoint_path "/" session_index "/content_hash.txt" - } else { - next - } - checkpoint_id = checkpoint_by_path[checkpoint_path] - print checkpoint_id "\t" checkpoint_path "\t" session_index "\t" ref "\t" artifact "\t" path "\t" target "\t" meta >> full_artifacts_file - print 
checkpoint_id "\t" checkpoint_path "\t" session_index >> raw_sessions_file - print checkpoint_id >> raw_checkpoint_ids_file - print meta "\t" target >> plan_entries_file - } - ' - done <<< "$full_refs" - - sort -u "$raw_sessions_file" -o "$raw_sessions_file" - awk 'NF && !seen[$0]++' "$raw_checkpoint_ids_file" > "${raw_checkpoint_ids_file}.tmp" - mv "${raw_checkpoint_ids_file}.tmp" "$raw_checkpoint_ids_file" -} - -write_main_metadata_index() { - : > "$main_metadata_file" - if [[ "$main_ref_available" != "true" ]]; then - return - fi - - git ls-tree -r "$V2_MAIN_REF" | - awk -F '\t' \ - -v checkpoint_paths_file="$checkpoint_paths_file" \ - -v raw_sessions_file="$raw_sessions_file" \ - -v main_metadata_file="$main_metadata_file" \ - -v plan_entries_file="$plan_entries_file" ' - BEGIN { - while ((getline line < checkpoint_paths_file) > 0) { - split(line, fields, "\t") - checkpoint_by_path[fields[2]] = fields[1] - } - while ((getline line < raw_sessions_file) > 0) { - split(line, fields, "\t") - session_wanted[fields[2] "/" fields[3]] = 1 - checkpoint_has_raw[fields[2]] = 1 - } - } - NF >= 2 { - meta = $1 - path = $2 - n = split(path, parts, "/") - checkpoint_path = parts[1] "/" parts[2] - if (!(checkpoint_path in checkpoint_by_path)) { - next - } - checkpoint_id = checkpoint_by_path[checkpoint_path] - if (n == 3 && parts[3] == "metadata.json") { - if (!(checkpoint_path in checkpoint_has_raw)) { - next - } - print checkpoint_id "\t" checkpoint_path "\t-\tcheckpoint_metadata\t" path "\t" meta >> main_metadata_file - next - } - if (n == 4 && (parts[4] == "metadata.json" || parts[4] == "prompt.txt")) { - session_key = checkpoint_path "/" parts[3] - if (!(session_key in session_wanted)) { - next - } - kind = parts[4] == "metadata.json" ? 
"session_metadata" : "prompt" - print checkpoint_id "\t" checkpoint_path "\t" parts[3] "\t" kind "\t" path "\t" meta >> main_metadata_file - print meta "\t" path >> plan_entries_file - } - } - ' -} - -append_checkpoint_metadata_plan_entries() { - if ! awk -F '\t' '$4 == "checkpoint_metadata" { found = 1; exit } END { exit found ? 0 : 1 }' "$main_metadata_file"; then - return - fi - - if [[ "$apply" == "true" ]]; then - rewrite_checkpoint_metadata_plan_entries - return - fi - - awk -F '\t' '$4 == "checkpoint_metadata" { print $6 "\t" $5 }' "$main_metadata_file" >> "$plan_entries_file" -} - -rewrite_checkpoint_metadata_plan_entries() { - command -v python3 >/dev/null 2>&1 || die "python3 is required for --apply metadata rewriting" - - local rewrite_dir rewrite_manifest rewrite_paths rewrite_hashes - rewrite_dir="$tmp_dir/rewritten-checkpoint-metadata" - rewrite_manifest="$tmp_dir/rewritten-checkpoint-metadata.tsv" - rewrite_paths="$tmp_dir/rewritten-checkpoint-metadata.paths" - rewrite_hashes="$tmp_dir/rewritten-checkpoint-metadata.hashes" - - mkdir -p "$rewrite_dir" - - python3 - "$main_metadata_file" "$raw_sessions_file" "$rewrite_dir" "$rewrite_manifest" "$rewrite_paths" <<'PY' -import json -import os -import subprocess -import sys - -main_metadata_file, raw_sessions_file, rewrite_dir, rewrite_manifest, rewrite_paths = sys.argv[1:] - -sessions_by_checkpoint = {} -with open(raw_sessions_file, "r", encoding="utf-8") as f: - for line in f: - line = line.rstrip("\n") - if not line: - continue - checkpoint_id, checkpoint_path, session_index = line.split("\t") - del checkpoint_id - sessions_by_checkpoint.setdefault(checkpoint_path, set()).add(session_index) - -records = [] -with open(main_metadata_file, "r", encoding="utf-8") as f: - for line in f: - line = line.rstrip("\n") - if not line: - continue - fields = line.split("\t") - if len(fields) < 6: - continue - checkpoint_id, checkpoint_path, session_index, kind, target_path, object_info = fields[:6] - del 
checkpoint_id, session_index - if kind == "checkpoint_metadata" and checkpoint_path in sessions_by_checkpoint: - object_parts = object_info.split() - if len(object_parts) != 3 or object_parts[1] != "blob": - sys.stderr.write(f"unexpected ls-tree metadata for {target_path}: {object_info}\n") - sys.exit(1) - records.append((object_parts[2], target_path, checkpoint_path)) - -if not records: - open(rewrite_manifest, "w", encoding="utf-8").close() - open(rewrite_paths, "w", encoding="utf-8").close() - sys.exit(0) - -batch_input = "".join(blob + "\n" for blob, _, _ in records).encode("ascii") -batch = subprocess.run( - ["git", "cat-file", "--batch"], - input=batch_input, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=False, -) -if batch.returncode != 0: - sys.stderr.write(batch.stderr.decode("utf-8", errors="replace")) - sys.exit(batch.returncode) - -out = batch.stdout -offset = 0 - -with open(rewrite_manifest, "w", encoding="utf-8") as manifest, open(rewrite_paths, "w", encoding="utf-8") as paths: - for blob, target_path, checkpoint_path in records: - header_end = out.find(b"\n", offset) - if header_end < 0: - sys.stderr.write(f"missing git cat-file header for {blob}\n") - sys.exit(1) - header = out[offset:header_end].decode("ascii", errors="replace") - offset = header_end + 1 - parts = header.split() - if len(parts) < 3 or parts[1] != "blob": - sys.stderr.write(f"unexpected git cat-file header for {blob}: {header}\n") - sys.exit(1) - size = int(parts[2]) - data = out[offset:offset + size] - offset += size - if offset >= len(out) or out[offset:offset + 1] != b"\n": - sys.stderr.write(f"missing git cat-file record separator for {blob}\n") - sys.exit(1) - offset += 1 - - metadata = json.loads(data.decode("utf-8")) - session_entries = [] - for session_index in sorted(sessions_by_checkpoint[checkpoint_path], key=lambda value: int(value)): - session_prefix = f"/{checkpoint_path}/{session_index}" - session_entries.append({ - "metadata": 
f"{session_prefix}/metadata.json", - "transcript": f"{session_prefix}/full.jsonl", - "content_hash": f"{session_prefix}/content_hash.txt", - "prompt": f"{session_prefix}/prompt.txt", - }) - metadata["sessions"] = session_entries - - rewritten_path = os.path.join(rewrite_dir, checkpoint_path, "metadata.json") - os.makedirs(os.path.dirname(rewritten_path), exist_ok=True) - with open(rewritten_path, "w", encoding="utf-8") as rewritten: - json.dump(metadata, rewritten, indent=2) - rewritten.write("\n") - - manifest.write(f"{rewritten_path}\t{target_path}\n") - paths.write(rewritten_path + "\n") -PY - - if [[ ! -s "$rewrite_paths" ]]; then - return - fi - - git hash-object -w --stdin-paths < "$rewrite_paths" > "$rewrite_hashes" - awk -F '\t' ' - NR == FNR { - target[FNR] = $2 - next - } - { - print "100644 blob " $1 "\t" target[FNR] - } - ' "$rewrite_manifest" "$rewrite_hashes" >> "$plan_entries_file" -} - -compute_plan_counts() { - planned_checkpoints=$(wc -l < "$raw_checkpoint_ids_file" | tr -d '[:space:]') - planned_sessions=$(wc -l < "$raw_sessions_file" | tr -d '[:space:]') - planned_raw_transcripts=$(awk -F '\t' '$5 == "raw_transcript" { count++ } END { print count + 0 }' "$full_artifacts_file") - missing_raw_checkpoints=$(awk -v raw_file="$raw_checkpoint_ids_file" ' - BEGIN { - while ((getline line < raw_file) > 0) { - if (line != "") { - raw[line] = 1 - } - } - close(raw_file) - } - NF && !($1 in raw) { - count++ - } - END { - print count + 0 - } - ' "$checkpoint_ids_file") - missing_metadata_checkpoints=$(awk -F '\t' -v metadata_file="$main_metadata_file" ' - BEGIN { - while ((getline line < metadata_file) > 0) { - split(line, fields, "\t") - if (fields[4] == "checkpoint_metadata") { - have[fields[1]] = 1 - } - } - close(metadata_file) - } - NF && !($1 in have) { - count++ - } - END { - print count + 0 - } - ' "$raw_checkpoint_ids_file") - missing_metadata_sessions=$(awk -F '\t' -v metadata_file="$main_metadata_file" ' - BEGIN { - while ((getline line < 
metadata_file) > 0) { - split(line, fields, "\t") - if (fields[4] == "session_metadata") { - have[fields[1] "\t" fields[3]] = 1 - } - } - close(metadata_file) - } - NF { - key = $1 "\t" $3 - if (!(key in have)) { - count++ - } - } - END { - print count + 0 - } - ' "$raw_sessions_file") -} - -while [[ $# -gt 0 ]]; do - case "$1" in - -h|--help) - show_help - exit 0 - ;; - --list) - list_mode=true - shift - ;; - --dry-run) - dry_run=true - shift - ;; - --apply) - apply=true - shift - ;; - --since) - [[ $# -ge 2 ]] || die "--since requires a commit" - since_commit="$2" - shift 2 - ;; - --repo) - [[ $# -ge 2 ]] || die "--repo requires a path" - repo_path="$2" - shift 2 - ;; - --head) - [[ $# -ge 2 ]] || die "--head requires a commit" - head_commitish="$2" - shift 2 - ;; - -*) - die "unknown option: $1" - ;; - *) - [[ -z "$since_commit" ]] || die "too many commit arguments" - since_commit="$1" - shift - ;; - esac -done - -mode_count=0 -[[ "$list_mode" == "true" ]] && mode_count=$((mode_count + 1)) -[[ "$dry_run" == "true" ]] && mode_count=$((mode_count + 1)) -[[ "$apply" == "true" ]] && mode_count=$((mode_count + 1)) -if (( mode_count > 1 )); then - die "--list, --dry-run, and --apply are mutually exclusive" -fi - -tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/v2-to-v1.XXXXXX") -plan_entries_file="$tmp_dir/plan_entries" -checkpoint_commits_file="$tmp_dir/checkpoint_commits" -checkpoint_ids_file="$tmp_dir/checkpoint_ids" -checkpoint_paths_file="$tmp_dir/checkpoint_paths" -full_artifacts_file="$tmp_dir/full_artifacts" -raw_sessions_file="$tmp_dir/raw_sessions" -raw_checkpoint_ids_file="$tmp_dir/raw_checkpoint_ids" -main_metadata_file="$tmp_dir/main_metadata" -: > "$plan_entries_file" - -if [[ -z "$repo_path" ]]; then - printf 'Local repo path: ' >&2 - IFS= read -r repo_path -fi - -[[ -n "$repo_path" ]] || die "a local repo path is required" -[[ -d "$repo_path" ]] || die "repo path does not exist or is not a directory: $repo_path" - -if ! 
repo_root=$(git -C "$repo_path" rev-parse --show-toplevel 2>/dev/null); then - die "not inside a git repository: $repo_path" -fi -cd "$repo_root" - -if [[ -z "$since_commit" ]]; then - printf 'Show v2 checkpoints newer than commit: ' >&2 - IFS= read -r since_commit -fi - -[[ -n "$since_commit" ]] || die "a base commit is required" - -if ! since_hash=$(git rev-parse --verify --quiet "${since_commit}^{commit}"); then - die "commit not found: $since_commit" -fi - -head_hash="" -if [[ -n "$head_commitish" ]]; then - if ! head_hash=$(git rev-parse --verify --quiet "${head_commitish}^{commit}"); then - die "history tip not found: $head_commitish" - fi - git merge-base --is-ancestor "$since_hash" "$head_hash" 2>/dev/null || - die "$since_commit is not an ancestor of $head_commitish" -fi - -if [[ -n "$head_hash" ]]; then - write_checkpoint_commit_index_between "$since_hash" "$head_hash" "$checkpoint_commits_file" -else - write_checkpoint_commit_index_from_all_refs "$since_hash" "$checkpoint_commits_file" -fi -if [[ ! -s "$checkpoint_commits_file" ]]; then - if [[ -n "$head_hash" ]]; then - printf 'No %s trailers found in %s..%s\n' "$TRAILER_KEY" "$since_hash" "$head_hash" - else - printf 'No %s trailers found on local branches/remotes containing %s\n' "$TRAILER_KEY" "$since_hash" - fi - exit 0 -fi -write_checkpoint_id_files - -if [[ "$list_mode" == "true" ]]; then - checkpoint_count=$(wc -l < "$checkpoint_ids_file" | tr -d '[:space:]') - printf 'Checkpoints: %s\n' "$checkpoint_count" - printf 'checkpoint_id\tcommit_ids\n' - awk -F '\t' ' - NF >= 2 { - if (!seen_checkpoint[$1]++) { - order[++n] = $1 - } - key = $1 SUBSEP $2 - if (!seen_pair[key]++) { - commits[$1] = commits[$1] == "" ? 
$2 : commits[$1] " " $2 - } - } - END { - for (i = 1; i <= n; i++) { - print order[i] "\t" commits[order[i]] - } - } - ' "$checkpoint_commits_file" - exit 0 -fi - -main_ref_available=false -if git show-ref --verify --quiet "$V2_MAIN_REF"; then - main_ref_available=true -else - warn "missing $V2_MAIN_REF; companion metadata paths will not be shown" -fi - -full_refs=$(list_full_refs) -[[ -n "$full_refs" ]] || die "missing refs under $V2_FULL_REF_PREFIX; cannot locate raw transcripts" - -printf 'Repository: %s\n' "$repo_root" -if [[ -n "$head_hash" ]]; then - printf 'Scanning commits: %s..%s\n' "$since_hash" "$head_hash" -else - printf 'Scanning commits: local branches/remotes containing %s\n' "$since_hash" -fi -if [[ "$main_ref_available" == "true" ]]; then - printf 'Companion metadata ref: %s\n' "$V2_MAIN_REF" -fi -printf 'Full refs:\n' -printf '%s\n' "$full_refs" | sed 's/^/ /' -printf '\n' - -write_full_artifact_index -write_main_metadata_index -append_checkpoint_metadata_plan_entries -compute_plan_counts - -if [[ "$dry_run" != "true" ]]; then - if (( missing_raw_checkpoints > 0 )); then - warn "$missing_raw_checkpoints checkpoint trailer(s) do not have raw_transcript artifacts and will be skipped" - fi - if (( missing_metadata_checkpoints > 0 )); then - warn "$missing_metadata_checkpoints checkpoint(s) with raw transcripts are missing companion checkpoint metadata" - fi - if (( missing_metadata_sessions > 0 )); then - warn "$missing_metadata_sessions session(s) with raw transcripts are missing companion session metadata" - fi -fi - -if [[ "$dry_run" == "true" ]]; then - while IFS= read -r checkpoint_id; do - [[ -n "$checkpoint_id" ]] || continue - - checkpoint_path=$(checkpoint_to_path "$checkpoint_id") - - checkpoint_output="" - - if ! awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id { found = 1; exit } END { exit found ? 
0 : 1 }' "$full_artifacts_file"; then - warn "no raw_transcript artifacts found for checkpoint $checkpoint_id" - continue - fi - checkpoint_output=$(awk -F '\t' -v checkpoint_id="$checkpoint_id" ' - $1 == checkpoint_id { - full_checkpoint_key = $4 SUBSEP $2 - if (!seen_full_checkpoint[full_checkpoint_key]++) { - print " full checkpoint folder: " $4 ":" $2 - } - session_key = $4 SUBSEP $2 SUBSEP $3 - if (!seen_session[session_key]++) { - print " full session folder: " $4 ":" $2 "/" $3 - } - print " raw artifact: " $4 ":" $6 - } - ' "$full_artifacts_file") - - if [[ "$main_ref_available" == "true" ]] && awk -F '\t' -v checkpoint_id="$checkpoint_id" '$1 == checkpoint_id && $4 == "checkpoint_metadata" { found = 1; exit } END { exit found ? 0 : 1 }' "$main_metadata_file"; then - metadata_output=$(awk -F '\t' -v checkpoint_id="$checkpoint_id" -v main_ref="$V2_MAIN_REF" ' - $1 == checkpoint_id { - if (!printed_folder++) { - print " companion metadata folder: " main_ref ":" $2 - } - if ($4 == "checkpoint_metadata") { - print " checkpoint metadata: " main_ref ":" $5 - } else if ($4 == "session_metadata") { - print " session metadata: " main_ref ":" $5 - } else if ($4 == "prompt") { - print " prompt: " main_ref ":" $5 - } - } - ' "$main_metadata_file") - checkpoint_output="${checkpoint_output}"$'\n'"${metadata_output}" - elif [[ "$main_ref_available" == "true" ]]; then - warn "checkpoint $checkpoint_id has raw transcript artifacts but no companion metadata on $V2_MAIN_REF" - fi - - printf 'checkpoint %s\n' "$checkpoint_id" - printf '%s' "$checkpoint_output" - printf '\n' - done < "$checkpoint_ids_file" -fi - -planned_entries=$(wc -l < "$plan_entries_file" | tr -d '[:space:]') -unique_planned_entries=$(write_unique_mktree_input "$plan_entries_file" | wc -l | tr -d '[:space:]') - -if [[ "$dry_run" != "true" ]]; then - printf 'Migration plan:\n' - printf ' target ref: %s\n' "$V1_REF" - printf ' checkpoints with raw transcripts: %s\n' "$planned_checkpoints" - printf ' sessions 
with raw transcripts: %s\n' "$planned_sessions" - printf ' raw transcript base files: %s\n' "$planned_raw_transcripts" - printf ' planned v1 tree entries: %s (%s unique target paths)\n' "$planned_entries" "$unique_planned_entries" - printf ' missing raw-transcript checkpoints: %s\n' "$missing_raw_checkpoints" - printf ' missing companion checkpoint metadata: %s\n' "$missing_metadata_checkpoints" - printf ' missing companion session metadata: %s\n' "$missing_metadata_sessions" - printf '\n' -fi - -if [[ "$apply" == "true" ]]; then - if [[ "$unique_planned_entries" == "0" ]]; then - die "nothing to migrate: no v1 tree entries were planned" - fi - - old_ref_hash="" - base_tree_hash="" - parent_hash="" - bulk_migration_hash="" - rewriting_bulk_migration=false - if git show-ref --verify --quiet "$V1_REF"; then - old_ref_hash=$(git rev-parse "$V1_REF^{commit}") - parent_hash="$old_ref_hash" - base_tree_hash=$(git rev-parse "$old_ref_hash^{tree}") - - bulk_migration_hash=$(find_bulk_migration_ancestor "$old_ref_hash") - if [[ -n "$bulk_migration_hash" ]]; then - rewriting_bulk_migration=true - if parent_hash=$(git rev-parse --verify --quiet "${bulk_migration_hash}^"); then - base_tree_hash=$(git rev-parse "$parent_hash^{tree}") - else - parent_hash="" - base_tree_hash="" - fi - fi - fi - - source_plan_entries_file="$plan_entries_file" - apply_checkpoint_ids_source_file="$tmp_dir/apply_checkpoint_ids_source" - if [[ "$rewriting_bulk_migration" == "true" ]]; then - source_plan_entries_file="$tmp_dir/bulk_plan_entries" - write_bulk_migration_entries "$old_ref_hash" "$parent_hash" "$source_plan_entries_file" "$apply_checkpoint_ids_source_file" - else - write_apply_checkpoint_ids "$apply_checkpoint_ids_source_file" - fi - - checkpoint_dates_file="$tmp_dir/checkpoint_dates" - apply_checkpoint_ids_file="$tmp_dir/apply_checkpoint_ids" - write_checkpoint_commit_dates "$source_plan_entries_file" "$checkpoint_dates_file" - order_checkpoint_ids_by_date 
"$apply_checkpoint_ids_source_file" "$checkpoint_dates_file" "$apply_checkpoint_ids_file" - - commit_count=0 - current_tree_hash="$base_tree_hash" - final_commit_hash="$parent_hash" - while IFS= read -r checkpoint_id; do - [[ -n "$checkpoint_id" ]] || continue - checkpoint_entries_file="$tmp_dir/checkpoint-${checkpoint_id}.entries" - write_checkpoint_entries "$checkpoint_id" "$checkpoint_entries_file" "$source_plan_entries_file" - [[ -s "$checkpoint_entries_file" ]] || continue - - new_tree_hash=$(build_v1_tree_from_entries "$current_tree_hash" "$checkpoint_entries_file") - if [[ -n "$current_tree_hash" && "$new_tree_hash" == "$current_tree_hash" ]]; then - continue - fi - - commit_date=$(checkpoint_commit_date "$checkpoint_id" "$checkpoint_dates_file") - final_commit_hash=$(create_v1_checkpoint_migration_commit "$checkpoint_id" "$new_tree_hash" "$final_commit_hash" "$commit_date") - current_tree_hash="$new_tree_hash" - commit_count=$((commit_count + 1)) - done < "$apply_checkpoint_ids_file" - - if (( commit_count == 0 )); then - printf '%s is already up to date; no migration commit created.\n' "$V1_REF" - exit 0 - fi - - if [[ -n "$old_ref_hash" ]]; then - git update-ref "$V1_REF" "$final_commit_hash" "$old_ref_hash" - else - git update-ref "$V1_REF" "$final_commit_hash" - fi - - if [[ "$rewriting_bulk_migration" == "true" ]]; then - printf 'Rewrote previous bulk migration into %s per-checkpoint commit(s).\n' "$commit_count" - else - printf 'Wrote %s per-checkpoint migration commit(s).\n' "$commit_count" - fi - printf 'Latest migration commit: %s\n' "$final_commit_hash" - printf 'Updated %s\n' "$V1_REF" - exit 0 -fi - -if [[ "$dry_run" != "true" ]]; then - printf 'Plan only: no refs were written. 
Use --dry-run to print every source and target artifact, or --apply to write migration commits.\n' -fi From f5ef3cab98abe5087b17f8834d8229ecc6cb2036 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 11:34:34 -0700 Subject: [PATCH 16/35] Simplify migrate-v2-checkpoints and checkpoint test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Open the migration tool's repository via gitrepo.OpenPath so shared clones with object alternates resolve correctly instead of silently downgrading to PlainOpen. Inline single-call forwarders in migration.go and drop the redundant second hasRequiredV2Metadata check — both v2 session reads parse the same metadata.json. Inline stringly-typed flag-name constants in the migrate tests, and collapse refHeadCommit into its sole caller. Entire-Checkpoint: 1a30a6f9fdcd --- .../checkpoint/committed_commit_time_test.go | 8 +-- cmd/migrate-v2-checkpoints/main.go | 13 ++++- cmd/migrate-v2-checkpoints/main_test.go | 56 +++++++++---------- cmd/migrate-v2-checkpoints/migration.go | 24 +------- 4 files changed, 39 insertions(+), 62 deletions(-) diff --git a/cmd/entire/cli/checkpoint/committed_commit_time_test.go b/cmd/entire/cli/checkpoint/committed_commit_time_test.go index a2717afe1..91c5546c2 100644 --- a/cmd/entire/cli/checkpoint/committed_commit_time_test.go +++ b/cmd/entire/cli/checkpoint/committed_commit_time_test.go @@ -90,13 +90,7 @@ func setupCommittedCommitTimeRepo(t *testing.T) (*git.Repository, *GitStore) { func metadataHeadCommit(t *testing.T, repo *git.Repository) *object.Commit { t.Helper() - return refHeadCommit(t, repo, plumbing.NewBranchReferenceName(paths.MetadataBranchName)) -} - -func refHeadCommit(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) *object.Commit { - t.Helper() - - ref, err := repo.Reference(refName, true) + ref, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) require.NoError(t, err) commit, err 
:= repo.CommitObject(ref.Hash()) diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index d785146a4..d424a188c 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -7,6 +7,7 @@ import ( "io" "os" + "github.com/entireio/cli/cmd/entire/cli/gitrepo" "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/cmd/entire/cli/settings" @@ -164,13 +165,21 @@ func openRepository(ctx context.Context, repoPath string) (string, *git.Reposito repoPath = root } - repo, err := git.PlainOpenWithOptions(repoPath, &git.PlainOpenOptions{DetectDotGit: true}) + // DetectDotGit walks up from a subdir to find the worktree root; then + // re-open via gitrepo.OpenPath so shared clones with object alternates + // resolve correctly. + detector, err := git.PlainOpenWithOptions(repoPath, &git.PlainOpenOptions{DetectDotGit: true}) if err != nil { return "", nil, fmt.Errorf("open repository %q: %w", repoPath, err) } repoRoot := repoPath - if worktree, worktreeErr := repo.Worktree(); worktreeErr == nil { + if worktree, worktreeErr := detector.Worktree(); worktreeErr == nil { repoRoot = worktree.Filesystem().Root() } + + repo, err := gitrepo.OpenPath(repoRoot) + if err != nil { + return "", nil, fmt.Errorf("open repository %q: %w", repoRoot, err) + } return repoRoot, repo, nil } diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index c32bff76d..5f633d9ad 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -33,12 +33,6 @@ const ( unrelatedCheckpointID = "444444444444" testSinceRevision = "abc123" testHeadRevision = "HEAD" - testRepoFlag = "--repo" - testSinceFlag = "--since" - testHeadFlag = "--head" - testListFlag = "--list" - testDryRunFlag = "--dry-run" - testApplyFlag = "--apply" testRepoPath = "/tmp/repo" testBaseFilename = "base.txt" testMainFilename = "main.txt" @@ -56,10 +50,10 @@ func TestParseOptions(t 
*testing.T) { t.Parallel() opts, err := parseOptions([]string{ - testRepoFlag, testRepoPath, - testSinceFlag, testSinceRevision, - testHeadFlag, testHeadRevision, - testListFlag, + "--repo", testRepoPath, + "--since", testSinceRevision, + "--head", testHeadRevision, + "--list", }) require.NoError(t, err) require.Equal(t, testRepoPath, opts.repoPath) @@ -67,15 +61,15 @@ func TestParseOptions(t *testing.T) { require.Equal(t, testHeadRevision, opts.head) require.Equal(t, modeList, opts.mode) - opts, err = parseOptions([]string{testDryRunFlag, testSinceRevision}) + opts, err = parseOptions([]string{"--dry-run", testSinceRevision}) require.NoError(t, err) require.Equal(t, testSinceRevision, opts.since) require.Equal(t, modeDryRun, opts.mode) - _, err = parseOptions([]string{testSinceFlag, testSinceRevision, "def456"}) + _, err = parseOptions([]string{"--since", testSinceRevision, "def456"}) require.ErrorContains(t, err, "use either --since or positional since commit") - _, err = parseOptions([]string{testListFlag, testApplyFlag}) + _, err = parseOptions([]string{"--list", "--apply"}) require.ErrorContains(t, err, "use only one") } @@ -207,10 +201,10 @@ func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { var stdout bytes.Buffer err := run(context.Background(), []string{ - testRepoFlag, subdir, - testSinceFlag, fixture.baseHash.String(), - testHeadFlag, fixture.mainHash.String(), - testListFlag, + "--repo", subdir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--list", }, &stdout) require.NoError(t, err) @@ -250,10 +244,10 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { var stdout bytes.Buffer err := run(context.Background(), []string{ - testRepoFlag, fixture.dir, - testSinceFlag, fixture.baseHash.String(), - testHeadFlag, fixture.mainHash.String(), - testApplyFlag, + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", }, &stdout) require.NoError(t, 
err) require.Contains(t, stdout.String(), "migrated checkpoints: 1") @@ -301,7 +295,7 @@ func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"dry run\"}\n")), }) - stdout := runMigrationCommand(t, fixture, fixture.mainHash, testDryRunFlag) + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--dry-run") require.Contains(t, stdout, "Migration plan:") require.Contains(t, stdout, "checkpoints eligible for migration: 1") require.Contains(t, stdout, "sessions eligible for migration: 1") @@ -340,7 +334,7 @@ func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) }) require.NoError(t, err) - stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") require.Contains(t, stdout, "already present v1 sessions: 1") require.Contains(t, stdout, "migrated checkpoints: 1") require.Contains(t, stdout, "migrated sessions: 1") @@ -371,7 +365,7 @@ func TestRunApplyMigratesTaskMetadata(t *testing.T) { Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"task\"}\n")), }) - stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") require.Contains(t, stdout, "migrated sessions: 1") v1Store := checkpoint.NewGitStore(fixture.repo) @@ -410,7 +404,7 @@ func TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { CompactTranscript: []byte("{\"message\":\"compact review only\"}\n"), }) - stdout := runMigrationCommand(t, fixture, fixture.mainHash, testApplyFlag) + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") require.Contains(t, stdout, "missing raw transcripts: 1") require.Contains(t, stdout, "migrated checkpoints: 1") require.Contains(t, stdout, "migrated sessions: 1") @@ -433,9 +427,9 @@ func TestRunDryRunReportsMissingV2MetadataAndRawTranscripts(t *testing.T) { var stdout 
bytes.Buffer err := run(context.Background(), []string{ - testRepoFlag, fixture.dir, - testSinceFlag, fixture.baseHash.String(), - testDryRunFlag, + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--dry-run", }, &stdout) require.NoError(t, err) @@ -513,9 +507,9 @@ func runMigrationCommand(t *testing.T, fixture migrationHistoryFixture, head plu t.Helper() args := []string{ - testRepoFlag, fixture.dir, - testSinceFlag, fixture.baseHash.String(), - testHeadFlag, head.String(), + "--repo", fixture.dir, + "--since", fixture.baseHash.String(), + "--head", head.String(), mode, } var stdout bytes.Buffer diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 2aee6bd18..f343a607e 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -88,7 +88,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di migratedSessions := 0 for sessionIndex := range summary.Sessions { - metadataContent, err := m.readV2SessionMetadata(ctx, discovered, sessionIndex) + metadataContent, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) if err != nil { if errors.Is(err, checkpoint.ErrCheckpointNotFound) { m.report.MissingV2SessionMetadata++ @@ -105,7 +105,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di continue } - content, err := m.readV2SessionContent(ctx, discovered, sessionIndex) + content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) if err != nil { if errors.Is(err, checkpoint.ErrNoTranscript) { m.report.MissingRawTranscripts++ @@ -113,10 +113,6 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di } return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } - if !hasRequiredV2Metadata(content) { - m.report.MissingV2SessionMetadata++ - continue - } m.report.EligibleSessions++ if 
m.opts.apply { @@ -149,22 +145,6 @@ func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered return existing, nil } -func (m checkpointMigrator) readV2SessionMetadata(ctx context.Context, discovered discoveredCheckpoint, sessionIndex int) (*checkpoint.SessionContent, error) { - content, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) - if err != nil { - return nil, fmt.Errorf("read v2 session metadata and prompts: %w", err) - } - return content, nil -} - -func (m checkpointMigrator) readV2SessionContent(ctx context.Context, discovered discoveredCheckpoint, sessionIndex int) (*checkpoint.SessionContent, error) { - content, err := m.v2Store.ReadSessionContent(ctx, discovered.ID, sessionIndex) - if err != nil { - return nil, fmt.Errorf("read full v2 session content: %w", err) - } - return content, nil -} - func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" } From 9be58398280336601d99babc7ba1dabe49c60b21 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 13:20:18 -0700 Subject: [PATCH 17/35] Improve v2 checkpoint migration reporting Entire-Checkpoint: b01e75e8d461 --- cmd/entire/cli/checkpoint/v2_store.go | 2 +- cmd/entire/cli/checkpoint/v2_store_test.go | 52 ++++++++++++++++++++ cmd/migrate-v2-checkpoints/history.go | 6 +++ cmd/migrate-v2-checkpoints/main_test.go | 5 ++ cmd/migrate-v2-checkpoints/migration.go | 56 ++++++++++++++++++---- 5 files changed, 112 insertions(+), 9 deletions(-) diff --git a/cmd/entire/cli/checkpoint/v2_store.go b/cmd/entire/cli/checkpoint/v2_store.go index 7d44bf933..0c64581bd 100644 --- a/cmd/entire/cli/checkpoint/v2_store.go +++ b/cmd/entire/cli/checkpoint/v2_store.go @@ -73,7 +73,7 @@ func (s *V2GitStore) GetRefState(refName plumbing.ReferenceName) (parentHash, tr if cliErr != nil { return plumbing.ZeroHash, plumbing.ZeroHash, fmt.Errorf("failed to get commit for 
ref %s: %w", refName, errors.Join(err, cliErr)) } - logging.Warn(context.Background(), "GetRefState: go-git commit read failed, used git rev-parse fallback", + logging.Debug(context.Background(), "GetRefState: go-git commit read failed, used git rev-parse fallback", slog.String("ref", refName.String()), slog.String("commit", ref.Hash().String()[:12]), slog.String("gogit_error", err.Error()), diff --git a/cmd/entire/cli/checkpoint/v2_store_test.go b/cmd/entire/cli/checkpoint/v2_store_test.go index b07983dc4..d964e8fe9 100644 --- a/cmd/entire/cli/checkpoint/v2_store_test.go +++ b/cmd/entire/cli/checkpoint/v2_store_test.go @@ -5,9 +5,13 @@ import ( "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/entireio/cli/redact" "github.com/stretchr/testify/require" + "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/storage" ) func TestNewV2GitStore(t *testing.T) { @@ -53,3 +57,51 @@ func TestV2GitStore_GetRefState_ErrorsOnMissingRef(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "ref refs/entire/checkpoints/v2/main not found") } + +func TestV2GitStore_GetRefState_FallsBackToGitCLIWhenCommitObjectMissing(t *testing.T) { + dir := t.TempDir() + testutil.InitRepo(t, dir) + testutil.WriteFile(t, dir, "README.md", "init") + testutil.GitAdd(t, dir, "README.md") + testutil.GitCommit(t, dir, "initial") + t.Chdir(dir) + + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + writeV2TestCheckpoint(t, repo, v2TestCheckpointOptions{ + CheckpointID: id.MustCheckpointID("b1b2b3b4b5b6"), + SessionID: "session-fallback", + Strategy: "manual-commit", + Transcript: redact.AlreadyRedacted([]byte("transcript\n")), + }) + + refName := plumbing.ReferenceName(paths.V2FullCurrentRefName) + ref, err := repo.Reference(refName, true) + require.NoError(t, err) + commit, err := repo.CommitObject(ref.Hash()) + 
require.NoError(t, err) + + store := NewV2GitStore(&git.Repository{ + Storer: commitObjectMissingStorer{ + Storer: repo.Storer, + missing: ref.Hash(), + }, + }) + parentHash, treeHash, err := store.GetRefState(refName) + require.NoError(t, err) + require.Equal(t, ref.Hash(), parentHash) + require.Equal(t, commit.TreeHash, treeHash) +} + +type commitObjectMissingStorer struct { + storage.Storer + + missing plumbing.Hash +} + +func (s commitObjectMissingStorer) EncodedObject(objectType plumbing.ObjectType, hash plumbing.Hash) (plumbing.EncodedObject, error) { + if hash == s.missing && objectType == plumbing.CommitObject { + return nil, plumbing.ErrObjectNotFound + } + return s.Storer.EncodedObject(objectType, hash) +} diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go index e053aa2a5..7710f465d 100644 --- a/cmd/migrate-v2-checkpoints/history.go +++ b/cmd/migrate-v2-checkpoints/history.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "io" "sort" @@ -15,6 +16,7 @@ import ( "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/plumbing/storer" ) type discoveryOptions struct { @@ -298,9 +300,13 @@ func commitReachableFrom(ctx context.Context, repo *git.Repository, from, target } if commit.Hash == target { found = true + return storer.ErrStop } return nil }) + if errors.Is(err, storer.ErrStop) { + return true, nil + } if err != nil { return false, fmt.Errorf("iterate commits from %s: %w", from, err) } diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 5f633d9ad..2917b88aa 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -1,3 +1,4 @@ +//nolint:goconst // Repeated CLI flag literals keep argument-list tests readable. 
package main import ( @@ -299,6 +300,8 @@ func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { require.Contains(t, stdout, "Migration plan:") require.Contains(t, stdout, "checkpoints eligible for migration: 1") require.Contains(t, stdout, "sessions eligible for migration: 1") + require.Contains(t, stdout, "checkpoints to migrate:") + require.Contains(t, stdout, mainCheckpointID+" sessions=1 commits="+shortHash(fixture.mainHash)) summary, err := checkpoint.NewGitStore(fixture.repo).ReadCommitted(context.Background(), cpID) require.NoError(t, err) @@ -338,6 +341,8 @@ func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) require.Contains(t, stdout, "already present v1 sessions: 1") require.Contains(t, stdout, "migrated checkpoints: 1") require.Contains(t, stdout, "migrated sessions: 1") + require.Contains(t, stdout, "migrated checkpoint details:") + require.Contains(t, stdout, mainCheckpointID+" sessions=1 commits="+shortHash(fixture.mainHash)) v1Store := checkpoint.NewGitStore(fixture.repo) summary, err := v1Store.ReadCommitted(context.Background(), cpID) diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index f343a607e..1abb1e7f7 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "strings" "github.com/entireio/cli/cmd/entire/cli/checkpoint" "github.com/entireio/cli/cmd/entire/cli/session" @@ -27,6 +28,13 @@ type migrationReport struct { EligibleSessions int MigratedCheckpoints int MigratedSessions int + Candidates []migrationCandidate +} + +type migrationCandidate struct { + CheckpointID string + SessionCount int + CommitSHAs []string } type checkpointMigrator struct { @@ -52,14 +60,15 @@ func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, dis } for _, discoveredCheckpoint := range discovered { - migratedSessions, err := migrator.migrateCheckpoint(ctx, discoveredCheckpoint) 
+ eligibleSessions, err := migrator.migrateCheckpoint(ctx, discoveredCheckpoint) if err != nil { return report, err } - if migratedSessions == 0 { + if eligibleSessions == 0 { continue } report.EligibleCheckpoints++ + report.Candidates = append(report.Candidates, migrationCandidateFromDiscovered(discoveredCheckpoint, eligibleSessions)) if opts.apply { report.MigratedCheckpoints++ } @@ -67,6 +76,18 @@ func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, dis return report, nil } +func migrationCandidateFromDiscovered(discovered discoveredCheckpoint, sessionCount int) migrationCandidate { + commitSHAs := make([]string, len(discovered.Commits)) + for i, commit := range discovered.Commits { + commitSHAs[i] = commit.ShortSHA + } + return migrationCandidate{ + CheckpointID: discovered.ID.String(), + SessionCount: sessionCount, + CommitSHAs: commitSHAs, + } +} + func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered discoveredCheckpoint) (int, error) { existing, err := m.v1Store.ReadCommitted(ctx, discovered.ID) if err != nil { @@ -86,7 +107,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return 0, nil } - migratedSessions := 0 + eligibleSessions := 0 for sessionIndex := range summary.Sessions { metadataContent, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) if err != nil { @@ -94,7 +115,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di m.report.MissingV2SessionMetadata++ continue } - return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) + return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) } if !hasRequiredV2Metadata(metadataContent) { m.report.MissingV2SessionMetadata++ @@ -111,20 +132,20 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di 
m.report.MissingRawTranscripts++ continue } - return migratedSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } m.report.EligibleSessions++ if m.opts.apply { writeOpts := writeOptionsFromV2Content(content, summary, m.authorName, m.authorEmail) if err := m.v1Store.WriteCommitted(ctx, writeOpts); err != nil { - return migratedSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } m.report.MigratedSessions++ } - migratedSessions++ + eligibleSessions++ } - return migratedSessions, nil + return eligibleSessions, nil } func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { @@ -201,4 +222,23 @@ func writeMigrationReport(w io.Writer, report migrationReport, applied bool) { fmt.Fprintf(w, " migrated checkpoints: %d\n", report.MigratedCheckpoints) fmt.Fprintf(w, " migrated sessions: %d\n", report.MigratedSessions) } + writeMigrationCandidates(w, report.Candidates, applied) +} + +func writeMigrationCandidates(w io.Writer, candidates []migrationCandidate, applied bool) { + if len(candidates) == 0 { + return + } + if applied { + fmt.Fprintln(w, " migrated checkpoint details:") + } else { + fmt.Fprintln(w, " checkpoints to migrate:") + } + for _, candidate := range candidates { + fmt.Fprintf(w, " %s sessions=%d commits=%s\n", + candidate.CheckpointID, + candidate.SessionCount, + strings.Join(candidate.CommitSHAs, ","), + ) + } } From ae195157963790edce4f94c60ee0aee0683b6141 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 13:36:32 -0700 Subject: [PATCH 18/35] Address review feedback Entire-Checkpoint: a458a0437dc3 --- .gitignore | 1 + 
cmd/migrate-v2-checkpoints/VALIDATION.md | 846 ++++++++++++++++++ cmd/migrate-v2-checkpoints/history.go | 8 +- cmd/migrate-v2-checkpoints/main.go | 1 + cmd/migrate-v2-checkpoints/v2_fixture_test.go | 4 +- mise-tasks/build | 2 + 6 files changed, 859 insertions(+), 3 deletions(-) create mode 100644 cmd/migrate-v2-checkpoints/VALIDATION.md diff --git a/.gitignore b/.gitignore index ea6cf36be..171a1f601 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ mise.local.toml # Binary output (only in root) /entire +/migrate-v2-checkpoints /vogon /testreport /bin diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md new file mode 100644 index 000000000..e57a4ba45 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -0,0 +1,846 @@ +# Validating `migrate-v2-checkpoints` + +Reusable runbook for verifying that `migrate-v2-checkpoints` (read-only or applied) +identifies the correct checkpoints, attributes the correct sessions, and — once +applied — writes complete, hash-consistent data to the v1 branch. + +Tested against the `tmp-migrate-v2-script-go` branch of the CLI at +`~/entire/cli/.worktrees/review`. The binary lives there as +`migrate-v2-checkpoints`. + +> Background: the project is rolling **back** checkpoints v2. v2 stores live +> under `refs/entire/checkpoints/v2/*` and are no longer being written. The +> v1 branch `entire/checkpoints/v1` is the surviving format. This tool reads +> v2 metadata + raw transcripts and replays them as v1 writes via +> `checkpoint.GitStore.WriteCommitted`. + +> ⛔ **DO NOT push `entire/checkpoints/v1` to any remote at any point while +> following this runbook.** The migration writes new commits to the local +> v1 branch and nothing else. Publishing those commits is a separate, +> manual decision the operator makes **only after** §5 validation has +> fully passed and they are satisfied with the result. 
Pushing early +> propagates any bad migration to every consumer (other clones, +> `checkpoint_remote`, the API) and makes rollback significantly more +> expensive than a local `update-ref`. If you are not sure whether you +> are about to push, you are not ready to push. + +## 1. What the tool does + +### 1.1 Discovery (`cmd/migrate-v2-checkpoints/history.go`) + +- Walks every history tip (branches under `refs/heads/*` and `refs/remotes/*/*`, + excluding `entire/checkpoints/v1` and `entire/trails/v1`). +- For each commit on those tips, parses `Entire-Checkpoint: <id>` trailers + (`trailers.ParseAllCheckpoints`, key constant + `trailers/trailers.go:41`). One commit can carry many trailers (squash + merges). +- Produces a list of `discoveredCheckpoint{ID, Commits}` — every checkpoint ID + ever referenced in commit history, plus the commits that mention it. +- `--since <sha>`/positional commit narrows to commits not reachable from + the named commit. `--head <sha>` restricts to a single tip. +- Discovery is **not** v2-specific. It is a universe of "every checkpoint we + ever ran on a commit reachable from a real ref." + +### 1.2 Migration filter (`cmd/migrate-v2-checkpoints/migration.go`) + +For each discovered checkpoint: + +1. Read v1 summary from `entire/checkpoints/v1`. If present, collect existing + v1 session IDs by reading each session's `metadata.json` (`session_id` + field). +2. Read v2 summary from `refs/entire/checkpoints/v2/main`. If absent or has + no sessions → `missing v2 checkpoint metadata` and skip. +3. For every session index in the v2 summary: + - Read v2 session metadata + prompts from `/main`. Missing or empty + `checkpoint_id` / `session_id` → `missing required v2 session metadata`. + - If that session ID already exists in v1 → `already present v1 sessions`. + - Read v2 raw transcript from `/full/current`, falling back to archived + `/full/<13-digit-suffix>` refs. `ErrNoTranscript` → + `missing raw transcripts`.
+ - Otherwise: count `sessions eligible for migration`, and on `--apply` + write to v1 via `GitStore.WriteCommitted` using v2-sourced fields. + +A checkpoint is **eligible** if at least one v2 session is missing from v1 and +fully readable from v2. The candidate's `sessions=N` is that net count, not +the v2 session count. + +### 1.3 What ends up on v1 after `--apply` + +For each migrated session, the v1 tree at `<id[0:2]>/<id[2:]>/<index>/` gains: + +| file | source | constant in `paths/paths.go` | +|--------------------|------------------------------|-------------------------------------| +| `metadata.json` | v2 session `metadata.json` | `MetadataFileName` (line 36) | +| `prompt.txt` | v2 session prompts (joined) | `PromptFileName` (line 29) | +| `full.jsonl[.NNN]` | reassembled v2 `raw_transcript[.NNN]` | `TranscriptFileName` (line 30) | +| `content_hash.txt` | `sha256:<hex>` of v1 bytes | `ContentHashFileName` (line 38) | + +Plus the root `<id[0:2]>/<id[2:]>/metadata.json` gets rewritten to add the new +session to `sessions[]` and recompute aggregate fields (see §3.2). + +`<index>` is the v1 slot. New sessions append (`findSessionIndex` in +`committed.go:326`); if v1 already had session 0 and v2 contributes one new +session, it lands in v1 slot 1. v1 indices and v2 indices for the **same** +checkpoint can differ; only `session_id` is invariant across the two stores. + +Chunking note: `full.jsonl` is chunked via `agent.ChunkTranscript`. Chunks are +`full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … (`agent/chunking.go:122` +with `ChunkSuffix = ".%03d"`). Index 0 has no suffix. + +Codex caveat: for sessions whose agent is `codex`, `writeTranscript` applies +`codex.SanitizePortableTranscript` before chunking and hashing +(`committed.go:745-747`). The bytes written to v1 may differ from the bytes +read out of v2's `/full/*`, but they are still self-consistent against the new +v1 `content_hash.txt`. + +## 2.
Run modes & expected report shape + +```text +$ migrate-v2-checkpoints [--repo PATH] [--since SHA | SHA] [--head SHA] \ + (--list | --dry-run | --apply) +``` + +Default mode is `plan` (same output as `--dry-run`). + +`--list` produces one line per checkpoint: +```text +<checkpoint-id> <commit-sha> [<commit-sha> ...] +``` +This is the **universe** discovered in history — NOT the eligible set. + +`--dry-run` / `--apply` produces: +```text +Migration plan: (or "Migration result:" on --apply) + discovered checkpoints: D + already present v1 sessions: A + missing v2 checkpoint metadata: M1 + missing required v2 session metadata: M2 + missing raw transcripts: M3 + checkpoints eligible for migration: EC + sessions eligible for migration: ES + migrated checkpoints: ... (--apply only) + migrated sessions: ... (--apply only) + checkpoints to migrate: (or "migrated checkpoint details:" on --apply) + <checkpoint-id> sessions=N commits=<sha>[,<sha>...] +``` + +Invariants that should always hold on the report: + +- `EC ≤ D`. +- `ES ≥ EC` (each eligible checkpoint contributes ≥ 1 eligible session). +- `ES = Σ candidate.SessionCount`. The candidate list is exhaustive. +- On `--apply`: `migrated checkpoints = EC` and `migrated sessions = ES` if + no write errors. Anything less means a partial write failure — re-run the + tool and the remainder should re-appear as eligible. +- `D = EC + (checkpoints with all v2 sessions in v1) + (checkpoints with + any missing-metadata / missing-transcript failure modes)`. +- Counter sums for skipped sessions: + `A + M2 + M3 = (Σ over all v2 sessions in checkpoints whose v2 summary + exists) − ES`. Useful for spot-checking after `--apply`: if `A` is large + and `EC` is small, most v2 checkpoints are already mirrored. + +## 3. Validation procedure + +The procedure below is the same regardless of repo. Substitute `$REPO` and +`$TOOL` per environment: + +```sh +REPO=/path/to/some-repo # e.g.
~/entire/marvin +TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints +cd "$REPO" +``` + +### 3.1 Pre-flight: confirm both stores exist + +```sh +git -C "$REPO" show-ref entire/checkpoints/v1 +git -C "$REPO" show-ref refs/entire/checkpoints/v2/main +git -C "$REPO" show-ref refs/entire/checkpoints/v2/full/current +git -C "$REPO" for-each-ref 'refs/entire/checkpoints/v2/full/*' \ + --format='%(refname)' +``` + +If `entire/checkpoints/v1` is missing the migration can still apply (it will +be created), but if the v2 refs are missing there is nothing to migrate. + +Also sanity-check the head of v2 isn't surprising — a recent commit means v2 +was being dual-written; a long-stale v2 head matches the rollback narrative: + +```sh +git -C "$REPO" log -1 --format='%h %ci %s' refs/entire/checkpoints/v2/main +``` + +### 3.2 Step A — sanity check the dry-run report + +```sh +"$TOOL" --repo "$REPO" --dry-run | tee /tmp/migrate.plan +``` + +Spot-check the counter math against §2: + +```sh +grep -E "^ (discovered|already|missing|checkpoints eligible|sessions eligible)" \ + /tmp/migrate.plan +``` + +- `EC ≤ D` and `ES ≥ EC`. +- For each candidate line, parse `sessions=N` and sum — must equal `ES`. + +```sh +awk '/^ [0-9a-f]{12} sessions=/ {sub(/sessions=/,"",$2); s+=$2} END {print s}' \ + /tmp/migrate.plan +# Should equal the "sessions eligible for migration" value. +``` + +### 3.3 Step B — confirm every candidate is genuinely v2-only-or-partial + +For every candidate ``: + +```sh +ID=02d9783342a2 # example +SHARD=${ID:0:2}/${ID:2} + +# Does v2 /main carry this checkpoint? +git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, sessions: [.sessions[].metadata]}' + +# Does v1 already carry it? (Either the path doesn't exist, or the session +# IDs differ.) 
+git -C "$REPO" cat-file -p \ + entire/checkpoints/v1:"$SHARD/metadata.json" 2>/dev/null \ + | jq '{checkpoint_id, sessions: [.sessions[].metadata]}' \ + || echo "(absent in v1)" +``` + +The candidate must satisfy at least one of: + +1. `/metadata.json` doesn't exist on `entire/checkpoints/v1` → + **fully v2-only**, all v2 sessions are eligible. +2. It exists on v1, but the v2 summary lists session IDs not present in v1 → + **partial migration** to fill in missing sessions. + +The reverse check — every v2 /main checkpoint should appear in the report +unless it's `already present` / `missing metadata` / `missing transcript`: + +```sh +# Enumerate every checkpoint ID present on v2 /main (sharded layout). +git -C "$REPO" ls-tree -r refs/entire/checkpoints/v2/main \ + | awk '$4 ~ /metadata\.json$/ && $4 !~ /\// {next} \ + $4 ~ /^[0-9a-f]{2}\/[0-9a-f]{10}\/metadata\.json$/ { + split($4, p, "/"); print p[1] p[2] + }' \ + | sort -u > /tmp/v2_ids.txt +wc -l /tmp/v2_ids.txt + +# IDs already in v1 (any session present). +git -C "$REPO" ls-tree -r entire/checkpoints/v1 2>/dev/null \ + | awk '$4 ~ /^[0-9a-f]{2}\/[0-9a-f]{10}\/metadata\.json$/ { \ + split($4, p, "/"); print p[1] p[2] \ + }' \ + | sort -u > /tmp/v1_ids.txt +comm -23 /tmp/v2_ids.txt /tmp/v1_ids.txt > /tmp/v2_only_ids.txt +wc -l /tmp/v2_only_ids.txt +``` + +Every ID in `v2_only_ids.txt` should be either a candidate, or — if v2 has +no session metadata for it / no raw transcript — a contributor to the +`missing v2 checkpoint metadata` / `missing raw transcripts` counters. + +A quick predicate: the eligible candidate count plus the missing-metadata +and missing-raw counters should equal or exceed the v2-only set. If it's +less, something is being silently dropped. 
+ +```sh +EC=$(grep "checkpoints eligible" /tmp/migrate.plan | awk '{print $NF}') +M1=$(grep "missing v2 checkpoint metadata" /tmp/migrate.plan | awk '{print $NF}') +M3=$(grep "missing raw transcripts" /tmp/migrate.plan | awk '{print $NF}') +echo "v2-only on disk: $(wc -l < /tmp/v2_only_ids.txt)" +echo "EC=$EC M1=$M1 M3=$M3 (EC + M1 + M3 must be >= v2-only count)" +``` + +(`>=` rather than `=` because `M1`/`M3` are counted per-checkpoint over the +entire discovered universe, not only the v2-only set.) + +### 3.4 Step C — confirm commit-list accuracy + +The report's `commits=...` are short SHAs of commits in history whose message +carries `Entire-Checkpoint: `. Verify directly: + +```sh +ID=02d9783342a2 +git -C "$REPO" log --all --format='%h %s' --grep "Entire-Checkpoint: $ID" +``` + +The set of short SHAs that this prints should match the report's +`commits=…` for that ID. If they differ: + +- Extra in the report but absent here: the discovery walk picked up a tip + this `--all` view doesn't include (rare). +- Extra here but absent in the report: a tip was filtered out + (`entire/checkpoints/v1`, `entire/trails/v1`, or `HEAD` aliases — the + filter is in `history.go:182-205`). + +A commit may also appear under multiple candidate IDs if it's a squash +merge with multiple trailers; that's expected. + +### 3.5 Step D — DRY-RUN INSPECTION of session count + +For each candidate, the report claims `sessions=N`. Confirm: + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} + +# Sessions advertised by the v2 summary (from /main). +git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq -r '.sessions | length' + +# Session IDs in v2 (read each session's own metadata.json — that field is +# what the migration tool dedupes against, not summary order). 
+V2_SESSION_COUNT=$(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq -r '.sessions | length') +for i in $(seq 0 $((V2_SESSION_COUNT-1))); do + git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/$i/metadata.json" \ + | jq -r '.session_id' +done | sort -u > /tmp/v2_sids.txt + +# Session IDs already in v1 for this checkpoint. +if git -C "$REPO" cat-file -e \ + "entire/checkpoints/v1:$SHARD/metadata.json" 2>/dev/null; then + V1_SESSION_COUNT=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/metadata.json" \ + | jq -r '.sessions | length') + for i in $(seq 0 $((V1_SESSION_COUNT-1))); do + git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$i/metadata.json" \ + | jq -r '.session_id' + done | sort -u > /tmp/v1_sids.txt +else + : > /tmp/v1_sids.txt +fi + +# Expected eligible: v2 minus v1, by session ID. +comm -23 /tmp/v2_sids.txt /tmp/v1_sids.txt | wc -l +# This number must equal the report's "sessions=N" for this checkpoint. +``` + +Repeat for a random sample (5–10) across the candidate list. If your +sample matches 1:1, the report's accounting is trustworthy. + +## 4. Apply the migration + +> ⛔ **No `git push` for `entire/checkpoints/v1` from this point until §5 +> has fully passed and the operator has consciously decided to publish.** +> The migration itself never pushes — but the v1 branch is the same ref +> any other tooling on the repo might push as part of its normal flow. +> Before running `--apply`: +> +> - confirm no automatic push hook, scheduler, or CI job will push +> `refs/heads/entire/checkpoints/v1` in the background; +> - if `entire`'s own push path runs in this repo (e.g. on the next +> `entire`-driven commit), pause it until §5 is done; +> - if the repo has `checkpoint_remote` configured, treat that as another +> push target that must stay quiet. +> +> Pushing before §5 passes means a bad migration is now everyone else's +> problem. 
Pushing after §5 passes is a separate, manual procedure that +> lives outside this runbook. + +**This is the destructive (local) step.** Up to here everything was +read-only. Now we write new commits to the local +`entire/checkpoints/v1` branch. Nothing is pushed to any remote — that's +a separate, explicit decision once the post-apply checks in §5 pass. + +### Preconditions + +- §3 ran clean: the candidate list looks plausible, counter math adds up, + and a spot sample (Steps C and D) confirmed the candidates really are + v2-only / partial migrations. +- The local repo has the v2 refs. If `git -C "$REPO" show-ref + refs/entire/checkpoints/v2/main` is empty, the migration will silently + count everything as "missing v2 checkpoint metadata" and write nothing. + Pre-fetch: + + ```sh + git -C "$REPO" fetch origin \ + 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' \ + --no-tags + ``` + +- Working tree is clean OR you don't mind running with uncommitted changes + in `$REPO`. The tool only touches refs, not the working tree, but a clean + tree makes it easier to roll back if needed. + +### Recommended invocation + +```sh +REPO=/path/to/some-repo +TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints + +# Snapshot the v1 branch tip so you can roll back deterministically. +PRE_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1 2>/dev/null || echo "none") +echo "pre-apply v1 tip: $PRE_APPLY_TIP" + +# Apply. Tee the report into /tmp/migrate.applied — §5 reads it back. +"$TOOL" --repo "$REPO" --apply | tee /tmp/migrate.applied + +# Sanity-check the report. +grep -E "^ (checkpoints eligible|sessions eligible|migrated)" /tmp/migrate.applied +# migrated checkpoints == checkpoints eligible +# migrated sessions == sessions eligible +# Anything less means at least one write failed silently — re-run --apply +# (idempotent) and inspect logs. + +# Confirm the v1 branch actually advanced. 
+POST_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1) +echo "post-apply v1 tip: $POST_APPLY_TIP" +git -C "$REPO" log --format='%h %ci %s' \ + "$PRE_APPLY_TIP".."$POST_APPLY_TIP" 2>/dev/null \ + | head -20 +``` + +### Behavior notes + +- **Idempotent.** Re-running `--apply` after a successful apply yields + `checkpoints eligible for migration: 0` (and re-runs are cheap). Safe + to retry on partial failure. +- **Local only — and stays local for the rest of this runbook.** No + remotes are touched by `--apply` itself. The new v1 commits live on + `refs/heads/entire/checkpoints/v1` locally. **Do not** `git push` this + branch, do not let `entire`'s push path publish it, do not let any + CI/hook/scheduler publish it, and do not let a configured + `checkpoint_remote` mirror it. Push is a separate manual procedure + that is explicitly out of scope here, and is only safe **after** every + step in §5 passes and the operator is satisfied. +- **Per-checkpoint atomicity, not transactional.** Each candidate is + written as its own commit on v1. If `--apply` errors out partway + through, earlier candidates remain written and later ones are + un-written; the next run will pick up the rest. +- **Roll back** by resetting v1 back to `$PRE_APPLY_TIP`: + + ```sh + # Only if you need to undo — this discards the new commits locally. + git -C "$REPO" update-ref refs/heads/entire/checkpoints/v1 "$PRE_APPLY_TIP" + ``` + + Safe before any push. Destructive after push. + +### Operator checkpoint + +**Stop here. Run the apply command yourself and confirm:** + +1. `migrated checkpoints` equals `checkpoints eligible for migration` from + the dry-run. +2. `migrated sessions` equals `sessions eligible for migration` from the + dry-run. +3. `git rev-parse entire/checkpoints/v1` advanced. +4. `/tmp/migrate.applied` contains the full report for §5 to reference. +5. 
**You have NOT pushed `entire/checkpoints/v1`.** Confirm by checking + that no remote tracking ref has advanced: + + ```sh + git -C "$REPO" for-each-ref \ + --format='%(refname) %(objectname:short)' \ + 'refs/remotes/*/entire/checkpoints/v1' + ``` + + Each remote ref should still point at the pre-apply tip (or be + absent). If a remote ref has already moved to the new local tip, + pause and figure out who pushed — do not proceed to §5 until you've + understood the source of the push and decided whether to roll back. + +Then proceed to §5. Do not push between §4 and §5; do not push during +§5; do not push without the operator's explicit go-ahead after §5 +passes. + +## 5. Post-apply validation + +This section assumes `--apply` has been run and `/tmp/migrate.applied` +holds the report. The `migrated sessions=...` count is the population you +will validate below. + +### 5.1 Step E — root `metadata.json` (CheckpointSummary) on v1 + +For each candidate, decode the v1 root metadata and confirm: + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} + +git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json" | jq . 
+``` + +Expected shape (schema lives at +`cmd/entire/cli/checkpoint/checkpoint.go:527-562`): + +```jsonc +{ + "cli_version": "…", // optional + "checkpoint_id": "02d9783342a2", + "strategy": "manual-commit", + "branch": "main", // optional + "checkpoints_count": 1, + "files_touched": ["…"], + "sessions": [ + { + "metadata": "/02/d9783342a2/0/metadata.json", + "transcript": "/02/d9783342a2/0/full.jsonl", // omitempty + "content_hash": "/02/d9783342a2/0/content_hash.txt", // omitempty + "prompt": "/02/d9783342a2/0/prompt.txt" + } + ], + "token_usage": { … }, // omitempty fields + "combined_attribution": { … }, + "has_review": true // omitempty +} +``` + +Field-by-field check against the v2 summary on `/main` for the same ID: + +```sh +diff <(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, strategy, branch, checkpoints_count, + files_touched, combined_attribution, has_review, + token_usage}') \ + <(git -C "$REPO" cat-file -p \ + entire/checkpoints/v1:"$SHARD/metadata.json" \ + | jq '{checkpoint_id, strategy, branch, checkpoints_count, + files_touched, combined_attribution, has_review, + token_usage}') +``` + +Acceptable differences: + +- `sessions[]` entries differ — paths point to v1 file names + (`full.jsonl`, `content_hash.txt`), not v2's compact format. +- If v1 already had sessions, `sessions[]` length on v1 may exceed v2's; + the candidate's contributions are appended. +- `combined_attribution`/`token_usage` may differ if the v1 store + aggregates across all sessions present and v1 already had different + sessions. For purely v2-only checkpoints (the typical case the user + cares about) these should match the v2 summary exactly, since the + migration uses `summary.CombinedAttribution` from v2 verbatim + (`migration.go:199`) and per-session token usage is replayed from v2. + +Hard requirements: + +- `checkpoint_id` equals the directory shard. 
+- `sessions[].metadata`, `sessions[].transcript` (if non-empty), + `sessions[].content_hash` (if non-empty), `sessions[].prompt` all start + with `///` and end with the correct filename constants. + +### 5.2 Step F — per-session `metadata.json` + +For each migrated session, locate it by `session_id` rather than by index: + +```sh +ID=02d9783342a2 +SHARD=${ID:0:2}/${ID:2} +WANT_SID=… # session_id from the v2 side + +V1_SUM=$(git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json") +V1_LEN=$(echo "$V1_SUM" | jq '.sessions | length') +for n in $(seq 0 $((V1_LEN-1))); do + SID=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$n/metadata.json" \ + | jq -r '.session_id') + if [ "$SID" = "$WANT_SID" ]; then + V1_SLOT=$n; break + fi +done +echo "session $WANT_SID lives in v1 slot $V1_SLOT" +``` + +Then diff the per-session metadata, comparing **fields that are expected to +survive migration** (`migration.go:173-205` lists them explicitly): + +```sh +V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) + +diff <(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/$V2_SLOT/metadata.json" \ + | jq '{checkpoint_id, session_id, strategy, branch, + files_touched, checkpoints_count, agent, model, + turn_id, is_task, tool_use_id, + transcript_identifier_at_start, + checkpoint_transcript_start, + token_usage, session_metrics, + initial_attribution, prompt_attributions, + summary, kind, review_skills, review_prompt}') \ + <(git -C "$REPO" cat-file -p \ + entire/checkpoints/v1:"$SHARD/$V1_SLOT/metadata.json" \ + | jq '{checkpoint_id, session_id, strategy, branch, + files_touched, checkpoints_count, agent, model, + turn_id, is_task, tool_use_id, + transcript_identifier_at_start, + checkpoint_transcript_start, + token_usage, session_metrics, + initial_attribution, prompt_attributions, + summary, kind, review_skills, review_prompt}') +``` + +Expected: no diff. 
Special cases: + +- `created_at` is replayed from v2's `created_at` and also used as v1's + `CommitTime` (`migration.go:178-179`). The two timestamps in the v1 file + should be identical when serialised. +- The migration sets `HasReview = session.Kind(meta.Kind).IsReview()` + (`migration.go:204`). For non-review kinds this is `false` and may have + been absent (omitempty) in v2; that's still a match. +- `cli_version` on the v1 session may differ from v2's. The migration + doesn't pass `CLIVersion`, so v1 inherits whatever default the writer + applies — generally an empty value or the current binary's version. Not + a correctness issue. +- v1 writes the new `combined_attribution` and aggregated `token_usage` + onto the **root** `metadata.json` from the migrating session's data. If + there were prior v1 sessions, the root summary on v1 already aggregated + them; only the new session's session-level metadata matters for §5.2. + +Schema sanity per session: + +```sh +git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/$V1_SLOT/metadata.json" \ + | jq -e 'has("checkpoint_id") and has("session_id") and has("created_at")' \ + > /dev/null && echo OK +``` + +### 5.3 Step G — `prompt.txt` content + +The migration joins v2 prompts (split form on disk) back into a single +`prompt.txt` via `SplitPromptContent` round-trip. The bytes should match +the v2 content: + +```sh +git -C "$REPO" cat-file -p \ + "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" \ + | sha256sum +git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" \ + | sha256sum +``` + +Both digests should match. If they don't, inspect with a `diff -u` between +the two `cat-file -p` outputs to see whether it's an ordering / separator +issue. + +### 5.4 Step H — raw transcript & `content_hash.txt` + +This is the most important check. Two layers: + +1. **Self-consistency on v1**: the value in `content_hash.txt` must equal + `sha256:<hex>` of the reassembled `full.jsonl[.NNN]` content. 
+2. **Cross-store match (non-Codex agents)**: reassembled v1 bytes should + equal reassembled v2 `raw_transcript[.NNN]` bytes, and v1's + `content_hash.txt` should equal v2's `raw_transcript_hash.txt`. + +Reassemble logic: ordered list `full.jsonl`, `full.jsonl.001`, +`full.jsonl.002`, … For most agents this is JSONL with `\n` separators +between chunks (`agent/chunking.go:108-118`); for `vogon`, OpenCode etc. +the agent's own `ReassembleTranscript` is used at read time. For +validation, byte-concatenation in chunk order is what the v1 writer +hashed (`committed.go:784` — the hash is over `transcriptBytes` BEFORE +chunking), so the easier check is to read the original v1 input bytes +back via the v1 store API, OR to validate that each chunk blob is what +the v1 writer would have produced. + +The simplest robust shell check: reconstruct via ordered concat and +compute the digest, then compare to `content_hash.txt`. This is exact for +agents whose `ChunkTranscript` is a byte-preserving JSONL chunker +(Claude Code, Gemini CLI, Cursor, Copilot CLI, Codex except for the pre- +chunk sanitization step, and the generic case). It's slightly fuzzy for +agents whose chunking strips/reflows bytes — but in practice the round +trip is byte-exact for the supported set. + +```sh +ID=02d9783342a2; SHARD=${ID:0:2}/${ID:2}; V1_SLOT=0 + +# Enumerate transcript chunks in order. +git -C "$REPO" ls-tree --name-only \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT" \ + | grep -E '^full\.jsonl(\.[0-9]{3})?$' \ + | sort > /tmp/chunks.txt +cat /tmp/chunks.txt + +# Concatenate chunks (no extra separator — chunk files are written as +# they will be read by the agent's reassembler). For JSONL agents, +# the writer already trimmed the trailing newline per chunk; the +# reassembler joins with "\n". Reproduce that here. 
+tmp=$(mktemp) +first=1 +while IFS= read -r f; do + if [ $first -eq 0 ]; then printf '\n' >> "$tmp"; fi + git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/$f" >> "$tmp" + first=0 +done < /tmp/chunks.txt + +# Recompute and compare. +COMPUTED="sha256:$(sha256sum "$tmp" | awk '{print $1}')" +STORED=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/content_hash.txt") +echo "stored: $STORED" +echo "computed: $COMPUTED" +[ "$STORED" = "$COMPUTED" ] && echo OK || echo MISMATCH +``` + +If `STORED ≠ COMPUTED` for **JSONL-based agents** (Claude Code, Gemini +CLI, etc.), something is wrong with the migration — flag it. For agents +with custom chunkers the shell heuristic above can produce a false +mismatch; in those cases fall back to using the CLI's own reader by +running `entire checkpoint explain ` or, more directly, by writing +a small Go probe that calls `agent.ReassembleTranscript(chunks, agent)` +and re-hashes the result. + +Cross-store comparison (non-Codex): + +```sh +# Same /full ref resolution as the migration (current first, then archives). +FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') +RAW_HASH="" +for r in $FULL_REFS; do + if git -C "$REPO" cat-file -e \ + "$r:$SHARD/$V2_SLOT/raw_transcript_hash.txt" 2>/dev/null; then + RAW_HASH=$(git -C "$REPO" cat-file -p \ + "$r:$SHARD/$V2_SLOT/raw_transcript_hash.txt") + echo "raw transcript found on $r: $RAW_HASH" + break + fi +done +echo "v1 content_hash: $STORED" +echo "v2 raw_transcript: $RAW_HASH" +``` + +For non-Codex agents, the two hashes should match. For Codex (agent +field on the session metadata is `codex`), they are allowed to differ — +v1 sanitizes via `codex.SanitizePortableTranscript` before hashing +(`committed.go:745-747`). The v1 self-consistency check above is still +required in that case. 
+ +### 5.5 Step I — bulk sweep + +Once the per-checkpoint procedure is established, sweep every migrated +checkpoint: + +```sh +# A fresh --dry-run lists no candidates after a successful --apply, so read +# the candidate IDs back from the saved apply report instead. +awk '/^ [0-9a-f]{12} sessions=/ {print $1}' /tmp/migrate.applied > /tmp/candidates.txt +wc -l /tmp/candidates.txt +``` + +Then for each ID in `/tmp/candidates.txt`, run: + +- §5.1 root metadata diff (`grep -q` for errors). +- §5.2 per-session field diff for every session ID that the candidate + brought in. +- §5.4 hash check on every transcript chunk set. + +A single shell loop is fine, and the validation completes in seconds per +checkpoint. Surface any non-empty diffs or any `MISMATCH` lines. + +### 5.6 After validation passes + +You're done with this runbook only after every step in §5 produced the +expected result on every candidate. Publishing the migration is **out of +scope for this runbook** and explicitly a manual decision. + +When the operator is satisfied and ready to publish: + +1. Re-read §4's push warning. Nothing about it has changed. +2. Decide deliberately, out-of-band, that you want the new v1 commits on + the remote. Coordinate with anyone else who has the repo cloned — + they will pick up the new commits on their next fetch. +3. Use your repo's normal push path. The runbook does not prescribe one + because publishing semantics vary per repo (some use `entire`'s push + integration, some use `checkpoint_remote`, some do a plain + `git push`). Pick the right one explicitly. + +Until that conscious decision is made, `entire/checkpoints/v1` stays +local. If §5 surfaces a problem, roll back with the `update-ref` snippet +from §4's "Behavior notes" — cheap and local, because you did not push. + +## 6. 
Failure modes and what they mean + +| Symptom in dry-run | Meaning | Action | +|----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------| +| `missing v2 checkpoint metadata: N (large)` | v2 `/main` is missing or its tree lacks summaries for many discovered IDs. | Confirm `refs/entire/checkpoints/v2/main` exists, was fetched, and is reasonably recent. | +| `missing required v2 session metadata: > 0` | v2 session `metadata.json` lacks `checkpoint_id` or `session_id`. Could indicate corruption or a partial v2 write. | Inspect the affected sessions manually; they will be skipped, not failed. | +| `missing raw transcripts: > 0` | v2 `/main` has a session but `/full/current` and archived `/full/*` don't carry its `raw_transcript*` data. | Confirm archived `/full/*` refs are present locally (or accessible via remote fetch). | +| Candidate `commits=` is empty | Shouldn't happen by construction (discovery groups by commit). Investigate the bug. | File a bug. | +| `sessions=N` for a candidate doesn't match the §3.5 expected | Either v1 already has the session (so report should have lower N), or session IDs are non-unique within v2. | Inspect; non-unique session IDs are a v2 corruption. | +| Post-apply, `content_hash.txt` ≠ recomputed SHA-256 | Codex agent + ours-vs-original sanitization difference, OR a bug. Confirm `agent` field on the session. | If non-Codex, file a bug with chunk listing + bytes. | +| Post-apply, `content_hash.txt` matches but v2's `raw_transcript_hash.txt` doesn't | Codex sanitization (expected) OR transcript was rewritten in transit. Confirm agent first. | If non-Codex, file a bug. | +| Re-running `--dry-run` after `--apply` still lists the same candidates | Apply failed silently or didn't get pushed before re-fetch. 
Look at the `migrated sessions` count. | Re-run with verbose logging; check that v1 branch actually advanced. | + +## 7. Quick reference: file & ref constants + +| Concept | Constant | Value | Source | +|--------------------------|------------------------------------------------|----------------------------------------------------|---------------------------------------| +| v1 branch | `paths.MetadataBranchName` | `entire/checkpoints/v1` (under `refs/heads/`) | `paths/paths.go:43` | +| v2 main ref | `paths.V2MainRefName` | `refs/entire/checkpoints/v2/main` | `paths/paths.go:49` | +| v2 full current ref | `paths.V2FullCurrentRefName` | `refs/entire/checkpoints/v2/full/current` | `paths/paths.go:52` | +| v2 archived full ref | (pattern) | `refs/entire/checkpoints/v2/full/<13-digit-suffix>`| `v2_read.go:523-533` | +| Root summary | `paths.MetadataFileName` | `metadata.json` | `paths/paths.go:36` | +| Session metadata | `paths.MetadataFileName` | `metadata.json` | `paths/paths.go:36` | +| Session prompt | `paths.PromptFileName` | `prompt.txt` | `paths/paths.go:29` | +| v1 transcript | `paths.TranscriptFileName` | `full.jsonl` (+ `.001`, `.002`, …) | `paths/paths.go:30` | +| v1 transcript hash | `paths.ContentHashFileName` | `content_hash.txt` (format `sha256:`) | `paths/paths.go:38`, `committed.go:784` | +| v2 compact transcript | `paths.CompactTranscriptFileName` | `transcript.jsonl` (on `/main`, not migrated) | `paths/paths.go:32` | +| v2 compact hash | `paths.CompactTranscriptHashFileName` | `transcript_hash.txt` (on `/main`, not migrated) | `paths/paths.go:33` | +| v2 raw transcript | `paths.V2RawTranscriptFileName` | `raw_transcript` (+ `.001`, …) on `/full/*` | `paths/paths.go:34` | +| v2 raw hash | `paths.V2RawTranscriptHashFileName` | `raw_transcript_hash.txt` on `/full/*` | `paths/paths.go:35` | +| Sharded path | `id.Path()` | `/` (12-char lowercase hex) | `checkpoint/id/id.go` | +| Trailer key | `trailers.CheckpointTrailerKey` | `Entire-Checkpoint` | 
`trailers/trailers.go:41` | +| Chunk filename suffix | `agent.ChunkSuffix` | `.%03d` | `agent/chunking.go:19` | + +## 8. Source map + +- Tool entry: `cmd/migrate-v2-checkpoints/main.go` +- History walk: `cmd/migrate-v2-checkpoints/history.go` +- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` +- v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` + (line 52), `writeStandardCheckpointEntries` (line 310), + `writeSessionToSubdirectory` (line 404), `writeTranscript` (line 720), + `findSessionIndex` (line 326). +- v2 read: `cmd/entire/cli/checkpoint/v2_read.go` — `ReadCommitted` + (line 24), `ReadSessionMetadataAndPrompts` (line 205), + `ReadSessionContent` (line 274), `readTranscriptFromFullRefs` + (line 339), `readTranscriptFromRef` (line 540). +- Schemas: `cmd/entire/cli/checkpoint/checkpoint.go` — `CheckpointSummary` + (line 527), `CommittedMetadata` (line 443), `SessionFilePaths` + (line 517). +- Trailer parsing: `cmd/entire/cli/trailers/trailers.go`. +- Chunking: `cmd/entire/cli/agent/chunking.go`. +- Sanitization (Codex only): `cmd/entire/cli/agent/codex/` + (`SanitizePortableTranscript`). +- ID + sharded path: `cmd/entire/cli/checkpoint/id/id.go`. + +## 9. Notes for re-use on other repos + +- `--repo PATH` works from anywhere; you do not need to `cd`. Bear in mind + the tool walks `refs/remotes/*/*` too, so if the local repo has stale + remote refs the candidate list may include IDs whose underlying commits + are only reachable via those remotes. That's still correct — those + commits really did reference the IDs. +- If the v2 refs aren't fetched locally (the default refspec excludes + `refs/entire/*`), discovery will still find IDs from trailers but the + per-checkpoint v2 reads will fail with "missing v2 checkpoint metadata." + Pre-fetch with: + ```sh + git -C "$REPO" fetch origin \ + 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' + ``` +- The tool is **idempotent** in `--apply` mode. 
Re-running after a + successful apply should produce `checkpoints eligible for migration: 0` + modulo any new v2 data that landed in the meantime. +- The tool only writes to the local repo. After `--apply`, push the + updated v1 branch yourself when ready. diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go index 7710f465d..1962fb030 100644 --- a/cmd/migrate-v2-checkpoints/history.go +++ b/cmd/migrate-v2-checkpoints/history.go @@ -248,12 +248,16 @@ func rejectAmbiguousCommitPrefix(repo *git.Repository, revision string) error { defer iter.Close() var matches []plumbing.Hash - if err := iter.ForEach(func(commit *object.Commit) error { + err = iter.ForEach(func(commit *object.Commit) error { if strings.HasPrefix(commit.Hash.String(), prefix) { matches = append(matches, commit.Hash) + if len(matches) == 2 { + return storer.ErrStop + } } return nil - }); err != nil { + }) + if err != nil && !errors.Is(err, storer.ErrStop) { return fmt.Errorf("scan commit objects for revision %q: %w", revision, err) } if len(matches) < 2 { diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index d424a188c..fae3188c6 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -172,6 +172,7 @@ func openRepository(ctx context.Context, repoPath string) (string, *git.Reposito if err != nil { return "", nil, fmt.Errorf("open repository %q: %w", repoPath, err) } + defer detector.Close() repoRoot := repoPath if worktree, worktreeErr := detector.Worktree(); worktreeErr == nil { repoRoot = worktree.Filesystem().Root() diff --git a/cmd/migrate-v2-checkpoints/v2_fixture_test.go b/cmd/migrate-v2-checkpoints/v2_fixture_test.go index 5a96721aa..aa92bca92 100644 --- a/cmd/migrate-v2-checkpoints/v2_fixture_test.go +++ b/cmd/migrate-v2-checkpoints/v2_fixture_test.go @@ -4,6 +4,7 @@ import ( "context" "crypto/sha256" "encoding/json" + "errors" "fmt" "io" "testing" @@ -229,9 +230,10 @@ func 
readTestV2RefEntries(t *testing.T, repo *git.Repository, refName plumbing.R entries := make(map[string]object.TreeEntry) ref, err := repo.Reference(refName, true) - if err != nil { + if errors.Is(err, plumbing.ErrReferenceNotFound) { return plumbing.ZeroHash, entries } + require.NoError(t, err) commit, err := repo.CommitObject(ref.Hash()) require.NoError(t, err) tree, err := commit.Tree() diff --git a/mise-tasks/build b/mise-tasks/build index 8039a8657..2ad5a3295 100755 --- a/mise-tasks/build +++ b/mise-tasks/build @@ -9,3 +9,5 @@ OUTPUT="entire" case "$(uname -s)" in MINGW*|MSYS*|CYGWIN*|Windows_NT) OUTPUT="entire.exe" ;; esac go build -ldflags "-X github.com/entireio/cli/cmd/entire/cli/versioninfo.Version=${VERSION} -X github.com/entireio/cli/cmd/entire/cli/versioninfo.Commit=${COMMIT}" -o "$OUTPUT" ./cmd/entire + +go build -ldflags "-X github.com/entireio/cli/cmd/entire/cli/versioninfo.Version=${VERSION} -X github.com/entireio/cli/cmd/entire/cli/versioninfo.Commit=${COMMIT}" -o "migrate-v2-checkpoints" ./cmd/migrate-v2-checkpoints/ From 6883a325019e600efe773bb12e71f30598a81a49 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 13:48:16 -0700 Subject: [PATCH 19/35] Discover v2 orphan checkpoints Entire-Checkpoint: 93e1fd5362dc --- cmd/migrate-v2-checkpoints/history.go | 120 ++++++++++++++++++++++- cmd/migrate-v2-checkpoints/main.go | 3 +- cmd/migrate-v2-checkpoints/main_test.go | 122 ++++++++++++++++++++++++ cmd/migrate-v2-checkpoints/migration.go | 14 ++- 4 files changed, 252 insertions(+), 7 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/history.go b/cmd/migrate-v2-checkpoints/history.go index 1962fb030..7b2c432a7 100644 --- a/cmd/migrate-v2-checkpoints/history.go +++ b/cmd/migrate-v2-checkpoints/history.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/entireio/cli/cmd/entire/cli/checkpoint" checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" "github.com/entireio/cli/cmd/entire/cli/paths" 
"github.com/entireio/cli/cmd/entire/cli/trailers" @@ -47,14 +48,34 @@ type discoveryScope struct { } func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, error) { + checkpoints, _, err := discoverCheckpointHistoryWithSkippedOrphans(ctx, repo, opts) + return checkpoints, err +} + +func discoverCheckpointHistoryWithSkippedOrphans(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, int, error) { + checkpoints, checkpointIndexes, err := discoverTrailerCheckpointHistory(ctx, repo, opts) + if err != nil { + return nil, 0, err + } + + v2OrphansSkipped, err := addV2OrphanCheckpoints(ctx, repo, opts, checkpointIndexes, &checkpoints) + if err != nil { + return nil, 0, err + } + + sortDiscoveredCheckpoints(checkpoints) + return checkpoints, v2OrphansSkipped, nil +} + +func discoverTrailerCheckpointHistory(ctx context.Context, repo *git.Repository, opts discoveryOptions) ([]discoveredCheckpoint, map[string]int, error) { scope, err := newDiscoveryScope(ctx, repo, opts.since) if err != nil { - return nil, err + return nil, nil, err } tips, err := historyTips(ctx, repo, opts.head, scope) if err != nil { - return nil, err + return nil, nil, err } seenCommits := make(map[plumbing.Hash]bool) @@ -63,12 +84,11 @@ func discoverCheckpointHistory(ctx context.Context, repo *git.Repository, opts d for _, tip := range tips { if err := scanTip(ctx, repo, tip, scope.excluded, seenCommits, checkpointIndexes, &checkpoints); err != nil { - return nil, err + return nil, nil, err } } - sortDiscoveredCheckpoints(checkpoints) - return checkpoints, nil + return checkpoints, checkpointIndexes, nil } func newDiscoveryScope(ctx context.Context, repo *git.Repository, since string) (discoveryScope, error) { @@ -341,6 +361,86 @@ func scanTip(ctx context.Context, repo *git.Repository, tip historyTip, excluded return nil } +func addV2OrphanCheckpoints(ctx context.Context, repo *git.Repository, opts 
discoveryOptions, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) (int, error) { + v2CheckpointIDs, err := listV2MainCheckpointIDs(ctx, repo) + if err != nil { + return 0, err + } + if len(v2CheckpointIDs) == 0 { + return 0, nil + } + + if hasCommitScope(opts) { + _, unscopedIndexes, err := discoverTrailerCheckpointHistory(ctx, repo, discoveryOptions{}) + if err != nil { + return 0, err + } + + return countMissingCheckpointIDs(v2CheckpointIDs, unscopedIndexes), nil + } + + for _, cpID := range v2CheckpointIDs { + key := cpID.String() + if _, exists := checkpointIndexes[key]; exists { + continue + } + checkpointIndexes[key] = len(*checkpoints) + *checkpoints = append(*checkpoints, discoveredCheckpoint{ID: cpID}) + } + + return 0, nil +} + +func hasCommitScope(opts discoveryOptions) bool { + return opts.since != "" || opts.head != "" +} + +func countMissingCheckpointIDs(ids []checkpointID.CheckpointID, indexes map[string]int) int { + missing := 0 + for _, cpID := range ids { + if _, exists := indexes[cpID.String()]; !exists { + missing++ + } + } + return missing +} + +func listV2MainCheckpointIDs(ctx context.Context, repo *git.Repository) ([]checkpointID.CheckpointID, error) { + v2Store := checkpoint.NewV2GitStore(repo) + _, rootTreeHash, err := v2Store.GetRefState(plumbing.ReferenceName(paths.V2MainRefName)) + if errors.Is(err, plumbing.ErrReferenceNotFound) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("read %s ref state: %w", paths.V2MainRefName, err) + } + + rootTree, err := repo.TreeObject(rootTreeHash) + if err != nil { + return nil, fmt.Errorf("read %s root tree: %w", paths.V2MainRefName, err) + } + + var ids []checkpointID.CheckpointID + err = checkpoint.WalkCheckpointShards(ctx, repo, rootTree, func(cpID checkpointID.CheckpointID, cpTreeHash plumbing.Hash) error { + cpTree, cpTreeErr := repo.TreeObject(cpTreeHash) + if cpTreeErr != nil { + return fmt.Errorf("read v2 checkpoint %s tree: %w", cpID, cpTreeErr) + } + 
if _, fileErr := cpTree.File(paths.MetadataFileName); fileErr == nil { + ids = append(ids, cpID) + } + return nil + }) + if err != nil { + return nil, fmt.Errorf("walk %s checkpoints: %w", paths.V2MainRefName, err) + } + + sort.Slice(ids, func(i, j int) bool { + return ids[i].String() < ids[j].String() + }) + return ids, nil +} + func addCheckpointCommit(commit *object.Commit, checkpointIndexes map[string]int, checkpoints *[]discoveredCheckpoint) { ids := trailers.ParseAllCheckpoints(commit.Message) if len(ids) == 0 { @@ -384,6 +484,9 @@ func sortDiscoveredCheckpoints(checkpoints []discoveredCheckpoint) { func writeCheckpointList(w io.Writer, checkpoints []discoveredCheckpoint) { for _, checkpoint := range checkpoints { fmt.Fprint(w, checkpoint.ID) + if len(checkpoint.Commits) == 0 { + fmt.Fprint(w, " (orphan)") + } for _, commit := range checkpoint.Commits { fmt.Fprintf(w, " %s", commit.ShortSHA) } @@ -391,6 +494,13 @@ func writeCheckpointList(w io.Writer, checkpoints []discoveredCheckpoint) { } } +func writeDiscoveryWarnings(w io.Writer, v2OrphansSkipped int) { + if v2OrphansSkipped == 0 { + return + } + fmt.Fprintf(w, "warning: %d v2 orphans skipped; re-run without --since/--head to include them\n", v2OrphansSkipped) +} + func shortHash(hash plumbing.Hash) string { full := hash.String() if len(full) <= checkpointID.ShortIDLength { diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index fae3188c6..a84b67712 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -55,13 +55,14 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { } ctx = settings.WithWorktreeRoot(ctx, repoRoot) - checkpoints, err := discoverCheckpointHistory(ctx, repo, discoveryOptions{ + checkpoints, v2OrphansSkipped, err := discoverCheckpointHistoryWithSkippedOrphans(ctx, repo, discoveryOptions{ since: opts.since, head: opts.head, }) if err != nil { return err } + writeDiscoveryWarnings(stdout, 
v2OrphansSkipped) switch opts.mode { case modeList: diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 2917b88aa..41d346d07 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -149,6 +149,18 @@ func TestDiscoverCheckpointHistory_ExcludesInternalRefs(t *testing.T) { require.Equal(t, []string{mainCheckpointID, featureCheckpointID, featureCheckpointID2}, discoveredCheckpointIDs(checkpoints)) } +func TestDiscoverCheckpointHistory_IncludesV2OrphansWithoutScope(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, "555555555555") + + checkpoints, err := discoverCheckpointHistory(context.Background(), fixture.repo, discoveryOptions{}) + require.NoError(t, err) + + require.Equal(t, []string{"555555555555"}, discoveredCheckpointIDs(checkpoints)) + require.Empty(t, checkpoints[0].Commits) +} + func TestResolveRevisionRejectsAmbiguousShortCommitPrefix(t *testing.T) { t.Parallel() @@ -212,6 +224,21 @@ func TestRunListModeOpensRepoFromSubdirectory(t *testing.T) { require.Equal(t, mainCheckpointID+" "+shortHash(fixture.mainHash)+"\n", stdout.String()) } +func TestRunListModePrintsV2Orphans(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, "666666666666") + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--list", + }, &stdout) + require.NoError(t, err) + + require.Equal(t, "666666666666 (orphan)\n", stdout.String()) +} + func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { t.Parallel() @@ -285,6 +312,45 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.True(t, commit.Author.When.Equal(createdAt), "author time = %s, want %s", commit.Author.When, createdAt) } +func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { + t.Parallel() + + cpID := id.MustCheckpointID("777777777777") + fixture := setupMigrationOrphanRepo(t, cpID.String()) + + var stdout 
bytes.Buffer + err := run(context.Background(), []string{ + "--repo", fixture.dir, + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "v2 orphan checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), cpID.String()+" sessions=1 commits=(orphan)") + + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, "orphan-session", content.Metadata.SessionID) + require.JSONEq(t, `{"message":"orphan"}`, string(content.Transcript)) + + stdout.Reset() + err = run(context.Background(), []string{ + "--repo", fixture.dir, + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "already present v1 sessions: 1") + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 0") + require.Contains(t, stdout.String(), "v2 orphan checkpoints eligible for migration: 0") + require.NotContains(t, stdout.String(), cpID.String()+" sessions=1 commits=(orphan)") +} + func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { t.Parallel() @@ -308,6 +374,39 @@ func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { require.Nil(t, summary) } +func TestRunDryRunSkipsV2OrphansWhenScoped(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + flag string + id string + }{ + {name: "since", flag: "--since", id: "888888888888"}, + {name: "head", flag: "--head", id: "999999999999"}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + fixture := setupMigrationOrphanRepo(t, tc.id) + args := []string{ + "--repo", 
fixture.dir, + tc.flag, fixture.baseHash.String(), + "--dry-run", + } + var stdout bytes.Buffer + err := run(context.Background(), args, &stdout) + require.NoError(t, err) + + require.Contains(t, stdout.String(), "warning: 1 v2 orphans skipped; re-run without --since/--head to include them") + require.Contains(t, stdout.String(), "discovered checkpoints: 0") + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 0") + require.NotContains(t, stdout.String(), tc.id+" sessions=1 commits=(orphan)") + }) + } +} + func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) { t.Parallel() @@ -480,6 +579,29 @@ func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { } } +func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistoryFixture { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + + baseHash := commitMigrationTestFile(t, dir, "initial.txt", "initial\n", "initial commit") + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + + writeTestV2Checkpoint(t, repo, testV2CheckpointOptions{ + CheckpointID: id.MustCheckpointID(checkpointID), + SessionID: "orphan-session", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"orphan\"}\n")), + }) + + return migrationHistoryFixture{ + dir: dir, + repo: repo, + baseHash: baseHash, + } +} + func commitMigrationTestFile(t *testing.T, dir, name, content, message string) plumbing.Hash { t.Helper() diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 1abb1e7f7..e7b58310d 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -25,6 +25,7 @@ type migrationReport struct { MissingV2SessionMetadata int MissingRawTranscripts int EligibleCheckpoints int + V2OrphanCheckpoints int EligibleSessions int MigratedCheckpoints int MigratedSessions int @@ -68,6 +69,9 @@ func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, dis 
continue } report.EligibleCheckpoints++ + if len(discoveredCheckpoint.Commits) == 0 { + report.V2OrphanCheckpoints++ + } report.Candidates = append(report.Candidates, migrationCandidateFromDiscovered(discoveredCheckpoint, eligibleSessions)) if opts.apply { report.MigratedCheckpoints++ @@ -217,6 +221,7 @@ func writeMigrationReport(w io.Writer, report migrationReport, applied bool) { fmt.Fprintf(w, " missing required v2 session metadata: %d\n", report.MissingV2SessionMetadata) fmt.Fprintf(w, " missing raw transcripts: %d\n", report.MissingRawTranscripts) fmt.Fprintf(w, " checkpoints eligible for migration: %d\n", report.EligibleCheckpoints) + fmt.Fprintf(w, " v2 orphan checkpoints eligible for migration: %d\n", report.V2OrphanCheckpoints) fmt.Fprintf(w, " sessions eligible for migration: %d\n", report.EligibleSessions) if applied { fmt.Fprintf(w, " migrated checkpoints: %d\n", report.MigratedCheckpoints) @@ -238,7 +243,14 @@ func writeMigrationCandidates(w io.Writer, candidates []migrationCandidate, appl fmt.Fprintf(w, " %s sessions=%d commits=%s\n", candidate.CheckpointID, candidate.SessionCount, - strings.Join(candidate.CommitSHAs, ","), + candidateCommitLabel(candidate), ) } } + +func candidateCommitLabel(candidate migrationCandidate) string { + if len(candidate.CommitSHAs) == 0 { + return "(orphan)" + } + return strings.Join(candidate.CommitSHAs, ",") +} From 5a3760e62b326311ad34dd49b4b4c1705b8027df Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 14:57:26 -0700 Subject: [PATCH 20/35] Update validation instructions Entire-Checkpoint: e699268f21c8 --- cmd/migrate-v2-checkpoints/VALIDATION.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index e57a4ba45..740f2ebf6 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -379,17 +379,19 @@ a separate, explicit decision once the 
post-apply checks in §5 pass. ```sh REPO=/path/to/some-repo +REPO_NAME=$(basename "$REPO") TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints +APPLIED_REPORT="/tmp/migrate-${REPO_NAME}.applied" # Snapshot the v1 branch tip so you can roll back deterministically. PRE_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1 2>/dev/null || echo "none") echo "pre-apply v1 tip: $PRE_APPLY_TIP" -# Apply. Tee the report into /tmp/migrate.applied — §5 reads it back. -"$TOOL" --repo "$REPO" --apply | tee /tmp/migrate.applied +# Apply. Tee the report into /tmp/migrate-${REPO_NAME}.applied — §5 reads it back. +"$TOOL" --repo "$REPO" --apply | tee "$APPLIED_REPORT" # Sanity-check the report. -grep -E "^ (checkpoints eligible|sessions eligible|migrated)" /tmp/migrate.applied +grep -E "^ (checkpoints eligible|sessions eligible|migrated)" "$APPLIED_REPORT" # migrated checkpoints == checkpoints eligible # migrated sessions == sessions eligible # Anything less means at least one write failed silently — re-run --apply @@ -438,7 +440,7 @@ git -C "$REPO" log --format='%h %ci %s' \ 2. `migrated sessions` equals `sessions eligible for migration` from the dry-run. 3. `git rev-parse entire/checkpoints/v1` advanced. -4. `/tmp/migrate.applied` contains the full report for §5 to reference. +4. `/tmp/migrate-${REPO_NAME}.applied` contains the full report for §5 to reference. 5. **You have NOT pushed `entire/checkpoints/v1`.** Confirm by checking that no remote tracking ref has advanced: @@ -459,9 +461,9 @@ passes. ## 5. Post-apply validation -This section assumes `--apply` has been run and `/tmp/migrate.applied` -holds the report. The `migrated sessions=...` count is the population you -will validate below. +This section assumes `--apply` has been run and +`/tmp/migrate-${REPO_NAME}.applied` holds the report. The +`migrated sessions=...` count is the population you will validate below. 
### 5.1 Step E — root `metadata.json` (CheckpointSummary) on v1 From 2f140a39675e19c28c7b3fb65ccbaa9e27da052f Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 15:41:14 -0700 Subject: [PATCH 21/35] Handle sparse v1 checkpoint sessions Entire-Checkpoint: 3b731107bd53 --- cmd/migrate-v2-checkpoints/main_test.go | 82 +++++++++++++++++++++++++ cmd/migrate-v2-checkpoints/migration.go | 39 +++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 41d346d07..d912fb904 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -15,6 +15,7 @@ import ( "github.com/entireio/cli/cmd/entire/cli/agent" "github.com/entireio/cli/cmd/entire/cli/checkpoint" "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/jsonutil" "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/cmd/entire/cli/session" "github.com/entireio/cli/cmd/entire/cli/testutil" @@ -456,6 +457,43 @@ func TestRunApplySkipsExistingV1SessionsAndMigratesMissingSessions(t *testing.T) require.JSONEq(t, `{"message":"from v2 new"}`, string(content.Transcript)) } +func TestRunDryRunReadsSparseExistingV1SessionPaths(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + + v1Store := checkpoint.NewGitStore(fixture.repo) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 zero\"}\n")), + 
AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-two", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 two\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + rewriteV1SecondSessionToSparseSlot(t, fixture.repo, cpID) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--dry-run") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "checkpoints eligible for migration: 0") +} + func TestRunApplyMigratesTaskMetadata(t *testing.T) { t.Parallel() @@ -602,6 +640,50 @@ func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistor } } +func rewriteV1SecondSessionToSparseSlot(t *testing.T, repo *git.Repository, cpID id.CheckpointID) { + t.Helper() + + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + parentHash, entries := readTestV2RefEntries(t, repo, refName) + basePath := cpID.Path() + "/" + rootMetadataPath := basePath + paths.MetadataFileName + rootEntry := entries[rootMetadataPath] + summary := readTestJSONFromBlob[checkpoint.CheckpointSummary](t, repo, rootEntry.Hash) + require.Len(t, summary.Sessions, 2) + summary.Sessions[1] = rewriteSessionFilePathSlot(summary.Sessions[1], "/1/", "/2/") + + summaryJSON, err := jsonutil.MarshalIndentWithNewline(summary, "", " ") + require.NoError(t, err) + summaryBlob, err := checkpoint.CreateBlobFromContent(repo, summaryJSON) + require.NoError(t, err) + entries[rootMetadataPath] = object.TreeEntry{ + Name: rootMetadataPath, + Mode: rootEntry.Mode, + Hash: summaryBlob, + } + + oldPrefix := basePath + "1/" + newPrefix := basePath + "2/" + for entryPath, entry := range entries { + if !strings.HasPrefix(entryPath, oldPrefix) { + continue + } + newPath := newPrefix + 
strings.TrimPrefix(entryPath, oldPrefix) + entry.Name = newPath + entries[newPath] = entry + delete(entries, entryPath) + } + writeTestV2RefEntries(t, repo, refName, parentHash, entries, "test sparse v1 fixture") +} + +func rewriteSessionFilePathSlot(sessionPaths checkpoint.SessionFilePaths, oldSlot, newSlot string) checkpoint.SessionFilePaths { + sessionPaths.Metadata = strings.Replace(sessionPaths.Metadata, oldSlot, newSlot, 1) + sessionPaths.Transcript = strings.Replace(sessionPaths.Transcript, oldSlot, newSlot, 1) + sessionPaths.ContentHash = strings.Replace(sessionPaths.ContentHash, oldSlot, newSlot, 1) + sessionPaths.Prompt = strings.Replace(sessionPaths.Prompt, oldSlot, newSlot, 1) + return sessionPaths +} + func commitMigrationTestFile(t *testing.T, dir, name, content, message string) plumbing.Hash { t.Helper() diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index e7b58310d..7c58a6ce4 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -5,9 +5,12 @@ import ( "errors" "fmt" "io" + "strconv" "strings" "github.com/entireio/cli/cmd/entire/cli/checkpoint" + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/cmd/entire/cli/session" "github.com/entireio/cli/redact" @@ -157,9 +160,19 @@ func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered if summary == nil { return existing, nil } - for sessionIndex := range summary.Sessions { + for summaryIndex, sessionPaths := range summary.Sessions { + sessionIndex, ok, err := v1SessionIndexFromSummary(discovered.ID, sessionPaths) + if err != nil { + return nil, fmt.Errorf("resolve v1 checkpoint %s session %d metadata path: %w", discovered.ID, summaryIndex, err) + } + if !ok { + continue + } content, err := m.v1Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) if err != nil { + if errors.Is(err, 
checkpoint.ErrCheckpointNotFound) { + continue + } return nil, fmt.Errorf("read v1 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) } if content.Metadata.SessionID == "" { @@ -170,6 +183,30 @@ func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered return existing, nil } +func v1SessionIndexFromSummary(cpID checkpointID.CheckpointID, sessionPaths checkpoint.SessionFilePaths) (int, bool, error) { + if sessionPaths.Metadata == "" { + return 0, false, nil + } + + metadataPath := strings.TrimPrefix(sessionPaths.Metadata, "/") + expectedPrefix := cpID.Path() + "/" + relativePath, ok := strings.CutPrefix(metadataPath, expectedPrefix) + if !ok { + return 0, false, fmt.Errorf("metadata path %q is outside checkpoint path %q", sessionPaths.Metadata, cpID.Path()) + } + + sessionDir, fileName, ok := strings.Cut(relativePath, "/") + if !ok || fileName != paths.MetadataFileName { + return 0, false, fmt.Errorf("metadata path %q does not point to a session metadata file", sessionPaths.Metadata) + } + + sessionIndex, err := strconv.Atoi(sessionDir) + if err != nil || sessionIndex < 0 { + return 0, false, fmt.Errorf("metadata path %q has invalid session index %q", sessionPaths.Metadata, sessionDir) + } + return sessionIndex, true, nil +} + func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" } From 86f6dbb9f1d7ea705e1ace5221872e95591ea2e8 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 16:04:47 -0700 Subject: [PATCH 22/35] Update validation instructions Entire-Checkpoint: 0b3b7a44bfd6 --- cmd/migrate-v2-checkpoints/VALIDATION.md | 106 ++++++++++++++++++++--- 1 file changed, 95 insertions(+), 11 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 740f2ebf6..92143a36b 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ 
b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -464,10 +464,19 @@ passes. This section assumes `--apply` has been run and `/tmp/migrate-${REPO_NAME}.applied` holds the report. The `migrated sessions=...` count is the population you will validate below. +Extract the migrated checkpoint IDs once and reuse that list for every +bulk check: + +```sh +MIGRATED_IDS="/tmp/migrate-${REPO_NAME}.migrated-checkpoints" +awk '/^ [0-9a-f]{12} sessions=/ {print $1}' "$APPLIED_REPORT" \ + > "$MIGRATED_IDS" +wc -l "$MIGRATED_IDS" +``` ### 5.1 Step E — root `metadata.json` (CheckpointSummary) on v1 -For each candidate, decode the v1 root metadata and confirm: +For each migrated checkpoint, decode the v1 root metadata and confirm: ```sh ID=02d9783342a2 @@ -733,26 +742,101 @@ Once the per-checkpoint procedure is established, sweep every migrated checkpoint: ```sh -TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints -"$TOOL" --repo "$REPO" --dry-run \ - | awk '/^ [0-9a-f]{12} sessions=/ {print $1}' > /tmp/candidates.txt -wc -l /tmp/candidates.txt +wc -l "$MIGRATED_IDS" ``` -Then for each ID in `/tmp/candidates.txt`, run: +Then for each ID in `$MIGRATED_IDS`, run: -- §4.1 root metadata diff (`grep -q` for errors). -- §4.2 per-session field diff for every session ID that the candidate +- §5.1 root metadata diff (`grep -q` for errors). +- §5.2 per-session field diff for every session ID that the checkpoint brought in. -- §4.4 hash check on every transcript chunk set. +- §5.4 hash check on every transcript chunk set. A single shell loop is fine, and the validation completes in seconds per checkpoint. Surface any non-empty diffs or any `MISMATCH` lines. -### 5.6 After validation passes +### 5.6 Step J — `entire explain` parity after removing v2 dual reads + +Validate every migrated checkpoint with a current build from the branch that +removes v2-first dual reads, and compare it to the Homebrew-installed +`entire`. The outputs must be identical. 
Any mismatch means the migration +did not restore enough v1 data for the new read path, or the two binaries +changed explain behavior independently; flag it and keep the diff for +investigation. + +Build the comparison binary immediately before the sweep: + +```sh +FIX_WORKTREE=/Users/pfleidi/entire/cli/.worktrees/fix/checkpoints-v2-remove-dual-reads +FIX_ENTIRE="/tmp/entire-${REPO_NAME}-remove-dual-reads" +BREW_ENTIRE="$(brew --prefix)/bin/entire" + +git -C "$FIX_WORKTREE" status --short --branch +(cd "$FIX_WORKTREE" && go build -o "$FIX_ENTIRE" ./cmd/entire) + +"$FIX_ENTIRE" version +"$BREW_ENTIRE" version +``` + +Run both binaries from the migrated repo and compare exit status, stdout, and +stderr for every migrated checkpoint: + +```sh +EXPLAIN_DIR="/tmp/migrate-${REPO_NAME}-explain" +mkdir -p "$EXPLAIN_DIR" +: > "$EXPLAIN_DIR/mismatches.txt" +set +e + +while IFS= read -r ID; do + FIX_RESULT="$EXPLAIN_DIR/$ID.fix" + BREW_RESULT="$EXPLAIN_DIR/$ID.brew" + DIFF_FILE="$EXPLAIN_DIR/$ID.diff" + + (cd "$REPO" && "$FIX_ENTIRE" explain "$ID") \ + > "$FIX_RESULT.out" 2> "$FIX_RESULT.err" + FIX_STATUS=$? + (cd "$REPO" && "$BREW_ENTIRE" explain "$ID") \ + > "$BREW_RESULT.out" 2> "$BREW_RESULT.err" + BREW_STATUS=$? + + { + echo "status=$FIX_STATUS" + cat "$FIX_RESULT.out" + printf '\n--- stderr ---\n' + cat "$FIX_RESULT.err" + } > "$FIX_RESULT" + { + echo "status=$BREW_STATUS" + cat "$BREW_RESULT.out" + printf '\n--- stderr ---\n' + cat "$BREW_RESULT.err" + } > "$BREW_RESULT" + + if ! diff -u "$BREW_RESULT" "$FIX_RESULT" > "$DIFF_FILE"; then + echo "$ID" >> "$EXPLAIN_DIR/mismatches.txt" + echo "MISMATCH $ID (see $DIFF_FILE)" + else + rm -f "$DIFF_FILE" + fi +done < "$MIGRATED_IDS" + +if [ -s "$EXPLAIN_DIR/mismatches.txt" ]; then + echo "explain mismatches:" + cat "$EXPLAIN_DIR/mismatches.txt" + exit 1 +fi + +echo "all migrated checkpoints matched entire explain output" +``` + +Do not ignore mismatches, even if the rendered output looks close. 
Record the +checkpoint ID and keep the corresponding `.diff`, `.fix`, and `.brew` files +for follow-up. + +### 5.7 After validation passes You're done with this runbook only after every step in §5 produced the -expected result on every candidate. Publishing the migration is **out of +expected result on every migrated checkpoint. Publishing the migration is **out of scope for this runbook** and explicitly a manual decision. When the operator is satisfied and ready to publish: From 15b3e67028608e2d48de298ec5b3a36b9479f961 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 16:46:25 -0700 Subject: [PATCH 23/35] Fetch v2 refs before checkpoint migration Entire-Checkpoint: 1db8610b3316 --- cmd/migrate-v2-checkpoints/main.go | 10 ++ cmd/migrate-v2-checkpoints/main_test.go | 85 +++++++++++ cmd/migrate-v2-checkpoints/v2_preflight.go | 167 +++++++++++++++++++++ 3 files changed, 262 insertions(+) create mode 100644 cmd/migrate-v2-checkpoints/v2_preflight.go diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index a84b67712..b08a482ca 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -55,6 +55,12 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { } ctx = settings.WithWorktreeRoot(ctx, repoRoot) + if shouldEnsureV2Refs(opts) { + if err := ensureLatestV2Refs(ctx, repoRoot, repo); err != nil { + return err + } + } + checkpoints, v2OrphansSkipped, err := discoverCheckpointHistoryWithSkippedOrphans(ctx, repo, discoveryOptions{ since: opts.since, head: opts.head, @@ -87,6 +93,10 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { } } +func shouldEnsureV2Refs(opts options) bool { + return opts.mode == modePlan || opts.mode == modeDryRun || opts.mode == modeApply +} + func parseOptions(args []string) (options, error) { var opts options opts.mode = modePlan diff --git a/cmd/migrate-v2-checkpoints/main_test.go 
b/cmd/migrate-v2-checkpoints/main_test.go index d912fb904..7f88039c8 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -240,6 +240,55 @@ func TestRunListModePrintsV2Orphans(t *testing.T) { require.Equal(t, "666666666666 (orphan)\n", stdout.String()) } +func TestRunDryRunFetchesRemoteV2RefsBeforePlanning(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-remote-v2", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"remote v2\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + require.NoError(t, err) + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "checkpoints eligible for migration: 1") + require.Contains(t, stdout.String(), "sessions eligible for migration: 1") + + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.NoError(t, err) + _, err = cloneRepo.Reference(plumbing.ReferenceName(paths.V2FullCurrentRefName), true) + require.NoError(t, err) +} + +func TestRunDryRunFailsWhenRemoteV2MainIsUnavailable(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + require.ErrorContains(t, err, 
paths.V2MainRefName+" not found on remote") +} + func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { t.Parallel() @@ -640,6 +689,42 @@ func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistor } } +func cloneMigrationRepoWithOrigin(t *testing.T, fixture migrationHistoryFixture) string { + t.Helper() + + remoteDir := filepath.Join(t.TempDir(), "origin.git") + runMigrationGit(t, "", "init", "--bare", remoteDir) + runMigrationGit(t, remoteDir, "symbolic-ref", "HEAD", "refs/heads/main") + runMigrationGit(t, fixture.dir, "remote", "add", "origin", remoteDir) + + refspecs := []string{ + fixture.mainHash.String() + ":refs/heads/main", + fixture.featureHash.String() + ":refs/heads/" + testFeatureBranchName, + } + if refExists(t, fixture.repo, plumbing.ReferenceName(paths.V2MainRefName)) { + refspecs = append(refspecs, paths.V2MainRefName+":"+paths.V2MainRefName) + } + if refExists(t, fixture.repo, plumbing.ReferenceName(paths.V2FullCurrentRefName)) { + refspecs = append(refspecs, paths.V2FullCurrentRefName+":"+paths.V2FullCurrentRefName) + } + runMigrationGit(t, fixture.dir, append([]string{"push", "origin"}, refspecs...)...) 
+ + cloneDir := filepath.Join(t.TempDir(), "clone") + runMigrationGit(t, "", "clone", remoteDir, cloneDir) + return cloneDir +} + +func refExists(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) bool { + t.Helper() + + _, err := repo.Reference(refName, true) + if err == nil { + return true + } + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + return false +} + func rewriteV1SecondSessionToSparseSlot(t *testing.T, repo *git.Repository, cpID id.CheckpointID) { t.Helper() diff --git a/cmd/migrate-v2-checkpoints/v2_preflight.go b/cmd/migrate-v2-checkpoints/v2_preflight.go new file mode 100644 index 000000000..c300da305 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_preflight.go @@ -0,0 +1,167 @@ +package main + +import ( + "context" + "errors" + "fmt" + "sort" + "strings" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint/remote" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/strategy" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" +) + +const ( + v2RefFetchTimeout = 2 * time.Minute + v2MainFetchTmpRef = strategy.FetchTmpRefPrefix + "migrate-v2-main" +) + +func ensureLatestV2Refs(ctx context.Context, repoRoot string, repo *git.Repository) error { + fetchTarget, err := remote.FetchURL(ctx, remote.FetchURLOptions{WorktreeRoot: repoRoot}) + if err != nil { + if localV2MainRefExists(repo) { + return nil + } + return fmt.Errorf("resolve v2 checkpoint fetch target: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, v2RefFetchTimeout) + defer cancel() + + remoteRefs, err := listRemoteV2Refs(ctx, repoRoot, fetchTarget) + if err != nil { + return err + } + if _, ok := remoteRefs[paths.V2MainRefName]; !ok { + return fmt.Errorf("%s not found on remote %s", paths.V2MainRefName, remote.RedactURL(fetchTarget)) + } + + if err := fetchV2MainRef(ctx, repoRoot, repo, fetchTarget); err != nil { + return err + } + if err := fetchV2FullRefs(ctx, repoRoot, fetchTarget, 
remoteRefs); err != nil { + return err + } + return nil +} + +func localV2MainRefExists(repo *git.Repository) bool { + _, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + return err == nil +} + +func listRemoteV2Refs(ctx context.Context, repoRoot, fetchTarget string) (map[string]struct{}, error) { + output, err := remote.LsRemoteInDir(ctx, repoRoot, fetchTarget, "refs/entire/checkpoints/v2/*") + if err != nil { + return nil, fmt.Errorf("list remote v2 checkpoint refs from %s: %w", remote.RedactURL(fetchTarget), err) + } + + refs := make(map[string]struct{}) + for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") { + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + refs[fields[1]] = struct{}{} + } + return refs, nil +} + +func fetchV2MainRef(ctx context.Context, repoRoot string, repo *git.Repository, fetchTarget string) error { + refSpec := fmt.Sprintf("+%s:%s", paths.V2MainRefName, v2MainFetchTmpRef) + output, err := remote.Fetch(ctx, remote.FetchOptions{ + Remote: fetchTarget, + RefSpecs: []string{refSpec}, + NoTags: true, + NoFilter: true, + Dir: repoRoot, + }) + if err != nil { + return fetchV2RefsError("fetch v2 /main", fetchTarget, output, err) + } + + tmpRefName := plumbing.ReferenceName(v2MainFetchTmpRef) + defer func() { _ = repo.Storer.RemoveReference(tmpRefName) }() //nolint:errcheck // cleanup is best-effort + + tmpRef, err := repo.Reference(tmpRefName, true) + if err != nil { + return fmt.Errorf("v2 /main not found after fetch (tmp ref %s missing): %w", tmpRefName, err) + } + if err := strategy.SafelyAdvanceLocalRef(ctx, repo, plumbing.ReferenceName(paths.V2MainRefName), tmpRef.Hash()); err != nil { + return fmt.Errorf("advance local %s: %w", paths.V2MainRefName, err) + } + return nil +} + +func fetchV2FullRefs(ctx context.Context, repoRoot, fetchTarget string, remoteRefs map[string]struct{}) error { + refSpecs := v2FullRefSpecs(remoteRefs) + if len(refSpecs) == 0 { + return nil + } + 
+ output, err := remote.Fetch(ctx, remote.FetchOptions{ + Remote: fetchTarget, + RefSpecs: refSpecs, + NoTags: true, + NoFilter: true, + Dir: repoRoot, + }) + if err != nil { + return fetchV2RefsError("fetch v2 /full refs", fetchTarget, output, err) + } + return nil +} + +func v2FullRefSpecs(remoteRefs map[string]struct{}) []string { + refSpecs := make([]string, 0, len(remoteRefs)) + for refName := range remoteRefs { + if !isV2FullRefName(refName) { + continue + } + refSpec := refName + ":" + refName + if refName == paths.V2FullCurrentRefName { + refSpec = "+" + refSpec + } + refSpecs = append(refSpecs, refSpec) + } + sort.Strings(refSpecs) + return refSpecs +} + +func isV2FullRefName(refName string) bool { + prefix := strings.TrimSuffix(paths.V2FullCurrentRefName, "current") + if !strings.HasPrefix(refName, prefix) { + return false + } + suffix := strings.TrimPrefix(refName, prefix) + if suffix == "current" { + return true + } + if len(suffix) != 13 { + return false + } + for _, r := range suffix { + if r < '0' || r > '9' { + return false + } + } + return true +} + +func fetchV2RefsError(action, fetchTarget string, output []byte, err error) error { + if errors.Is(err, context.DeadlineExceeded) { + return fmt.Errorf("%s timed out after %s", action, v2RefFetchTimeout) + } + + redactedTarget := remote.RedactURL(fetchTarget) + msg := strings.TrimSpace(strings.ReplaceAll(string(output), fetchTarget, redactedTarget)) + if msg != "" { + return fmt.Errorf("%s from %s failed: %s: %w", action, redactedTarget, msg, err) + } + return fmt.Errorf("%s from %s failed: %w", action, redactedTarget, err) +} From 773c30453cbab1b5f33d663f0c42a2f2b81d85bf Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Wed, 27 May 2026 17:36:05 -0700 Subject: [PATCH 24/35] Preserve v2 checkpoint authors during migration Entire-Checkpoint: 238dcfa519fc --- cmd/migrate-v2-checkpoints/VALIDATION.md | 300 +++++++++++++----- cmd/migrate-v2-checkpoints/main_test.go | 48 ++- 
cmd/migrate-v2-checkpoints/migration.go | 38 +-- cmd/migrate-v2-checkpoints/v2_author.go | 61 ++++ cmd/migrate-v2-checkpoints/v2_fixture_test.go | 35 +- 5 files changed, 376 insertions(+), 106 deletions(-) create mode 100644 cmd/migrate-v2-checkpoints/v2_author.go diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 92143a36b..e5092be4b 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -29,17 +29,31 @@ Tested against the `tmp-migrate-v2-script-go` branch of the CLI at ### 1.1 Discovery (`cmd/migrate-v2-checkpoints/history.go`) - Walks every history tip (branches under `refs/heads/*` and `refs/remotes/*/*`, - excluding `entire/checkpoints/v1` and `entire/trails/v1`). + excluding `entire/checkpoints/v1`, `entire/trails/v1`, and any `*/HEAD` + symbolic ref). Falls back to `HEAD` if no other tips qualify. - For each commit on those tips, parses `Entire-Checkpoint: ` trailers (`trailers.ParseAllCheckpoints`, key constant `trailers/trailers.go:41`). One commit can carry many trailers (squash merges). -- Produces a list of `discoveredCheckpoint{ID, Commits}` — every checkpoint ID - ever referenced in commit history, plus the commits that mention it. +- After the trailer walk, lists every checkpoint ID on + `refs/entire/checkpoints/v2/main` (`addV2OrphanCheckpoints`). Any v2 /main + ID not already discovered through a commit trailer is appended as an + **orphan** — a `discoveredCheckpoint{ID, Commits: nil}` with no commit + attribution. Orphans flow through the migration filter the same way as + commit-attributed candidates; only their reporting label differs. +- Produces a list of `discoveredCheckpoint{ID, Commits}` — every checkpoint + ID ever referenced in commit history plus every v2 /main ID, sorted by ID. - `--since `/positional commit narrows to commits not reachable from - the named commit. `--head ` restricts to a single tip. -- Discovery is **not** v2-specific. 
It is a universe of "every checkpoint we - ever ran on a commit reachable from a real ref." + the named commit. `--head ` restricts to a single tip. **Either + flag suppresses the v2 /main orphan augmentation**: when commit scope is + set the tool re-runs the trailer walk unscoped, counts how many v2 /main + IDs would have been newly discovered as orphans, and prints + `warning: N v2 orphans skipped; re-run without --since/--head to include + them` to stdout before the report. Those IDs are **not** added to the + migration plan in the scoped run. +- Discovery is **not** v2-specific by default, but the orphan augmentation + reaches into v2 /main, so v2 refs (or at least the local copy) influence + the candidate set. ### 1.2 Migration filter (`cmd/migrate-v2-checkpoints/migration.go`) @@ -47,7 +61,9 @@ For each discovered checkpoint: 1. Read v1 summary from `entire/checkpoints/v1`. If present, collect existing v1 session IDs by reading each session's `metadata.json` (`session_id` - field). + field). v1 session paths are recovered from + `summary.Sessions[*].Metadata` via `v1SessionIndexFromSummary`, so sparse + or non-contiguous v1 indices are handled correctly. 2. Read v2 summary from `refs/entire/checkpoints/v2/main`. If absent or has no sessions → `missing v2 checkpoint metadata` and skip. 3. For every session index in the v2 summary: @@ -58,12 +74,22 @@ For each discovered checkpoint: `/full/<13-digit-suffix>` refs. `ErrNoTranscript` → `missing raw transcripts`. - Otherwise: count `sessions eligible for migration`, and on `--apply` - write to v1 via `GitStore.WriteCommitted` using v2-sourced fields. + resolve the v2 `/main` commit that last touched that session's + `metadata.json`, then write to v1 via `GitStore.WriteCommitted` using + v2-sourced fields and that original v2 commit author line. The transcript + is wrapped in `redact.AlreadyRedacted(...)` so the v1 writer does not + re-redact bytes that were already redacted on v2. 
A checkpoint is **eligible** if at least one v2 session is missing from v1 and fully readable from v2. The candidate's `sessions=N` is that net count, not the v2 session count. +Additionally, the report tracks how many eligible checkpoints were orphans +(discovered through v2 /main alone, with no commit trailer attribution). An +eligible checkpoint with `len(discovered.Commits) == 0` increments the +`v2 orphan checkpoints eligible for migration` counter; this is a subset of +`checkpoints eligible for migration`, never larger than `EC`. + ### 1.3 What ends up on v1 after `--apply` For each migrated session, the v1 tree at `///` gains: @@ -77,21 +103,26 @@ For each migrated session, the v1 tree at `///` gains: Plus the root `//metadata.json` gets rewritten to add the new session to `sessions[]` and recompute aggregate fields (see §3.2). +Each migrated v1 metadata-branch commit uses the author name, email, and author +timestamp from the v2 `/main` commit that wrote the corresponding v2 session +`metadata.json`; the session metadata's own `created_at` remains the v2 JSON +value. `` is the v1 slot. New sessions append (`findSessionIndex` in -`committed.go:326`); if v1 already had session 0 and v2 contributes one new +`committed.go:610`); if v1 already had session 0 and v2 contributes one new session, it lands in v1 slot 1. v1 indices and v2 indices for the **same** checkpoint can differ; only `session_id` is invariant across the two stores. -Chunking note: `full.jsonl` is chunked via `agent.ChunkTranscript`. Chunks are -`full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … (`agent/chunking.go:122` -with `ChunkSuffix = ".%03d"`). Index 0 has no suffix. +Chunking note: `full.jsonl` is chunked via `agent.ChunkTranscript`. Chunks +are `full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … +(`agent/chunking.go:126` `ChunkFileName`, with +`ChunkSuffix = ".%03d"` at line 19). Index 0 has no suffix. 
Codex caveat: for sessions whose agent is `codex`, `writeTranscript` applies `codex.SanitizePortableTranscript` before chunking and hashing -(`committed.go:745-747`). The bytes written to v1 may differ from the bytes -read out of v2's `/full/*`, but they are still self-consistent against the new -v1 `content_hash.txt`. +(`committed.go:746`). The bytes written to v1 may differ from the bytes +read out of v2's `/full/*`, but they are still self-consistent against the +new v1 `content_hash.txt`. ## 2. Run modes & expected report shape @@ -102,11 +133,21 @@ $ migrate-v2-checkpoints [--repo PATH] [--since SHA | SHA] [--head SHA] \ Default mode is `plan` (same output as `--dry-run`). +For `--dry-run` and `--apply` (but not `--list`), the tool resolves the +checkpoint fetch remote and refreshes `refs/entire/checkpoints/v2/main` plus +every `refs/entire/checkpoints/v2/full/*` ref before discovery (see +`ensureLatestV2Refs` in `v2_preflight.go`). If the remote can't be resolved +and a local v2 /main ref exists, fetch is skipped silently; if neither +condition holds, the tool errors out before doing any work. + `--list` produces one line per checkpoint: ```text [ ...] + (orphan) ``` -This is the **universe** discovered in history — NOT the eligible set. +The first form is for commit-attributed IDs; the second is for orphans +(IDs present on v2 /main with no commit trailer in history). This is the +**universe** discovered — NOT the eligible set. `--dry-run` / `--apply` produces: ```text @@ -117,27 +158,46 @@ Migration plan: (or "Migration result:" on --apply) missing required v2 session metadata: M2 missing raw transcripts: M3 checkpoints eligible for migration: EC + v2 orphan checkpoints eligible for migration: EO sessions eligible for migration: ES migrated checkpoints: ... (--apply only) migrated sessions: ... (--apply only) - checkpoints to migrate: + checkpoints to migrate: (or "migrated checkpoint details:" on --apply) sessions=N commits=[,...] 
+ sessions=N commits=(orphan) +``` + +If `--since` or `--head` is set and the v2 /main ref carries IDs the scoped +trailer walk wouldn't have found, the tool prints a single line **before** +the report: +```text +warning: N v2 orphans skipped; re-run without --since/--head to include them ``` Invariants that should always hold on the report: - `EC ≤ D`. +- `EO ≤ EC` (orphan-eligible is a subset of eligible). - `ES ≥ EC` (each eligible checkpoint contributes ≥ 1 eligible session). - `ES = Σ candidate.SessionCount`. The candidate list is exhaustive. +- The candidate list is sorted by `` ascending; commit SHAs within + a candidate are sorted by commit date descending (most recent first), + ties broken by hash. Orphan candidates print `commits=(orphan)` instead + of a SHA list. - On `--apply`: `migrated checkpoints = EC` and `migrated sessions = ES` if - no write errors. Anything less means a partial write failure — re-run the - tool and the remainder should re-appear as eligible. -- `D = EC + (checkpoints with all v2 sessions in v1) + (checkpoints with - any missing-metadata / missing-transcript failure modes)`. + no write errors. Anything less means a partial write failure — re-run + the tool and the remainder should re-appear as eligible. +- `D = EC + (checkpoints with v2 summary but eligibleSessions==0) + M1`. + The middle term covers both "all v2 sessions already in v1" and "every + v2 session was unreadable (missing metadata or transcript)" — those land + in the per-session counters `A`, `M2`, `M3` rather than dropping the + checkpoint at the summary level. - Counter sums for skipped sessions: `A + M2 + M3 = (Σ over all v2 sessions in checkpoints whose v2 summary - exists) − ES`. Useful for spot-checking after `--apply`: if `A` is large - and `EC` is small, most v2 checkpoints are already mirrored. + exists) − ES`. Useful for spot-checking after `--apply`: if `A` is + large and `EC` is small, most v2 checkpoints are already mirrored. 
If + `EO` is close to `EC` and `A` is small, this repo skipped v2 entirely + and the migration is largely "import from v2 /main." ## 3. Validation procedure @@ -160,11 +220,21 @@ git -C "$REPO" for-each-ref 'refs/entire/checkpoints/v2/full/*' \ --format='%(refname)' ``` -If `entire/checkpoints/v1` is missing the migration can still apply (it will -be created), but if the v2 refs are missing there is nothing to migrate. +If `entire/checkpoints/v1` is missing the migration can still apply (it +will be created), but if the v2 refs are missing there is nothing to +migrate. -Also sanity-check the head of v2 isn't surprising — a recent commit means v2 -was being dual-written; a long-stale v2 head matches the rollback narrative: +`--dry-run` and `--apply` auto-fetch v2 refs from the repo's checkpoint +remote before discovery (`ensureLatestV2Refs`), so the local state +*after* those modes run will reflect the remote. Pre-flight is still +useful to (a) catch a missing local v1 branch and (b) sanity-check that +the local v2 /main looks like it's frozen rather than actively +advancing. `--list` does **not** auto-fetch; if you intend to inspect +the universe via `--list`, pre-fetch manually (see §9). + +Also sanity-check the head of v2 isn't surprising — a recent commit +means v2 was being dual-written; a long-stale v2 head matches the +rollback narrative: ```sh git -C "$REPO" log -1 --format='%h %ci %s' refs/entire/checkpoints/v2/main @@ -179,17 +249,28 @@ git -C "$REPO" log -1 --format='%h %ci %s' refs/entire/checkpoints/v2/main Spot-check the counter math against §2: ```sh -grep -E "^ (discovered|already|missing|checkpoints eligible|sessions eligible)" \ +grep -E "^ (discovered|already|missing|checkpoints eligible|v2 orphan|sessions eligible)" \ /tmp/migrate.plan ``` -- `EC ≤ D` and `ES ≥ EC`. +- `EC ≤ D` and `EO ≤ EC` and `ES ≥ EC`. - For each candidate line, parse `sessions=N` and sum — must equal `ES`. 
+- The number of candidate lines with `commits=(orphan)` must equal `EO`. ```sh awk '/^ [0-9a-f]{12} sessions=/ {sub(/sessions=/,"",$2); s+=$2} END {print s}' \ /tmp/migrate.plan # Should equal the "sessions eligible for migration" value. + +grep -cE '^ [0-9a-f]{12} sessions=[0-9]+ commits=\(orphan\)$' /tmp/migrate.plan +# Should equal the "v2 orphan checkpoints eligible for migration" value. +``` + +If the run was launched with `--since` or `--head`, also confirm the +orphan-skip warning matches expectations: + +```sh +grep "^warning: " /tmp/migrate.plan || echo "(no scope-orphan warning)" ``` ### 3.3 Step B — confirm every candidate is genuinely v2-only-or-partial @@ -246,26 +327,33 @@ wc -l /tmp/v2_only_ids.txt Every ID in `v2_only_ids.txt` should be either a candidate, or — if v2 has no session metadata for it / no raw transcript — a contributor to the `missing v2 checkpoint metadata` / `missing raw transcripts` counters. +Orphan candidates also live in this set: they are exactly v2 /main IDs +with no commit attribution but with intact v2 metadata + transcripts. A quick predicate: the eligible candidate count plus the missing-metadata and missing-raw counters should equal or exceed the v2-only set. If it's less, something is being silently dropped. 
```sh -EC=$(grep "checkpoints eligible" /tmp/migrate.plan | awk '{print $NF}') +EC=$(grep "checkpoints eligible for migration" /tmp/migrate.plan | awk '{print $NF}') +EO=$(grep "v2 orphan checkpoints" /tmp/migrate.plan | awk '{print $NF}') M1=$(grep "missing v2 checkpoint metadata" /tmp/migrate.plan | awk '{print $NF}') M3=$(grep "missing raw transcripts" /tmp/migrate.plan | awk '{print $NF}') echo "v2-only on disk: $(wc -l < /tmp/v2_only_ids.txt)" -echo "EC=$EC M1=$M1 M3=$M3 (EC + M1 + M3 must be >= v2-only count)" +echo "EC=$EC EO=$EO M1=$M1 M3=$M3" +echo " EC + M1 + M3 must be >= v2-only count" +echo " EO <= EC must hold (orphan is a subset of eligible)" ``` -(`>=` rather than `=` because `M1`/`M3` are counted per-checkpoint over the -entire discovered universe, not only the v2-only set.) +(`>=` rather than `=` because `M1`/`M3` are counted per-checkpoint / +per-session over the entire discovered universe, not only the v2-only +set. `EO` is exactly the subset of `EC` whose discovery came from v2 +/main alone.) ### 3.4 Step C — confirm commit-list accuracy -The report's `commits=...` are short SHAs of commits in history whose message -carries `Entire-Checkpoint: `. Verify directly: +The report's `commits=...` are short SHAs of commits in history whose +message carries `Entire-Checkpoint: `. Verify directly: ```sh ID=02d9783342a2 @@ -275,11 +363,16 @@ git -C "$REPO" log --all --format='%h %s' --grep "Entire-Checkpoint: $ID" The set of short SHAs that this prints should match the report's `commits=…` for that ID. If they differ: +- `commits=(orphan)` in the report means the ID is on v2 /main but no + reachable commit message carries its trailer. `git log --grep` should + produce **no** output for that ID. If it does produce output, something + is wrong — either the trailer walk dropped the commit or the orphan + pass mislabelled the candidate. - Extra in the report but absent here: the discovery walk picked up a tip this `--all` view doesn't include (rare). 
- Extra here but absent in the report: a tip was filtered out - (`entire/checkpoints/v1`, `entire/trails/v1`, or `HEAD` aliases — the - filter is in `history.go:182-205`). + (`entire/checkpoints/v1`, `entire/trails/v1`, or `*/HEAD` symbolic refs + — see `isInternalHistoryRefName` / `isHistoryRef` in `history.go`). A commit may also appear under multiple candidate IDs if it's a squash merge with multiple trailers; that's expected. @@ -333,6 +426,13 @@ sample matches 1:1, the report's accounting is trustworthy. ## 4. Apply the migration +> ⛔ **Human operator only. Agents must not run `--apply`.** If an agent is +> helping with this runbook, it may prepare commands, inspect dry-run output, +> update documentation, and analyze validation results, but it must stop before +> executing any command that includes `--apply`. The repository owner/operator +> runs the apply command manually in their own terminal and then shares the +> resulting report/output for follow-up validation. + > ⛔ **No `git push` for `entire/checkpoints/v1` from this point until §5 > has fully passed and the operator has consciously decided to publish.** > The migration itself never pushes — but the v1 branch is the same ref @@ -360,10 +460,14 @@ a separate, explicit decision once the post-apply checks in §5 pass. - §3 ran clean: the candidate list looks plausible, counter math adds up, and a spot sample (Steps C and D) confirmed the candidates really are v2-only / partial migrations. -- The local repo has the v2 refs. If `git -C "$REPO" show-ref - refs/entire/checkpoints/v2/main` is empty, the migration will silently - count everything as "missing v2 checkpoint metadata" and write nothing. - Pre-fetch: +- The repo has a resolvable checkpoint fetch remote, OR a local + `refs/entire/checkpoints/v2/main` ref. 
`--apply` calls + `ensureLatestV2Refs` first and will refresh v2 /main and every + `refs/entire/checkpoints/v2/full/*` ref from the remote (forced fetch of + `/full/current`, fast-forward fetch of archives). If the fetch target + can't be resolved and no local v2 /main exists, the tool errors out + before doing any work. A manual pre-fetch is no longer required, but + remains a safe no-op: ```sh git -C "$REPO" fetch origin \ @@ -387,13 +491,15 @@ APPLIED_REPORT="/tmp/migrate-${REPO_NAME}.applied" PRE_APPLY_TIP=$(git -C "$REPO" rev-parse entire/checkpoints/v1 2>/dev/null || echo "none") echo "pre-apply v1 tip: $PRE_APPLY_TIP" +# USER ONLY: an agent must not execute this command. # Apply. Tee the report into /tmp/migrate-${REPO_NAME}.applied — §5 reads it back. "$TOOL" --repo "$REPO" --apply | tee "$APPLIED_REPORT" # Sanity-check the report. -grep -E "^ (checkpoints eligible|sessions eligible|migrated)" "$APPLIED_REPORT" +grep -E "^ (checkpoints eligible|v2 orphan|sessions eligible|migrated)" "$APPLIED_REPORT" # migrated checkpoints == checkpoints eligible # migrated sessions == sessions eligible +# v2 orphan ... == subset of EC (informational; not a pass/fail gate) # Anything less means at least one write failed silently — re-run --apply # (idempotent) and inspect logs. @@ -433,7 +539,8 @@ git -C "$REPO" log --format='%h %ci %s' \ ### Operator checkpoint -**Stop here. Run the apply command yourself and confirm:** +**Stop here. If you are an agent, do not run `--apply`. The human operator +must run the apply command themselves and confirm:** 1. `migrated checkpoints` equals `checkpoints eligible for migration` from the dry-run. @@ -486,7 +593,7 @@ git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json" | jq . ``` Expected shape (schema lives at -`cmd/entire/cli/checkpoint/checkpoint.go:527-562`): +`cmd/entire/cli/checkpoint/checkpoint.go:545-563`): ```jsonc { @@ -533,10 +640,11 @@ Acceptable differences: the candidate's contributions are appended. 
- `combined_attribution`/`token_usage` may differ if the v1 store aggregates across all sessions present and v1 already had different - sessions. For purely v2-only checkpoints (the typical case the user - cares about) these should match the v2 summary exactly, since the - migration uses `summary.CombinedAttribution` from v2 verbatim - (`migration.go:199`) and per-session token usage is replayed from v2. + sessions. For purely v2-only checkpoints (the typical case, which + includes all orphan candidates) these should match the v2 summary + exactly, since the migration uses `summary.CombinedAttribution` from + v2 verbatim (`migration.go:242`) and per-session token usage is + replayed from v2. Hard requirements: @@ -568,7 +676,7 @@ echo "session $WANT_SID lives in v1 slot $V1_SLOT" ``` Then diff the per-session metadata, comparing **fields that are expected to -survive migration** (`migration.go:173-205` lists them explicitly): +survive migration** (`migration.go:216-248` lists them explicitly): ```sh V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) @@ -597,12 +705,13 @@ diff <(git -C "$REPO" cat-file -p \ Expected: no diff. Special cases: -- `created_at` is replayed from v2's `created_at` and also used as v1's - `CommitTime` (`migration.go:178-179`). The two timestamps in the v1 file - should be identical when serialised. +- `created_at` is replayed from v2's `created_at` into the v1 metadata JSON. + The v1 metadata-branch commit timestamp is a separate git author timestamp + copied from the v2 `/main` commit that last touched this session's + `metadata.json`. - The migration sets `HasReview = session.Kind(meta.Kind).IsReview()` - (`migration.go:204`). For non-review kinds this is `false` and may have - been absent (omitempty) in v2; that's still a match. + (`migration.go:247`). For non-review kinds this is `false` and may + have been absent (omitempty) in v2; that's still a match. - `cli_version` on the v1 session may differ from v2's. 
The migration doesn't pass `CLIVersion`, so v1 inherits whatever default the writer applies — generally an empty value or the current binary's version. Not @@ -620,6 +729,23 @@ git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/$V1_SLOT/metadata.json" > /dev/null && echo OK ``` +Author parity for the metadata-branch commit: + +```sh +V2_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ + refs/entire/checkpoints/v2/main -- "$SHARD/$V2_SLOT/metadata.json") +V1_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ + entire/checkpoints/v1 -- "$SHARD/$V1_SLOT/metadata.json") + +echo "v2: $V2_AUTHOR" +echo "v1: $V1_AUTHOR" +[ "$V1_AUTHOR" = "$V2_AUTHOR" ] && echo OK || echo MISMATCH +``` + +Expected: exact match. For orphan candidates this is still valid: the v2 +`/main` path history is the source of the author line even though no user +commit trailer exists. + ### 5.3 Step G — `prompt.txt` content The migration joins v2 prompts (split form on disk) back into a single @@ -651,13 +777,13 @@ This is the most important check. Two layers: Reassemble logic: ordered list `full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … For most agents this is JSONL with `\n` separators -between chunks (`agent/chunking.go:108-118`); for `vogon`, OpenCode etc. -the agent's own `ReassembleTranscript` is used at read time. For -validation, byte-concatenation in chunk order is what the v1 writer -hashed (`committed.go:784` — the hash is over `transcriptBytes` BEFORE -chunking), so the easier check is to read the original v1 input bytes -back via the v1 store API, OR to validate that each chunk blob is what -the v1 writer would have produced. +between chunks (`agent.ReassembleJSONL` in `agent/chunking.go:109-118`); +for `vogon`, OpenCode etc. the agent's own `ReassembleTranscript` is +used at read time. 
For validation, byte-concatenation in chunk order is +what the v1 writer hashed (`committed.go:784` — the hash is over +`transcriptBytes` BEFORE chunking), so the easier check is to read the +original v1 input bytes back via the v1 store API, OR to validate that +each chunk blob is what the v1 writer would have produced. The simplest robust shell check: reconstruct via ordered concat and compute the digest, then compare to `content_hash.txt`. This is exact for @@ -733,7 +859,7 @@ echo "v2 raw_transcript: $RAW_HASH" For non-Codex agents, the two hashes should match. For Codex (agent field on the session metadata is `codex`), they are allowed to differ — v1 sanitizes via `codex.SanitizePortableTranscript` before hashing -(`committed.go:745-747`). The v1 self-consistency check above is still +(`committed.go:746`). The v1 self-consistency check above is still required in that case. ### 5.5 Step I — bulk sweep @@ -861,7 +987,9 @@ from §4's "Behavior notes" — cheap and local, because you did not push. | `missing v2 checkpoint metadata: N (large)` | v2 `/main` is missing or its tree lacks summaries for many discovered IDs. | Confirm `refs/entire/checkpoints/v2/main` exists, was fetched, and is reasonably recent. | | `missing required v2 session metadata: > 0` | v2 session `metadata.json` lacks `checkpoint_id` or `session_id`. Could indicate corruption or a partial v2 write. | Inspect the affected sessions manually; they will be skipped, not failed. | | `missing raw transcripts: > 0` | v2 `/main` has a session but `/full/current` and archived `/full/*` don't carry its `raw_transcript*` data. | Confirm archived `/full/*` refs are present locally (or accessible via remote fetch). | -| Candidate `commits=` is empty | Shouldn't happen by construction (discovery groups by commit). Investigate the bug. | File a bug. | +| Candidate `commits=(orphan)` | The ID is on v2 /main with no commit-trailer attribution in history. Expected and benign; counted by `EO`. 
| None — verify against `git log --grep` in §3.4 to confirm there's no missed trailer. | +| `warning: N v2 orphans skipped` on a `--since`/`--head` run | Commit-scoped run found N v2 /main IDs that an unscoped walk would have surfaced as orphan candidates. | Re-run without `--since`/`--head` to include them, or accept the scope deliberately. | +| `v2 orphan checkpoints eligible for migration > checkpoints eligible for migration` | Should be impossible (`EO ⊆ EC` by construction). | File a bug. | | `sessions=N` for a candidate doesn't match the §3.5 expected | Either v1 already has the session (so report should have lower N), or session IDs are non-unique within v2. | Inspect; non-unique session IDs are a v2 corruption. | | Post-apply, `content_hash.txt` ≠ recomputed SHA-256 | Codex agent + ours-vs-original sanitization difference, OR a bug. Confirm `agent` field on the session. | If non-Codex, file a bug with chunk listing + bytes. | | Post-apply, `content_hash.txt` matches but v2's `raw_transcript_hash.txt` doesn't | Codex sanitization (expected) OR transcript was rewritten in transit. Confirm agent first. | If non-Codex, file a bug. | @@ -891,19 +1019,32 @@ from §4's "Behavior notes" — cheap and local, because you did not push. ## 8. Source map - Tool entry: `cmd/migrate-v2-checkpoints/main.go` -- History walk: `cmd/migrate-v2-checkpoints/history.go` -- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` +- History walk: `cmd/migrate-v2-checkpoints/history.go` — + `discoverCheckpointHistoryWithSkippedOrphans` (line 55), + `addV2OrphanCheckpoints` (line 364), `listV2MainCheckpointIDs` + (line 408), `writeCheckpointList` (line 484, includes the `(orphan)` + label), `writeDiscoveryWarnings` (line 497, prints the scope-skip + warning). +- v2 ref auto-fetch: `cmd/migrate-v2-checkpoints/v2_preflight.go` — + `ensureLatestV2Refs` (line 24), `fetchV2MainRef` (line 75), + `fetchV2FullRefs` (line 101). 
+- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` — + `migrateDiscoveredCheckpoints` (line 53), `migrateCheckpoint` + (line 98), `writeOptionsFromV2Content` (line 214), + `writeMigrationReport` (line 249), `candidateCommitLabel` (line 288, + emits `(orphan)`). - v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` - (line 52), `writeStandardCheckpointEntries` (line 310), + (line 58), `writeStandardCheckpointEntries` (line 310), `writeSessionToSubdirectory` (line 404), `writeTranscript` (line 720), - `findSessionIndex` (line 326). + `findSessionIndex` (line 610). - v2 read: `cmd/entire/cli/checkpoint/v2_read.go` — `ReadCommitted` - (line 24), `ReadSessionMetadataAndPrompts` (line 205), + (line 26), `ReadSessionMetadataAndPrompts` (line 205), `ReadSessionContent` (line 274), `readTranscriptFromFullRefs` - (line 339), `readTranscriptFromRef` (line 540). + (line 342), `readTranscriptFromRef` (line 540), + `isV2ArchivedFullRefSuffix` (line 523). - Schemas: `cmd/entire/cli/checkpoint/checkpoint.go` — `CheckpointSummary` - (line 527), `CommittedMetadata` (line 443), `SessionFilePaths` - (line 517). + (line 545), `CommittedMetadata` (line 444), `SessionFilePaths` + (line 520). - Trailer parsing: `cmd/entire/cli/trailers/trailers.go`. - Chunking: `cmd/entire/cli/agent/chunking.go`. - Sanitization (Codex only): `cmd/entire/cli/agent/codex/` @@ -917,16 +1058,23 @@ from §4's "Behavior notes" — cheap and local, because you did not push. remote refs the candidate list may include IDs whose underlying commits are only reachable via those remotes. That's still correct — those commits really did reference the IDs. -- If the v2 refs aren't fetched locally (the default refspec excludes - `refs/entire/*`), discovery will still find IDs from trailers but the - per-checkpoint v2 reads will fail with "missing v2 checkpoint metadata." - Pre-fetch with: +- `--dry-run` / `--apply` auto-fetch v2 refs from the repo's checkpoint + remote (`ensureLatestV2Refs`). 
If the remote resolves, you get an + up-to-date local copy of `refs/entire/checkpoints/v2/main` and every + `refs/entire/checkpoints/v2/full/*`; if it doesn't, the tool only + proceeds when a local v2 /main ref is already present. `--list` does + **not** auto-fetch — if you want a candidate universe that reflects + the remote, refresh manually first: ```sh git -C "$REPO" fetch origin \ 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' ``` +- Orphan augmentation is enabled by default. Pass `--since` or `--head` + if you intentionally want to exclude v2-only IDs from migration; the + tool will still print a single-line warning summarising how many were + skipped. - The tool is **idempotent** in `--apply` mode. Re-running after a successful apply should produce `checkpoints eligible for migration: 0` modulo any new v2 data that landed in the meantime. - The tool only writes to the local repo. After `--apply`, push the - updated v1 branch yourself when ready. + updated v1 branch yourself when ready (and only after §5 passes). 
diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 7f88039c8..8ee39616f 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -295,7 +295,10 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { fixture := setupMigrationHistoryRepo(t) cpID := id.MustCheckpointID(mainCheckpointID) createdAt := time.Date(2024, 5, 6, 7, 8, 9, 0, time.UTC) + v2AuthorWhen := time.Date(2024, 5, 6, 8, 9, 10, 0, time.UTC) transcript := []byte("{\"type\":\"assistant\",\"message\":\"migrated\"}\n") + v2AuthorName := "Original V2 Author" + v2AuthorEmail := "original-v2@example.com" writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ CheckpointID: cpID, @@ -307,8 +310,9 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { Prompts: []string{"first prompt", "second prompt"}, FilesTouched: []string{"main.go"}, CheckpointsCount: 2, - AuthorName: testAuthorName, - AuthorEmail: testAuthorEmail, + AuthorName: v2AuthorName, + AuthorEmail: v2AuthorEmail, + AuthorWhen: v2AuthorWhen, Agent: agent.AgentTypeClaudeCode, Model: "claude-test-model", TurnID: "turn-1", @@ -359,14 +363,23 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.NoError(t, err) commit, err := fixture.repo.CommitObject(ref.Hash()) require.NoError(t, err) - require.True(t, commit.Author.When.Equal(createdAt), "author time = %s, want %s", commit.Author.When, createdAt) + require.Equal(t, v2AuthorName, commit.Author.Name) + require.Equal(t, v2AuthorEmail, commit.Author.Email) + require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) } func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { t.Parallel() cpID := id.MustCheckpointID("777777777777") - fixture := setupMigrationOrphanRepo(t, cpID.String()) + v2AuthorWhen := time.Date(2024, 7, 8, 9, 10, 11, 0, time.UTC) + v2AuthorName := "Original Orphan Author" + 
v2AuthorEmail := "original-orphan@example.com" + fixture := setupMigrationOrphanRepoWithOptions(t, cpID.String(), testV2CheckpointOptions{ + AuthorName: v2AuthorName, + AuthorEmail: v2AuthorEmail, + AuthorWhen: v2AuthorWhen, + }) var stdout bytes.Buffer err := run(context.Background(), []string{ @@ -389,6 +402,14 @@ func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { require.Equal(t, "orphan-session", content.Metadata.SessionID) require.JSONEq(t, `{"message":"orphan"}`, string(content.Transcript)) + ref, err := fixture.repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + require.NoError(t, err) + commit, err := fixture.repo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.Equal(t, v2AuthorName, commit.Author.Name) + require.Equal(t, v2AuthorEmail, commit.Author.Email) + require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) + stdout.Reset() err = run(context.Background(), []string{ "--repo", fixture.dir, @@ -669,6 +690,12 @@ func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistoryFixture { t.Helper() + return setupMigrationOrphanRepoWithOptions(t, checkpointID, testV2CheckpointOptions{}) +} + +func setupMigrationOrphanRepoWithOptions(t *testing.T, checkpointID string, opts testV2CheckpointOptions) migrationHistoryFixture { + t.Helper() + dir := t.TempDir() testutil.InitRepo(t, dir) @@ -676,11 +703,14 @@ func setupMigrationOrphanRepo(t *testing.T, checkpointID string) migrationHistor repo, err := git.PlainOpen(dir) require.NoError(t, err) - writeTestV2Checkpoint(t, repo, testV2CheckpointOptions{ - CheckpointID: id.MustCheckpointID(checkpointID), - SessionID: "orphan-session", - Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"orphan\"}\n")), - }) + opts.CheckpointID = id.MustCheckpointID(checkpointID) + if opts.SessionID == "" { + 
opts.SessionID = "orphan-session" + } + if opts.Transcript.Len() == 0 { + opts.Transcript = redact.AlreadyRedacted([]byte("{\"message\":\"orphan\"}\n")) + } + writeTestV2Checkpoint(t, repo, opts) return migrationHistoryFixture{ dir: dir, diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 7c58a6ce4..409cea0ee 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -15,6 +15,7 @@ import ( "github.com/entireio/cli/redact" "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing/object" ) type migrationOptions struct { @@ -42,25 +43,22 @@ type migrationCandidate struct { } type checkpointMigrator struct { - v1Store *checkpoint.GitStore - v2Store *checkpoint.V2GitStore - opts migrationOptions - authorName string - authorEmail string - report *migrationReport + repo *git.Repository + v1Store *checkpoint.GitStore + v2Store *checkpoint.V2GitStore + opts migrationOptions + report *migrationReport } func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, discovered []discoveredCheckpoint, opts migrationOptions) (migrationReport, error) { - authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) v2Store := checkpoint.NewV2GitStore(repo) report := migrationReport{DiscoveredCheckpoints: len(discovered)} migrator := checkpointMigrator{ - v1Store: checkpoint.NewGitStore(repo), - v2Store: v2Store, - opts: opts, - authorName: authorName, - authorEmail: authorEmail, - report: &report, + repo: repo, + v1Store: checkpoint.NewGitStore(repo), + v2Store: v2Store, + opts: opts, + report: &report, } for _, discoveredCheckpoint := range discovered { @@ -144,7 +142,11 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di m.report.EligibleSessions++ if m.opts.apply { - writeOpts := writeOptionsFromV2Content(content, summary, m.authorName, m.authorEmail) + author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, 
sessionIndex) + if err != nil { + return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, sessionIndex, err) + } + writeOpts := writeOptionsFromV2Content(content, summary, author) if err := m.v1Store.WriteCommitted(ctx, writeOpts); err != nil { return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } @@ -211,21 +213,21 @@ func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" } -func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *checkpoint.CheckpointSummary, authorName, authorEmail string) checkpoint.WriteCommittedOptions { +func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *checkpoint.CheckpointSummary, author object.Signature) checkpoint.WriteCommittedOptions { meta := content.Metadata return checkpoint.WriteCommittedOptions{ CheckpointID: meta.CheckpointID, SessionID: meta.SessionID, CreatedAt: meta.CreatedAt, - CommitTime: meta.CreatedAt, + CommitTime: author.When, Strategy: meta.Strategy, Branch: meta.Branch, Transcript: redact.AlreadyRedacted(content.Transcript), Prompts: checkpoint.SplitPromptContent(content.Prompts), FilesTouched: meta.FilesTouched, CheckpointsCount: meta.CheckpointsCount, - AuthorName: authorName, - AuthorEmail: authorEmail, + AuthorName: author.Name, + AuthorEmail: author.Email, Agent: meta.Agent, Model: meta.Model, TurnID: meta.TurnID, diff --git a/cmd/migrate-v2-checkpoints/v2_author.go b/cmd/migrate-v2-checkpoints/v2_author.go new file mode 100644 index 000000000..bacf40172 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_author.go @@ -0,0 +1,61 @@ +package main + +import ( + "context" + "errors" + "fmt" + "strconv" + + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + + "github.com/go-git/go-git/v6" + 
"github.com/go-git/go-git/v6/plumbing" + "github.com/go-git/go-git/v6/plumbing/object" +) + +var errFoundV2SessionAuthor = errors.New("found v2 session author") + +func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + if err := ctx.Err(); err != nil { + return object.Signature{}, err //nolint:wrapcheck // Propagating context cancellation + } + + ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + if err != nil { + return object.Signature{}, fmt.Errorf("resolve %s: %w", paths.V2MainRefName, err) + } + + metadataPath := v2SessionMetadataPath(cpID, sessionIndex) + iter, err := repo.Log(&git.LogOptions{ + From: ref.Hash(), + Order: git.LogOrderCommitterTime, + PathFilter: func(path string) bool { + return path == metadataPath + }, + }) + if err != nil { + return object.Signature{}, fmt.Errorf("read %s history for %s: %w", paths.V2MainRefName, metadataPath, err) + } + defer iter.Close() + + var author object.Signature + err = iter.ForEach(func(commit *object.Commit) error { + if err := ctx.Err(); err != nil { + return err //nolint:wrapcheck // Propagating context cancellation + } + author = commit.Author + return errFoundV2SessionAuthor + }) + if errors.Is(err, errFoundV2SessionAuthor) { + return author, nil + } + if err != nil { + return object.Signature{}, fmt.Errorf("walk %s history for %s: %w", paths.V2MainRefName, metadataPath, err) + } + return object.Signature{}, fmt.Errorf("%s not found in %s history", metadataPath, paths.V2MainRefName) +} + +func v2SessionMetadataPath(cpID checkpointID.CheckpointID, sessionIndex int) string { + return cpID.Path() + "/" + strconv.Itoa(sessionIndex) + "/" + paths.MetadataFileName +} diff --git a/cmd/migrate-v2-checkpoints/v2_fixture_test.go b/cmd/migrate-v2-checkpoints/v2_fixture_test.go index aa92bca92..ae8356c05 100644 --- a/cmd/migrate-v2-checkpoints/v2_fixture_test.go +++ 
b/cmd/migrate-v2-checkpoints/v2_fixture_test.go @@ -39,6 +39,7 @@ type testV2CheckpointOptions struct { CheckpointsCount int AuthorName string AuthorEmail string + AuthorWhen time.Time Agent types.AgentType Model string TurnID string @@ -76,6 +77,9 @@ func writeTestV2Checkpoint(t *testing.T, repo *git.Repository, opts testV2Checkp if opts.AuthorEmail == "" { opts.AuthorEmail = testAuthorEmail } + if opts.AuthorWhen.IsZero() { + opts.AuthorWhen = opts.CreatedAt + } sessionIndex := writeTestV2MainCheckpoint(t, repo, opts) if opts.Transcript.Len() > 0 { @@ -197,7 +201,11 @@ func writeTestV2MainCheckpoint(t *testing.T, repo *git.Repository, opts testV2Ch Hash: summaryBlob, } - writeTestV2RefEntries(t, repo, refName, parentHash, entries, "test v2 main fixture") + writeTestV2RefEntriesWithAuthor(t, repo, refName, parentHash, entries, "test v2 main fixture", object.Signature{ + Name: opts.AuthorName, + Email: opts.AuthorEmail, + When: opts.AuthorWhen, + }) return sessionIndex } @@ -251,10 +259,31 @@ func readTestV2RefEntries(t *testing.T, repo *git.Repository, refName plumbing.R func writeTestV2RefEntries(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName, parentHash plumbing.Hash, entries map[string]object.TreeEntry, message string) { t.Helper() + authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) + writeTestV2RefEntriesWithAuthor(t, repo, refName, parentHash, entries, message, object.Signature{ + Name: authorName, + Email: authorEmail, + When: time.Now(), + }) +} + +func writeTestV2RefEntriesWithAuthor(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName, parentHash plumbing.Hash, entries map[string]object.TreeEntry, message string, author object.Signature) { + t.Helper() + treeHash, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, entries) require.NoError(t, err) - authorName, authorEmail := checkpoint.GetGitAuthorFromRepo(repo) - commitHash, err := checkpoint.CreateCommit(context.Background(), repo, 
treeHash, parentHash, message, authorName, authorEmail) + commit := &object.Commit{ + TreeHash: treeHash, + Author: author, + Committer: author, + Message: message, + } + if parentHash != plumbing.ZeroHash { + commit.ParentHashes = []plumbing.Hash{parentHash} + } + encoded := repo.Storer.NewEncodedObject() + require.NoError(t, commit.Encode(encoded)) + commitHash, err := repo.Storer.SetEncodedObject(encoded) require.NoError(t, err) require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(refName, commitHash))) } From 3d9c2430661505f6aaf777d6edbb32c1c2bc81ce Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 10:16:28 -0700 Subject: [PATCH 25/35] Update migration validation runbook Entire-Checkpoint: 1d716c292e2f --- cmd/migrate-v2-checkpoints/VALIDATION.md | 367 +++++++++++++++++------ cmd/migrate-v2-checkpoints/main_test.go | 1 - 2 files changed, 275 insertions(+), 93 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index e5092be4b..769b63ddc 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -111,7 +111,7 @@ value. `` is the v1 slot. New sessions append (`findSessionIndex` in `committed.go:610`); if v1 already had session 0 and v2 contributes one new session, it lands in v1 slot 1. v1 indices and v2 indices for the **same** -checkpoint can differ; only `session_id` is invariant across the two stores. +checkpoint can differ; `session_id` is the stable cross-store identifier. Chunking note: `full.jsonl` is chunked via `agent.ChunkTranscript`. Chunks are `full.jsonl`, `full.jsonl.001`, `full.jsonl.002`, … @@ -137,8 +137,10 @@ For `--dry-run` and `--apply` (but not `--list`), the tool resolves the checkpoint fetch remote and refreshes `refs/entire/checkpoints/v2/main` plus every `refs/entire/checkpoints/v2/full/*` ref before discovery (see `ensureLatestV2Refs` in `v2_preflight.go`). 
If the remote can't be resolved -and a local v2 /main ref exists, fetch is skipped silently; if neither -condition holds, the tool errors out before doing any work. +and a local v2 /main ref exists, fetch is skipped silently. If the remote +does resolve, the remote must advertise v2 /main; a stale local v2 /main does +not bypass a missing remote v2 /main. If neither a usable remote nor local v2 +/main is available, the tool errors out before doing any work. `--list` produces one line per checkpoint: ```text @@ -174,7 +176,7 @@ the report: warning: N v2 orphans skipped; re-run without --since/--head to include them ``` -Invariants that should always hold on the report: +Checks that should always hold on the report: - `EC ≤ D`. - `EO ≤ EC` (orphan-eligible is a subset of eligible). @@ -208,6 +210,22 @@ The procedure below is the same regardless of repo. Substitute `$REPO` and REPO=/path/to/some-repo # e.g. ~/entire/marvin TOOL=~/entire/cli/.worktrees/review/migrate-v2-checkpoints cd "$REPO" + +sha256_stdin() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum | awk '{print $1}' + else + shasum -a 256 | awk '{print $1}' + fi +} + +sha256_file() { + if command -v sha256sum >/dev/null 2>&1; then + sha256sum "$1" | awk '{print $1}' + else + shasum -a 256 "$1" | awk '{print $1}' + fi +} ``` ### 3.1 Pre-flight: confirm both stores exist @@ -324,31 +342,32 @@ comm -23 /tmp/v2_ids.txt /tmp/v1_ids.txt > /tmp/v2_only_ids.txt wc -l /tmp/v2_only_ids.txt ``` -Every ID in `v2_only_ids.txt` should be either a candidate, or — if v2 has -no session metadata for it / no raw transcript — a contributor to the -`missing v2 checkpoint metadata` / `missing raw transcripts` counters. -Orphan candidates also live in this set: they are exactly v2 /main IDs -with no commit attribution but with intact v2 metadata + transcripts. 
+Every ID in `v2_only_ids.txt` should be either a candidate or accounted for +by missing v2 checkpoint metadata, missing required v2 session metadata, or +missing raw transcript skips. Orphan candidates also live in this set: they +are exactly v2 /main IDs with no commit attribution but with intact v2 +metadata + transcripts. -A quick predicate: the eligible candidate count plus the missing-metadata -and missing-raw counters should equal or exceed the v2-only set. If it's -less, something is being silently dropped. +A quick predicate: the eligible candidate count plus the missing summary, +required-metadata, and missing-raw counters should equal or exceed the +v2-only set. If it's less, something is being silently dropped. ```sh EC=$(grep "checkpoints eligible for migration" /tmp/migrate.plan | awk '{print $NF}') EO=$(grep "v2 orphan checkpoints" /tmp/migrate.plan | awk '{print $NF}') M1=$(grep "missing v2 checkpoint metadata" /tmp/migrate.plan | awk '{print $NF}') +M2=$(grep "missing required v2 session metadata" /tmp/migrate.plan | awk '{print $NF}') M3=$(grep "missing raw transcripts" /tmp/migrate.plan | awk '{print $NF}') echo "v2-only on disk: $(wc -l < /tmp/v2_only_ids.txt)" -echo "EC=$EC EO=$EO M1=$M1 M3=$M3" -echo " EC + M1 + M3 must be >= v2-only count" +echo "EC=$EC EO=$EO M1=$M1 M2=$M2 M3=$M3" +echo " EC + M1 + M2 + M3 must be >= v2-only count" echo " EO <= EC must hold (orphan is a subset of eligible)" ``` -(`>=` rather than `=` because `M1`/`M3` are counted per-checkpoint / -per-session over the entire discovered universe, not only the v2-only -set. `EO` is exactly the subset of `EC` whose discovery came from v2 -/main alone.) +(`>=` rather than `=` because `M1`, `M2`, and `M3` are counted over the +entire discovered universe, not only the v2-only set; `M2` and `M3` are +also per-session counters. `EO` is exactly the subset of `EC` whose +discovery came from v2 /main alone.) 
### 3.4 Step C — confirm commit-list accuracy @@ -379,27 +398,20 @@ merge with multiple trailers; that's expected. ### 3.5 Step D — DRY-RUN INSPECTION of session count -For each candidate, the report claims `sessions=N`. Confirm: +For each candidate, the report claims `sessions=N`. Confirm by counting v2 +sessions that are not already in v1 **and** are eligible by the same filters +the migration applies: required metadata present and raw transcript present on +`/full/current` or an archived `/full/*` ref. ```sh ID=02d9783342a2 SHARD=${ID:0:2}/${ID:2} +EXPECTED_SESSIONS=1 # report's sessions=N for this checkpoint # Sessions advertised by the v2 summary (from /main). -git -C "$REPO" cat-file -p \ - refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ - | jq -r '.sessions | length' - -# Session IDs in v2 (read each session's own metadata.json — that field is -# what the migration tool dedupes against, not summary order). V2_SESSION_COUNT=$(git -C "$REPO" cat-file -p \ refs/entire/checkpoints/v2/main:"$SHARD/metadata.json" \ | jq -r '.sessions | length') -for i in $(seq 0 $((V2_SESSION_COUNT-1))); do - git -C "$REPO" cat-file -p \ - refs/entire/checkpoints/v2/main:"$SHARD/$i/metadata.json" \ - | jq -r '.session_id' -done | sort -u > /tmp/v2_sids.txt # Session IDs already in v1 for this checkpoint. if git -C "$REPO" cat-file -e \ @@ -416,9 +428,45 @@ else : > /tmp/v1_sids.txt fi -# Expected eligible: v2 minus v1, by session ID. -comm -23 /tmp/v2_sids.txt /tmp/v1_sids.txt | wc -l -# This number must equal the report's "sessions=N" for this checkpoint. 
+FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') + +eligible=0 +for i in $(seq 0 $((V2_SESSION_COUNT-1))); do + META=$(git -C "$REPO" cat-file -p \ + refs/entire/checkpoints/v2/main:"$SHARD/$i/metadata.json" 2>/dev/null) \ + || continue + + SID=$(echo "$META" | jq -r '.session_id // ""') + CPID=$(echo "$META" | jq -r '.checkpoint_id // ""') + if [ -z "$SID" ] || [ -z "$CPID" ]; then + continue + fi + if grep -qxF "$SID" /tmp/v1_sids.txt; then + continue + fi + + has_raw=0 + for r in $FULL_REFS; do + if git -C "$REPO" cat-file -e \ + "$r:$SHARD/$i/raw_transcript" 2>/dev/null || + git -C "$REPO" ls-tree --name-only "$r:$SHARD/$i" 2>/dev/null \ + | grep -qE '^raw_transcript\.[0-9]{3}$'; then + has_raw=1 + break + fi + done + if [ "$has_raw" = 1 ]; then + eligible=$((eligible+1)) + fi +done + +echo "eligible sessions from v2: $eligible" +echo "report sessions=N: $EXPECTED_SESSIONS" +[ "$eligible" -eq "$EXPECTED_SESSIONS" ] && echo OK || echo MISMATCH ``` Repeat for a random sample (5–10) across the candidate list. If your @@ -460,14 +508,15 @@ a separate, explicit decision once the post-apply checks in §5 pass. - §3 ran clean: the candidate list looks plausible, counter math adds up, and a spot sample (Steps C and D) confirmed the candidates really are v2-only / partial migrations. -- The repo has a resolvable checkpoint fetch remote, OR a local - `refs/entire/checkpoints/v2/main` ref. `--apply` calls - `ensureLatestV2Refs` first and will refresh v2 /main and every +- The repo has either a checkpoint fetch remote that advertises + `refs/entire/checkpoints/v2/main`, or no resolvable fetch remote but an + already-present local `refs/entire/checkpoints/v2/main` ref. 
`--apply` + calls `ensureLatestV2Refs` first and will refresh v2 /main and every `refs/entire/checkpoints/v2/full/*` ref from the remote (forced fetch of `/full/current`, fast-forward fetch of archives). If the fetch target - can't be resolved and no local v2 /main exists, the tool errors out - before doing any work. A manual pre-fetch is no longer required, but - remains a safe no-op: + resolves but lacks v2 /main, the tool errors out even if a local v2 /main + ref exists; that prevents silently using stale local rollback data. A + manual pre-fetch is no longer required, but remains a safe no-op: ```sh git -C "$REPO" fetch origin \ @@ -524,15 +573,25 @@ git -C "$REPO" log --format='%h %ci %s' \ `checkpoint_remote` mirror it. Push is a separate manual procedure that is explicitly out of scope here, and is only safe **after** every step in §5 passes and the operator is satisfied. -- **Per-checkpoint atomicity, not transactional.** Each candidate is +- **Per-session atomicity, not transactional.** Each migrated session is written as its own commit on v1. If `--apply` errors out partway - through, earlier candidates remain written and later ones are - un-written; the next run will pick up the rest. + through a checkpoint with multiple eligible sessions, earlier sessions + remain written and later sessions reappear on the next run. +- **v1 commit author matches v2.** Each new v1 commit is authored with + the same name, email, and author timestamp as the v2 `/main` commit that + wrote the migrated session's `metadata.json`, so `git log` against v1 + and v2 attributes the same checkpoint session to the same author. §5.6 + treats the `author` header in `entire explain` as a required check; a + mismatch is a regression, not an accepted divergence. - **Roll back** by resetting v1 back to `$PRE_APPLY_TIP`: ```sh # Only if you need to undo — this discards the new commits locally. 
- git -C "$REPO" update-ref refs/heads/entire/checkpoints/v1 "$PRE_APPLY_TIP" + if [ "$PRE_APPLY_TIP" = "none" ]; then + git -C "$REPO" update-ref -d refs/heads/entire/checkpoints/v1 + else + git -C "$REPO" update-ref refs/heads/entire/checkpoints/v1 "$PRE_APPLY_TIP" + fi ``` Safe before any push. Destructive after push. @@ -617,7 +676,8 @@ Expected shape (schema lives at } ``` -Field-by-field check against the v2 summary on `/main` for the same ID: +For checkpoints that were fully v2-only and whose v2 sessions all migrated, +the root summary should match the v2 summary for the stable fields below: ```sh diff <(git -C "$REPO" cat-file -p \ @@ -638,13 +698,15 @@ Acceptable differences: (`full.jsonl`, `content_hash.txt`), not v2's compact format. - If v1 already had sessions, `sessions[]` length on v1 may exceed v2's; the candidate's contributions are appended. -- `combined_attribution`/`token_usage` may differ if the v1 store - aggregates across all sessions present and v1 already had different - sessions. For purely v2-only checkpoints (the typical case, which - includes all orphan candidates) these should match the v2 summary - exactly, since the migration uses `summary.CombinedAttribution` from - v2 verbatim (`migration.go:242`) and per-session token usage is - replayed from v2. +- If only some v2 sessions migrated (because others were already present, + lacked required metadata, or lacked raw transcripts), aggregate fields + such as `checkpoints_count`, `files_touched`, `token_usage`, and + `has_review` may differ. The v1 writer reaggregates those fields from + the sessions actually present in v1, not from every session in v2. +- `combined_attribution` may also differ when v1 already had sessions. For + purely v2-only checkpoints with all v2 sessions migrated, it should match + the v2 summary exactly because the migration uses + `summary.CombinedAttribution` from v2 verbatim (`migration.go:242`). Hard requirements: @@ -716,10 +778,10 @@ Expected: no diff. 
Special cases: doesn't pass `CLIVersion`, so v1 inherits whatever default the writer applies — generally an empty value or the current binary's version. Not a correctness issue. -- v1 writes the new `combined_attribution` and aggregated `token_usage` - onto the **root** `metadata.json` from the migrating session's data. If - there were prior v1 sessions, the root summary on v1 already aggregated - them; only the new session's session-level metadata matters for §4.2. +- Root summary aggregation is covered in §5.1. Session-level comparison here + should ignore root-only fields such as `combined_attribution`; per-session + `token_usage` is copied from v2 and then folded into the root summary by + the v1 writer. Schema sanity per session: @@ -749,21 +811,32 @@ commit trailer exists. ### 5.3 Step G — `prompt.txt` content The migration joins v2 prompts (split form on disk) back into a single -`prompt.txt` via `SplitPromptContent` round-trip. The bytes should match -the v2 content: +`prompt.txt` via `SplitPromptContent` round-trip. If `prompt.txt` exists on +v2, the v1 bytes should match. If it is absent on v2, it should also be +absent on v1. 
```sh -git -C "$REPO" cat-file -p \ - "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" \ - | sha256sum -git -C "$REPO" cat-file -p \ - "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" \ - | sha256sum +if git -C "$REPO" cat-file -e \ + "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" 2>/dev/null; then + V2_PROMPT_HASH=$(git -C "$REPO" cat-file -p \ + "refs/entire/checkpoints/v2/main:$SHARD/$V2_SLOT/prompt.txt" \ + | sha256_stdin) + V1_PROMPT_HASH=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" \ + | sha256_stdin) + echo "v2 prompt: $V2_PROMPT_HASH" + echo "v1 prompt: $V1_PROMPT_HASH" + [ "$V1_PROMPT_HASH" = "$V2_PROMPT_HASH" ] && echo OK || echo MISMATCH +else + git -C "$REPO" cat-file -e \ + "entire/checkpoints/v1:$SHARD/$V1_SLOT/prompt.txt" 2>/dev/null \ + && echo "MISMATCH: v1 prompt exists but v2 prompt is absent" \ + || echo "OK: prompt absent in both stores" +fi ``` -Both digests should match. If they don't, inspect with a `diff -u` between -the two `cat-file -p` outputs to see whether it's an ordering / separator -issue. +If the digests don't match, inspect with a `diff -u` between the two +`cat-file -p` outputs to see whether it's an ordering / separator issue. ### 5.4 Step H — raw transcript & `content_hash.txt` @@ -817,7 +890,7 @@ while IFS= read -r f; do done < /tmp/chunks.txt # Recompute and compare. -COMPUTED="sha256:$(sha256sum "$tmp" | awk '{print $1}')" +COMPUTED="sha256:$(sha256_file "$tmp")" STORED=$(git -C "$REPO" cat-file -p \ "entire/checkpoints/v1:$SHARD/$V1_SLOT/content_hash.txt") echo "stored: $STORED" @@ -876,19 +949,55 @@ Then for each ID in `$MIGRATED_IDS`, run: - §5.1 root metadata diff (`grep -q` for errors). - §5.2 per-session field diff for every session ID that the checkpoint brought in. +- §5.3 prompt presence/content check for every migrated session. - §5.4 hash check on every transcript chunk set. +- §5.6 `entire explain` comparison against the dual-reads-removed binary. 
A single shell loop is fine, and the validation completes in seconds per checkpoint. Surface any non-empty diffs or any `MISMATCH` lines. ### 5.6 Step J — `entire explain` parity after removing v2 dual reads -Validate every migrated checkpoint with a current build from the branch that -removes v2-first dual reads, and compare it to the Homebrew-installed -`entire`. The outputs must be identical. Any mismatch means the migration -did not restore enough v1 data for the new read path, or the two binaries -changed explain behavior independently; flag it and keep the diff for -investigation. +Run a current build from the branch that removes v2-first dual reads and +compare it to the Homebrew-installed `entire` for every migrated checkpoint. +On a real repo the two binaries will **not** produce identical output — +some divergence is structural and expected. The gate here is that every +diff falls into a known, bounded category and the required checks hold: the +file list, displayed session id, author header, and exit status agree on +every checkpoint. + +#### Expected divergences + +1. **Codex sessions: sanitized transcript on v1** (§1.3). + `codex.SanitizePortableTranscript` runs at write time, so the v1 + transcript bytes are not equal to v2's raw transcript bytes for any + session with `agent == "Codex"`. Self-consistency on each side still + holds — verified in §5.4. +2. **v2 compact transcript not migrated** (§7). The v2 store holds two + transcripts per session: the raw form on `/full/*` (migrated to v1) and + the compact form on `/main/transcript.jsonl` (not migrated). BREW + renders explain output from the compact form when available; FIX has to + parse the raw JSONL on v1 and pick fields ad hoc. Two visible + consequences: + - For Claude Code multi-argument tools (`Glob`, `Grep`, …), the tool + summary line picks different arguments. BREW tends to surface + `path`; FIX tends to surface `pattern`. Both arguments are present + in the raw JSONL. 
+ - For the **Intent** block, BREW shows a prompt derived from the + compact transcript; FIX picks a user message from the raw + transcript. The underlying `prompt.txt` blobs are byte-identical + between v1 and v2 — only the renderer's selection differs. + +#### Required checks that must still hold + +- Exit status of both binaries matches per checkpoint. +- `## Files` (the touched-files list) is byte-identical. +- The displayed `session ` header line is identical (both binaries + choose the same session id on both sides). +- The `author` header is identical (the migration preserves the v2 + author identity onto v1 — see §4 Behavior notes). + +Anything that violates one of those is **unaccounted for** — flag it. Build the comparison binary immediately before the sweep: @@ -904,13 +1013,13 @@ git -C "$FIX_WORKTREE" status --short --branch "$BREW_ENTIRE" version ``` -Run both binaries from the migrated repo and compare exit status, stdout, and -stderr for every migrated checkpoint: +Run both binaries from the migrated repo for every migrated checkpoint and +audit each diff against the required checks above: ```sh EXPLAIN_DIR="/tmp/migrate-${REPO_NAME}-explain" mkdir -p "$EXPLAIN_DIR" -: > "$EXPLAIN_DIR/mismatches.txt" +: > "$EXPLAIN_DIR/unaccounted.txt" set +e while IFS= read -r ID; do @@ -938,26 +1047,97 @@ while IFS= read -r ID; do cat "$BREW_RESULT.err" } > "$BREW_RESULT" - if ! 
diff -u "$BREW_RESULT" "$FIX_RESULT" > "$DIFF_FILE"; then - echo "$ID" >> "$EXPLAIN_DIR/mismatches.txt" - echo "MISMATCH $ID (see $DIFF_FILE)" - else + if diff -u "$BREW_RESULT" "$FIX_RESULT" > "$DIFF_FILE" 2>&1; then rm -f "$DIFF_FILE" + continue + fi + + # --- Required checks --------------------------------------------------- + if [ "$FIX_STATUS" != "$BREW_STATUS" ]; then + echo "$ID reason=exit-status brew=$BREW_STATUS fix=$FIX_STATUS" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + BREW_FILES=$(awk '/^## Files/{f=1} /^── Transcript/{f=0} f' \ + "$BREW_RESULT" | sha256_stdin) + FIX_FILES=$(awk '/^## Files/{f=1} /^── Transcript/{f=0} f' \ + "$FIX_RESULT" | sha256_stdin) + if [ "$BREW_FILES" != "$FIX_FILES" ]; then + echo "$ID reason=files-list-diverges" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + BREW_SID=$(awk '/^ session /{print $2; exit}' "$BREW_RESULT") + FIX_SID=$(awk '/^ session /{print $2; exit}' "$FIX_RESULT") + if [ "$BREW_SID" != "$FIX_SID" ]; then + echo "$ID reason=session-id-mismatch brew=$BREW_SID fix=$FIX_SID" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue fi + BREW_AUTHOR=$(grep -m1 '^ author ' "$BREW_RESULT") + FIX_AUTHOR=$(grep -m1 '^ author ' "$FIX_RESULT") + if [ "$BREW_AUTHOR" != "$FIX_AUTHOR" ]; then + echo "$ID reason=author-mismatch" \ + >> "$EXPLAIN_DIR/unaccounted.txt" + continue + fi + # Remaining diffs fall in the expected buckets above. Leave $DIFF_FILE + # on disk for spot-checking. done < "$MIGRATED_IDS" -if [ -s "$EXPLAIN_DIR/mismatches.txt" ]; then - echo "explain mismatches:" - cat "$EXPLAIN_DIR/mismatches.txt" +if [ -s "$EXPLAIN_DIR/unaccounted.txt" ]; then + echo "unaccounted-for explain divergences:" + cat "$EXPLAIN_DIR/unaccounted.txt" exit 1 fi -echo "all migrated checkpoints matched entire explain output" +echo "all explain divergences fall in accepted buckets" +``` + +#### Optional: divergence distribution + +Useful for spotting a sudden shift in divergence shape between releases. 
+Bucket each checkpoint as `identical`, `body-with-codex` (sanitization + +compact-rendering), or `body-without-codex` (compact-rendering only): + +```sh +identical=0; codex_body=0; non_codex_body=0 +while IFS= read -r ID; do + DIFF="$EXPLAIN_DIR/$ID.diff" + if [ ! -f "$DIFF" ]; then + identical=$((identical+1)); continue + fi + SHARD=${ID:0:2}/${ID:2} + has_codex=0 + V1_LEN=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/metadata.json" \ + | jq -r '.sessions | length') + for i in $(seq 0 $((V1_LEN-1))); do + AGENT=$(git -C "$REPO" cat-file -p \ + "entire/checkpoints/v1:$SHARD/$i/metadata.json" \ + | jq -r '.agent // ""') + if [ "$AGENT" = "Codex" ] || [ "$AGENT" = "codex" ]; then + has_codex=1; break + fi + done + if [ "$has_codex" = 1 ]; then + codex_body=$((codex_body+1)) + else + non_codex_body=$((non_codex_body+1)) + fi +done < "$MIGRATED_IDS" + +printf 'identical: %4d\n' "$identical" +printf 'body diff, has codex: %4d (expected: §1.3 sanitization + §7 compact-not-migrated)\n' \ + "$codex_body" +printf 'body diff, no codex: %4d (expected: §7 compact-not-migrated)\n' \ + "$non_codex_body" ``` -Do not ignore mismatches, even if the rendered output looks close. Record the -checkpoint ID and keep the corresponding `.diff`, `.fix`, and `.brew` files -for follow-up. +`$EXPLAIN_DIR/*.diff`, `*.fix`, and `*.brew` are kept on disk for +inspection. If a body diff in the no-codex bucket touches anything other +than transcript-tool-argument rendering or Intent text, file a bug — that +would be a real read-path regression. ### 5.7 After validation passes @@ -1030,9 +1210,11 @@ from §4's "Behavior notes" — cheap and local, because you did not push. `fetchV2FullRefs` (line 101). 
- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` — `migrateDiscoveredCheckpoints` (line 53), `migrateCheckpoint` - (line 98), `writeOptionsFromV2Content` (line 214), - `writeMigrationReport` (line 249), `candidateCommitLabel` (line 288, + (line 96), `writeOptionsFromV2Content` (line 216), + `writeMigrationReport` (line 251), `candidateCommitLabel` (line 290, emits `(orphan)`). +- v2 session author lookup: `cmd/migrate-v2-checkpoints/v2_author.go` — + `findV2SessionAuthor` (line 19), `v2SessionMetadataPath` (line 59). - v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` (line 58), `writeStandardCheckpointEntries` (line 310), `writeSessionToSubdirectory` (line 404), `writeTranscript` (line 720), @@ -1061,9 +1243,10 @@ from §4's "Behavior notes" — cheap and local, because you did not push. - `--dry-run` / `--apply` auto-fetch v2 refs from the repo's checkpoint remote (`ensureLatestV2Refs`). If the remote resolves, you get an up-to-date local copy of `refs/entire/checkpoints/v2/main` and every - `refs/entire/checkpoints/v2/full/*`; if it doesn't, the tool only - proceeds when a local v2 /main ref is already present. `--list` does - **not** auto-fetch — if you want a candidate universe that reflects + `refs/entire/checkpoints/v2/full/*`, and the tool errors if that remote + does not advertise v2 /main. If the remote cannot be resolved at all, the + tool only proceeds when a local v2 /main ref is already present. `--list` + does **not** auto-fetch — if you want a candidate universe that reflects the remote, refresh manually first: ```sh git -C "$REPO" fetch origin \ diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 8ee39616f..bc07490a0 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -1,4 +1,3 @@ -//nolint:goconst // Repeated CLI flag literals keep argument-list tests readable. 
package main import ( From f1e4ad89b71d2e6cea57cd5d5b95022a1faf9d21 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 10:49:59 -0700 Subject: [PATCH 26/35] Fetch v1 checkpoints before migration Entire-Checkpoint: 3141746402c0 --- cmd/migrate-v2-checkpoints/main.go | 7 +- cmd/migrate-v2-checkpoints/main_test.go | 100 ++++++++++++++++++- cmd/migrate-v2-checkpoints/v1_preflight.go | 108 +++++++++++++++++++++ 3 files changed, 212 insertions(+), 3 deletions(-) create mode 100644 cmd/migrate-v2-checkpoints/v1_preflight.go diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index b08a482ca..a9e6889d3 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -55,7 +55,10 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { } ctx = settings.WithWorktreeRoot(ctx, repoRoot) - if shouldEnsureV2Refs(opts) { + if shouldEnsureCheckpointRefs(opts) { + if err := ensureLatestV1Ref(ctx, repoRoot, repo); err != nil { + return err + } if err := ensureLatestV2Refs(ctx, repoRoot, repo); err != nil { return err } @@ -93,7 +96,7 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { } } -func shouldEnsureV2Refs(opts options) bool { +func shouldEnsureCheckpointRefs(opts options) bool { return opts.mode == modePlan || opts.mode == modeDryRun || opts.mode == modeApply } diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index bc07490a0..fbb4d4277 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -1,3 +1,4 @@ +//nolint:goconst // Repeated CLI flag literals keep argument-list tests readable. 
package main import ( @@ -135,7 +136,7 @@ func TestDiscoverCheckpointHistory_ExcludesInternalRefs(t *testing.T) { t.Parallel() fixture := setupMigrationHistoryRepo(t) - runMigrationGit(t, fixture.dir, "checkout", "-b", paths.MetadataBranchName, fixture.mainHash.String()) + runMigrationGit(t, fixture.dir, "checkout", paths.MetadataBranchName) commitMigrationTestFile(t, fixture.dir, "internal.txt", "internal\n", "internal checkpoint\n\nEntire-Checkpoint: "+unrelatedCheckpointID) @@ -288,6 +289,73 @@ func TestRunDryRunFailsWhenRemoteV2MainIsUnavailable(t *testing.T) { require.ErrorContains(t, err, paths.V2MainRefName+" not found on remote") } +func TestRunDryRunFailsWhenRemoteV1RefIsUnavailable(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepoWithoutV1(t) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: id.MustCheckpointID(mainCheckpointID), + SessionID: "session-remote-v2", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"remote v2\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + + var stdout bytes.Buffer + err := run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--dry-run", + }, &stdout) + + require.ErrorContains(t, err, plumbing.NewBranchReferenceName(paths.MetadataBranchName).String()+" not found on remote") +} + +func TestRunApplySeedsLocalV1FromRemoteBeforeMigrating(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-seeded-v1", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"seeded v1\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + require.NoError(t, err) + + localV1RefName := 
plumbing.NewBranchReferenceName(paths.MetadataBranchName) + _, err = cloneRepo.Reference(localV1RefName, true) + require.ErrorIs(t, err, plumbing.ErrReferenceNotFound) + remoteV1Ref, err := cloneRepo.Reference(plumbing.NewRemoteReferenceName("origin", paths.MetadataBranchName), true) + require.NoError(t, err) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated checkpoints: 1") + require.Contains(t, stdout.String(), "migrated sessions: 1") + + cloneRepo, err = git.PlainOpen(cloneDir) + require.NoError(t, err) + localV1Ref, err := cloneRepo.Reference(localV1RefName, true) + require.NoError(t, err) + commit, err := cloneRepo.CommitObject(localV1Ref.Hash()) + require.NoError(t, err) + require.Equal(t, []plumbing.Hash{remoteV1Ref.Hash()}, commit.ParentHashes) + + summary, err := checkpoint.NewGitStore(cloneRepo).ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.NotNil(t, summary) + require.Len(t, summary.Sessions, 1) +} + func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { t.Parallel() @@ -662,6 +730,18 @@ type migrationHistoryFixture struct { func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { t.Helper() + return setupMigrationHistoryRepoWithV1(t, true) +} + +func setupMigrationHistoryRepoWithoutV1(t *testing.T) migrationHistoryFixture { + t.Helper() + + return setupMigrationHistoryRepoWithV1(t, false) +} + +func setupMigrationHistoryRepoWithV1(t *testing.T, seedV1 bool) migrationHistoryFixture { + t.Helper() + dir := t.TempDir() testutil.InitRepo(t, dir) @@ -676,6 +756,9 @@ func setupMigrationHistoryRepo(t *testing.T) migrationHistoryFixture { repo, err := git.PlainOpen(dir) require.NoError(t, err) + if seedV1 { + writeTestV1Baseline(t, repo) + } return migrationHistoryFixture{ dir: dir, @@ -701,6 
+784,7 @@ func setupMigrationOrphanRepoWithOptions(t *testing.T, checkpointID string, opts baseHash := commitMigrationTestFile(t, dir, "initial.txt", "initial\n", "initial commit") repo, err := git.PlainOpen(dir) require.NoError(t, err) + writeTestV1Baseline(t, repo) opts.CheckpointID = id.MustCheckpointID(checkpointID) if opts.SessionID == "" { @@ -730,6 +814,10 @@ func cloneMigrationRepoWithOrigin(t *testing.T, fixture migrationHistoryFixture) fixture.mainHash.String() + ":refs/heads/main", fixture.featureHash.String() + ":refs/heads/" + testFeatureBranchName, } + v1RefName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + if refExists(t, fixture.repo, v1RefName) { + refspecs = append(refspecs, v1RefName.String()+":"+v1RefName.String()) + } if refExists(t, fixture.repo, plumbing.ReferenceName(paths.V2MainRefName)) { refspecs = append(refspecs, paths.V2MainRefName+":"+paths.V2MainRefName) } @@ -743,6 +831,16 @@ func cloneMigrationRepoWithOrigin(t *testing.T, fixture migrationHistoryFixture) return cloneDir } +func writeTestV1Baseline(t *testing.T, repo *git.Repository) { + t.Helper() + + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + if refExists(t, repo, refName) { + return + } + writeTestV2RefEntries(t, repo, refName, plumbing.ZeroHash, map[string]object.TreeEntry{}, "test v1 baseline") +} + func refExists(t *testing.T, repo *git.Repository, refName plumbing.ReferenceName) bool { t.Helper() diff --git a/cmd/migrate-v2-checkpoints/v1_preflight.go b/cmd/migrate-v2-checkpoints/v1_preflight.go new file mode 100644 index 000000000..fa138b4f8 --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v1_preflight.go @@ -0,0 +1,108 @@ +package main + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint/remote" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/strategy" + + "github.com/go-git/go-git/v6" + 
"github.com/go-git/go-git/v6/plumbing" +) + +const ( + v1RefFetchTimeout = 2 * time.Minute + v1FetchTmpRef = strategy.FetchTmpRefPrefix + "migrate-v1" +) + +func ensureLatestV1Ref(ctx context.Context, repoRoot string, repo *git.Repository) error { + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + fetchTarget, err := remote.FetchURL(ctx, remote.FetchURLOptions{WorktreeRoot: repoRoot}) + if err != nil { + if localV1RefExists(repo) { + return nil + } + return fmt.Errorf("resolve v1 checkpoint fetch target: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, v1RefFetchTimeout) + defer cancel() + + remoteHasV1, err := remoteRefExists(ctx, repoRoot, fetchTarget, refName.String()) + if err != nil { + return err + } + if !remoteHasV1 { + return fmt.Errorf("%s not found on remote %s", refName, remote.RedactURL(fetchTarget)) + } + + if err := fetchV1Ref(ctx, repoRoot, repo, fetchTarget); err != nil { + return err + } + return nil +} + +func localV1RefExists(repo *git.Repository) bool { + _, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + return err == nil +} + +func remoteRefExists(ctx context.Context, repoRoot, fetchTarget, refName string) (bool, error) { + output, err := remote.LsRemoteInDir(ctx, repoRoot, fetchTarget, refName) + if err != nil { + return false, fmt.Errorf("list remote %s from %s: %w", refName, remote.RedactURL(fetchTarget), err) + } + + for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") { + fields := strings.Fields(line) + if len(fields) >= 2 && fields[1] == refName { + return true, nil + } + } + return false, nil +} + +func fetchV1Ref(ctx context.Context, repoRoot string, repo *git.Repository, fetchTarget string) error { + refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + tmpRefName := plumbing.ReferenceName(v1FetchTmpRef) + refSpec := fmt.Sprintf("+%s:%s", refName, tmpRefName) + output, err := remote.Fetch(ctx, remote.FetchOptions{ + 
Remote: fetchTarget, + RefSpecs: []string{refSpec}, + NoTags: true, + NoFilter: true, + Dir: repoRoot, + }) + if err != nil { + return fetchV1RefError("fetch v1 checkpoint ref", fetchTarget, output, err) + } + + defer func() { _ = repo.Storer.RemoveReference(tmpRefName) }() //nolint:errcheck // cleanup is best-effort + + tmpRef, err := repo.Reference(tmpRefName, true) + if err != nil { + return fmt.Errorf("v1 checkpoint ref not found after fetch (tmp ref %s missing): %w", tmpRefName, err) + } + if err := strategy.SafelyAdvanceLocalRef(ctx, repo, refName, tmpRef.Hash()); err != nil { + return fmt.Errorf("advance local %s: %w", refName, err) + } + return nil +} + +func fetchV1RefError(action, fetchTarget string, output []byte, err error) error { + if errors.Is(err, context.DeadlineExceeded) { + return fmt.Errorf("%s timed out after %s", action, v1RefFetchTimeout) + } + + redactedTarget := remote.RedactURL(fetchTarget) + msg := strings.TrimSpace(strings.ReplaceAll(string(output), fetchTarget, redactedTarget)) + if msg != "" { + return fmt.Errorf("%s from %s failed: %s: %w", action, redactedTarget, msg, err) + } + return fmt.Errorf("%s from %s failed: %w", action, redactedTarget, err) +} From 620378f0e9b7ec9cd12a5a27ebb44a2b98dfd7e3 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 11:02:45 -0700 Subject: [PATCH 27/35] Update migration validation runbook Entire-Checkpoint: 5b8d4688d945 --- cmd/migrate-v2-checkpoints/VALIDATION.md | 200 +++++++++++++++++------ 1 file changed, 147 insertions(+), 53 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 769b63ddc..cb8ceb730 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -133,14 +133,19 @@ $ migrate-v2-checkpoints [--repo PATH] [--since SHA | SHA] [--head SHA] \ Default mode is `plan` (same output as `--dry-run`). 
-For `--dry-run` and `--apply` (but not `--list`), the tool resolves the -checkpoint fetch remote and refreshes `refs/entire/checkpoints/v2/main` plus -every `refs/entire/checkpoints/v2/full/*` ref before discovery (see -`ensureLatestV2Refs` in `v2_preflight.go`). If the remote can't be resolved -and a local v2 /main ref exists, fetch is skipped silently. If the remote -does resolve, the remote must advertise v2 /main; a stale local v2 /main does -not bypass a missing remote v2 /main. If neither a usable remote nor local v2 -/main is available, the tool errors out before doing any work. +For `plan`, `--dry-run`, and `--apply` (but not `--list`), the tool resolves +the checkpoint fetch remote and refreshes the local v1 branch plus v2 refs +before discovery: + +- `refs/heads/entire/checkpoints/v1` via `ensureLatestV1Ref` +- `refs/entire/checkpoints/v2/main` plus every + `refs/entire/checkpoints/v2/full/*` ref via `ensureLatestV2Refs` + +These modes intentionally write local refs even when no migration data is +written. If the remote resolves, it must advertise both v1 and v2 /main; stale +local refs do not bypass a missing remote v1 or v2 /main. If no fetch target +can be resolved, the tool only proceeds when the required local refs already +exist. Otherwise it errors out before doing any analysis. `--list` produces one line per checkpoint: ```text @@ -151,7 +156,7 @@ The first form is for commit-attributed IDs; the second is for orphans (IDs present on v2 /main with no commit trailer in history). This is the **universe** discovered — NOT the eligible set. 
-`--dry-run` / `--apply` produces: +`plan`, `--dry-run`, and `--apply` produce: ```text Migration plan: (or "Migration result:" on --apply) discovered checkpoints: D @@ -160,7 +165,7 @@ Migration plan: (or "Migration result:" on --apply) missing required v2 session metadata: M2 missing raw transcripts: M3 checkpoints eligible for migration: EC - v2 orphan checkpoints eligible for migration: EO + v2 orphan checkpoints eligible for migration: V2O sessions eligible for migration: ES migrated checkpoints: ... (--apply only) migrated sessions: ... (--apply only) @@ -179,16 +184,23 @@ warning: N v2 orphans skipped; re-run without --since/--head to include them Checks that should always hold on the report: - `EC ≤ D`. -- `EO ≤ EC` (orphan-eligible is a subset of eligible). +- `V2O ≤ EC` (orphan-eligible is a subset of eligible). - `ES ≥ EC` (each eligible checkpoint contributes ≥ 1 eligible session). - `ES = Σ candidate.SessionCount`. The candidate list is exhaustive. - The candidate list is sorted by `<id>` ascending; commit SHAs within a candidate are sorted by commit date descending (most recent first), ties broken by hash. Orphan candidates print `commits=(orphan)` instead of a SHA list. +- A `<id> sessions=N commits=(orphan)` line corresponds to one of the `V2O` + checkpoints; its trailer never appears on any history tip included in the + discovery walk. - On `--apply`: `migrated checkpoints = EC` and `migrated sessions = ES` if no write errors. Anything less means a partial write failure — re-run the tool and the remainder should re-appear as eligible. +- Do not try to balance `D` with + `eligible non-orphan + V2O + already-present + M1 + M3`. `D` is a + checkpoint discovery count and includes both trailer-discovered and + v2-orphan IDs; `A`, `M2`, and `M3` are session counters. - `D = EC + (checkpoints with v2 summary but eligibleSessions==0) + M1`. 
The middle term covers both "all v2 sessions already in v1" and "every v2 session was unreadable (missing metadata or transcript)" — those land @@ -198,7 +210,7 @@ Checks that should always hold on the report: `A + M2 + M3 = (Σ over all v2 sessions in checkpoints whose v2 summary exists) − ES`. Useful for spot-checking after `--apply`: if `A` is large and `EC` is small, most v2 checkpoints are already mirrored. If - `EO` is close to `EC` and `A` is small, this repo skipped v2 entirely + `V2O` is close to `EC` and `A` is small, this repo skipped v2 entirely and the migration is largely "import from v2 /main." ## 3. Validation procedure @@ -231,24 +243,34 @@ sha256_file() { ### 3.1 Pre-flight: confirm both stores exist ```sh -git -C "$REPO" show-ref entire/checkpoints/v1 +git -C "$REPO" show-ref refs/heads/entire/checkpoints/v1 \ + || git -C "$REPO" show-ref refs/remotes/origin/entire/checkpoints/v1 git -C "$REPO" show-ref refs/entire/checkpoints/v2/main git -C "$REPO" show-ref refs/entire/checkpoints/v2/full/current git -C "$REPO" for-each-ref 'refs/entire/checkpoints/v2/full/*' \ --format='%(refname)' ``` -If `entire/checkpoints/v1` is missing the migration can still apply (it -will be created), but if the v2 refs are missing there is nothing to -migrate. - -`--dry-run` and `--apply` auto-fetch v2 refs from the repo's checkpoint -remote before discovery (`ensureLatestV2Refs`), so the local state -*after* those modes runs will reflect the remote. Pre-flight is still -useful to (a) catch a missing local v1 branch and (b) sanity-check that -the local v2 /main looks like it's frozen rather than actively -advancing. `--list` does **not** auto-fetch; if you intend to inspect -the universe via `--list`, pre-fetch manually (see §9). +If `entire/checkpoints/v1` is missing locally but present on the remote, the +migration tool will fetch it and create the local branch before planning or +applying. 
If both local and remote v1 are missing, the tool aborts; it will +not synthesize a fresh orphan v1 baseline for this rollback migration. + +`plan`, `--dry-run`, and `--apply` auto-fetch checkpoint refs from the repo's +checkpoint remote before discovery (`ensureLatestV1Ref` and +`ensureLatestV2Refs`), so the local state *after* those modes run will +reflect the remote. This is intentional even for `--dry-run`: the tool refuses +to analyze a stale checkpoint snapshot. If the remote lacks v1 or v2 /main, or +rejects a fetch of `refs/entire/checkpoints/v2/full/*`, the tool exits +non-zero before analysis. Ensure the repo can reach a checkpoint remote with +v1 and v2 refs before running. To work against a strictly local copy, +temporarily remove or disable the checkpoint fetch remote and keep local v1 +and v2 refs present. + +Pre-flight is still useful to sanity-check that the local v2 /main looks like +it's frozen rather than actively advancing. `--list` does **not** auto-fetch; +if you intend to inspect the universe via `--list`, pre-fetch manually (see +§9). Also sanity-check the head of v2 isn't surprising — a recent commit means v2 was being dual-written; a long-stale v2 head matches the @@ -271,9 +293,9 @@ grep -E "^ (discovered|already|missing|checkpoints eligible|v2 orphan|sessions /tmp/migrate.plan ``` -- `EC ≤ D` and `EO ≤ EC` and `ES ≥ EC`. +- `EC ≤ D` and `V2O ≤ EC` and `ES ≥ EC`. - For each candidate line, parse `sessions=N` and sum — must equal `ES`. -- The number of candidate lines with `commits=(orphan)` must equal `EO`. +- The number of candidate lines with `commits=(orphan)` must equal `V2O`. 
```sh awk '/^ [0-9a-f]{12} sessions=/ {sub(/sessions=/,"",$2); s+=$2} END {print s}' \ @@ -298,6 +320,12 @@ For every candidate ``: ```sh ID=02d9783342a2 # example SHARD=${ID:0:2}/${ID:2} +if git -C "$REPO" rev-parse --verify --quiet \ + refs/heads/entire/checkpoints/v1 >/dev/null; then + V1_REF=refs/heads/entire/checkpoints/v1 +else + V1_REF=refs/remotes/origin/entire/checkpoints/v1 +fi # Does v2 /main carry this checkpoint? git -C "$REPO" cat-file -p \ @@ -307,11 +335,18 @@ git -C "$REPO" cat-file -p \ # Does v1 already carry it? (Either the path doesn't exist, or the session # IDs differ.) git -C "$REPO" cat-file -p \ - entire/checkpoints/v1:"$SHARD/metadata.json" 2>/dev/null \ + "$V1_REF:$SHARD/metadata.json" 2>/dev/null \ | jq '{checkpoint_id, sessions: [.sessions[].metadata]}' \ || echo "(absent in v1)" ``` +Use the **effective** v1 baseline the binary reads, not only the local branch. +The v1 store reads `refs/heads/entire/checkpoints/v1` first and falls back to +`refs/remotes/origin/entire/checkpoints/v1` if the local branch is missing. +This distinction matters before the tool has run in a fresh clone; after +`plan`/`--dry-run` or `--apply`, the v1 preflight should have created the +local branch from the remote baseline. + The candidate must satisfy at least one of: 1. `/metadata.json` doesn't exist on `entire/checkpoints/v1` → @@ -332,8 +367,14 @@ git -C "$REPO" ls-tree -r refs/entire/checkpoints/v2/main \ | sort -u > /tmp/v2_ids.txt wc -l /tmp/v2_ids.txt -# IDs already in v1 (any session present). -git -C "$REPO" ls-tree -r entire/checkpoints/v1 2>/dev/null \ +# IDs already in effective v1 (any session present). 
+if git -C "$REPO" rev-parse --verify --quiet \ + refs/heads/entire/checkpoints/v1 >/dev/null; then + V1_REF=refs/heads/entire/checkpoints/v1 +else + V1_REF=refs/remotes/origin/entire/checkpoints/v1 +fi +git -C "$REPO" ls-tree -r "$V1_REF" 2>/dev/null \ | awk '$4 ~ /^[0-9a-f]{2}\/[0-9a-f]{10}\/metadata\.json$/ { \ split($4, p, "/"); print p[1] p[2] \ }' \ @@ -354,19 +395,19 @@ v2-only set. If it's less, something is being silently dropped. ```sh EC=$(grep "checkpoints eligible for migration" /tmp/migrate.plan | awk '{print $NF}') -EO=$(grep "v2 orphan checkpoints" /tmp/migrate.plan | awk '{print $NF}') +V2O=$(grep "v2 orphan checkpoints" /tmp/migrate.plan | awk '{print $NF}') M1=$(grep "missing v2 checkpoint metadata" /tmp/migrate.plan | awk '{print $NF}') M2=$(grep "missing required v2 session metadata" /tmp/migrate.plan | awk '{print $NF}') M3=$(grep "missing raw transcripts" /tmp/migrate.plan | awk '{print $NF}') echo "v2-only on disk: $(wc -l < /tmp/v2_only_ids.txt)" -echo "EC=$EC EO=$EO M1=$M1 M2=$M2 M3=$M3" +echo "EC=$EC V2O=$V2O M1=$M1 M2=$M2 M3=$M3" echo " EC + M1 + M2 + M3 must be >= v2-only count" -echo " EO <= EC must hold (orphan is a subset of eligible)" +echo " V2O <= EC must hold (orphan is a subset of eligible)" ``` (`>=` rather than `=` because `M1`, `M2`, and `M3` are counted over the entire discovered universe, not only the v2-only set; `M2` and `M3` are -also per-session counters. `EO` is exactly the subset of `EC` whose +also per-session counters. `V2O` is exactly the subset of `EC` whose discovery came from v2 /main alone.) ### 3.4 Step C — confirm commit-list accuracy @@ -508,17 +549,21 @@ a separate, explicit decision once the post-apply checks in §5 pass. - §3 ran clean: the candidate list looks plausible, counter math adds up, and a spot sample (Steps C and D) confirmed the candidates really are v2-only / partial migrations. 
-- The repo has either a checkpoint fetch remote that advertises - `refs/entire/checkpoints/v2/main`, or no resolvable fetch remote but an - already-present local `refs/entire/checkpoints/v2/main` ref. `--apply` - calls `ensureLatestV2Refs` first and will refresh v2 /main and every +- The repo has either a checkpoint fetch remote that advertises both + `refs/heads/entire/checkpoints/v1` and `refs/entire/checkpoints/v2/main`, + or no resolvable fetch remote but already-present local v1 and v2 /main + refs. `--apply` calls `ensureLatestV1Ref` and `ensureLatestV2Refs` first; + it refreshes the local v1 branch, v2 /main, and every `refs/entire/checkpoints/v2/full/*` ref from the remote (forced fetch of - `/full/current`, fast-forward fetch of archives). If the fetch target - resolves but lacks v2 /main, the tool errors out even if a local v2 /main - ref exists; that prevents silently using stale local rollback data. A + v2 `/full/current`, fast-forward fetch of archives). If the fetch target + resolves but lacks v1 or v2 /main, the tool errors out even if a stale + local ref exists; that prevents silently using stale rollback data. 
A manual pre-fetch is no longer required, but remains a safe no-op: ```sh + git -C "$REPO" fetch origin \ + 'refs/heads/entire/checkpoints/v1:refs/heads/entire/checkpoints/v1' \ + --no-tags git -C "$REPO" fetch origin \ 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' \ --no-tags @@ -726,6 +771,7 @@ WANT_SID=… # session_id from the v2 side V1_SUM=$(git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/metadata.json") V1_LEN=$(echo "$V1_SUM" | jq '.sessions | length') +V1_SLOT= for n in $(seq 0 $((V1_LEN-1))); do SID=$(git -C "$REPO" cat-file -p \ "entire/checkpoints/v1:$SHARD/$n/metadata.json" \ @@ -734,11 +780,49 @@ for n in $(seq 0 $((V1_LEN-1))); do V1_SLOT=$n; break fi done -echo "session $WANT_SID lives in v1 slot $V1_SLOT" +if [ -n "$V1_SLOT" ]; then + echo "session $WANT_SID lives in v1 slot $V1_SLOT" +else + echo "session $WANT_SID is absent from v1" +fi +``` + +If a v2 `session_id` is present in v2 /main but absent from v1 after apply, +check whether it was skipped because v2 no longer has a raw transcript. Those +sessions are counted in `missing raw transcripts` and are intentionally not +written to v1. 
+ +```sh +V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) +if [ -z "$V1_SLOT" ]; then + V2_FULL_REFS=$(git -C "$REPO" for-each-ref \ + --format='%(refname)' 'refs/entire/checkpoints/v2/full/*' \ + | awk '/full\/current$/ {print "1 " $0; next} {print "0 " $0}' \ + | sort -k1,1nr -k2,2r \ + | awk '{print $2}') + + RAW_FOUND= + for r in $V2_FULL_REFS; do + if git -C "$REPO" cat-file -e \ + "$r:$SHARD/$V2_SLOT/raw_transcript" 2>/dev/null || + git -C "$REPO" ls-tree --name-only "$r:$SHARD/$V2_SLOT" 2>/dev/null \ + | grep -qE '^raw_transcript\.[0-9]{3}$'; then + RAW_FOUND=1 + break + fi + done + + if [ -z "$RAW_FOUND" ]; then + echo "session $WANT_SID absent in v1: M3 skip, expected" + else + echo "MISMATCH: session $WANT_SID has raw v2 transcript but is absent in v1" + fi +fi ``` -Then diff the per-session metadata, comparing **fields that are expected to -survive migration** (`migration.go:216-248` lists them explicitly): +When `V1_SLOT` is non-empty, diff the per-session metadata, comparing +**fields that are expected to survive migration** (`migration.go:216-248` +lists them explicitly): ```sh V2_SLOT=… # slot the session occupied on v2 (its index in v2 summary) @@ -1167,14 +1251,18 @@ from §4's "Behavior notes" — cheap and local, because you did not push. | `missing v2 checkpoint metadata: N (large)` | v2 `/main` is missing or its tree lacks summaries for many discovered IDs. | Confirm `refs/entire/checkpoints/v2/main` exists, was fetched, and is reasonably recent. | | `missing required v2 session metadata: > 0` | v2 session `metadata.json` lacks `checkpoint_id` or `session_id`. Could indicate corruption or a partial v2 write. | Inspect the affected sessions manually; they will be skipped, not failed. | | `missing raw transcripts: > 0` | v2 `/main` has a session but `/full/current` and archived `/full/*` don't carry its `raw_transcript*` data. | Confirm archived `/full/*` refs are present locally (or accessible via remote fetch). 
| -| Candidate `commits=(orphan)` | The ID is on v2 /main with no commit-trailer attribution in history. Expected and benign; counted by `EO`. | None — verify against `git log --grep` in §3.4 to confirm there's no missed trailer. | +| Candidate `commits=(orphan)` | The ID is on v2 /main with no commit-trailer attribution in history. Expected and benign; counted by `V2O`. | None — verify against `git log --grep` in §3.4 to confirm there's no missed trailer. | | `warning: N v2 orphans skipped` on a `--since`/`--head` run | Commit-scoped run found N v2 /main IDs that an unscoped walk would have surfaced as orphan candidates. | Re-run without `--since`/`--head` to include them, or accept the scope deliberately. | -| `v2 orphan checkpoints eligible for migration > checkpoints eligible for migration` | Should be impossible (`EO ⊆ EC` by construction). | File a bug. | +| `v2 orphan checkpoints eligible for migration > checkpoints eligible for migration` | Should be impossible (`V2O ⊆ EC` by construction). | File a bug. | | `sessions=N` for a candidate doesn't match the §3.5 expected | Either v1 already has the session (so report should have lower N), or session IDs are non-unique within v2. | Inspect; non-unique session IDs are a v2 corruption. | | Post-apply, `content_hash.txt` ≠ recomputed SHA-256 | Codex agent + ours-vs-original sanitization difference, OR a bug. Confirm `agent` field on the session. | If non-Codex, file a bug with chunk listing + bytes. | | Post-apply, `content_hash.txt` matches but v2's `raw_transcript_hash.txt` doesn't | Codex sanitization (expected) OR transcript was rewritten in transit. Confirm agent first. | If non-Codex, file a bug. | | Re-running `--dry-run` after `--apply` still lists the same candidates | Apply failed silently or didn't get pushed before re-fetch. Look at the `migrated sessions` count. | Re-run with verbose logging; check that v1 branch actually advanced. 
| +The report does not enumerate the exact checkpoint/session IDs behind `M1`, +`M2`, or `M3`. Manual inspection requires re-walking v2 /main and v2 /full +refs as shown in §3.3 and §3.5. + ## 7. Quick reference: file & ref constants | Concept | Constant | Value | Source | @@ -1208,6 +1296,9 @@ from §4's "Behavior notes" — cheap and local, because you did not push. - v2 ref auto-fetch: `cmd/migrate-v2-checkpoints/v2_preflight.go` — `ensureLatestV2Refs` (line 24), `fetchV2MainRef` (line 75), `fetchV2FullRefs` (line 101). +- v1 ref auto-fetch: `cmd/migrate-v2-checkpoints/v1_preflight.go` — + `ensureLatestV1Ref` (line 23), `remoteRefExists` (line 55), + `fetchV1Ref` (line 70). - Migration loop: `cmd/migrate-v2-checkpoints/migration.go` — `migrateDiscoveredCheckpoints` (line 53), `migrateCheckpoint` (line 96), `writeOptionsFromV2Content` (line 216), @@ -1240,15 +1331,18 @@ from §4's "Behavior notes" — cheap and local, because you did not push. remote refs the candidate list may include IDs whose underlying commits are only reachable via those remotes. That's still correct — those commits really did reference the IDs. -- `--dry-run` / `--apply` auto-fetch v2 refs from the repo's checkpoint - remote (`ensureLatestV2Refs`). If the remote resolves, you get an - up-to-date local copy of `refs/entire/checkpoints/v2/main` and every - `refs/entire/checkpoints/v2/full/*`, and the tool errors if that remote - does not advertise v2 /main. If the remote cannot be resolved at all, the - tool only proceeds when a local v2 /main ref is already present. `--list` - does **not** auto-fetch — if you want a candidate universe that reflects - the remote, refresh manually first: +- `plan`, `--dry-run`, and `--apply` auto-fetch checkpoint refs from the repo's + checkpoint remote (`ensureLatestV1Ref`, `ensureLatestV2Refs`). 
If the + remote resolves, you get an up-to-date local copy of + `refs/heads/entire/checkpoints/v1`, `refs/entire/checkpoints/v2/main`, and + every `refs/entire/checkpoints/v2/full/*`; the tool errors if that remote + does not advertise v1 or v2 /main. If the remote cannot be resolved at all, + the tool only proceeds when local v1 and v2 /main refs are already present. + `--list` does **not** auto-fetch — if you want a candidate universe that + reflects the remote, refresh manually first: ```sh + git -C "$REPO" fetch origin \ + 'refs/heads/entire/checkpoints/v1:refs/heads/entire/checkpoints/v1' git -C "$REPO" fetch origin \ 'refs/entire/checkpoints/v2/*:refs/entire/checkpoints/v2/*' ``` From 125b214dd5e1910c427608a858ebff0b1a5c50ed Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 15:46:36 -0700 Subject: [PATCH 28/35] Disable signing for migrated checkpoint commits Entire-Checkpoint: b13ed8c7932b --- cmd/entire/cli/checkpoint/committed.go | 17 +++++++++++++++++ .../cli/checkpoint/committed_signing_test.go | 16 ++++++++++++++++ cmd/migrate-v2-checkpoints/main_test.go | 2 ++ cmd/migrate-v2-checkpoints/migration.go | 3 ++- 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cmd/entire/cli/checkpoint/committed.go b/cmd/entire/cli/checkpoint/committed.go index 465cc2a2f..6b5a9b887 100644 --- a/cmd/entire/cli/checkpoint/committed.go +++ b/cmd/entire/cli/checkpoint/committed.go @@ -43,12 +43,26 @@ import ( // errStopIteration is used to stop commit iteration early in GetCheckpointAuthor. var errStopIteration = errors.New("stop iteration") +type commitSigningDisabledContextKey struct{} + // chunkTranscript is an indirection over agent.ChunkTranscript so tests can // count or intercept chunking calls (e.g., to verify the short-circuit avoids // re-chunking identical content). Production code paths always use the // unwrapped function. 
var chunkTranscript = agent.ChunkTranscript +// WithCommitSigningDisabled returns a context that prevents metadata branch +// commit signing. Use for replay/migration writes whose author line is sourced +// from historical data rather than the local operator. +func WithCommitSigningDisabled(ctx context.Context) context.Context { + return context.WithValue(ctx, commitSigningDisabledContextKey{}, true) +} + +func commitSigningDisabled(ctx context.Context) bool { + disabled, ok := ctx.Value(commitSigningDisabledContextKey{}).(bool) + return ok && disabled +} + // WriteCommitted writes a committed checkpoint to the entire/checkpoints/v1 branch. // Checkpoints are stored at sharded paths: // // @@ -1985,6 +1999,9 @@ func createCommitObject(ctx context.Context, repo *git.Repository, treeHash, par // If signing is disabled, no signer can be created, or signing fails, the commit // is left unsigned and the error is logged. func SignCommitBestEffort(ctx context.Context, commit *object.Commit) { + if commitSigningDisabled(ctx) { + return + } if !settings.IsSignCheckpointCommitsEnabled(ctx) { return } diff --git a/cmd/entire/cli/checkpoint/committed_signing_test.go b/cmd/entire/cli/checkpoint/committed_signing_test.go index 6cc7e8278..927537402 100644 --- a/cmd/entire/cli/checkpoint/committed_signing_test.go +++ b/cmd/entire/cli/checkpoint/committed_signing_test.go @@ -100,6 +100,22 @@ func TestSignCommitBestEffort_SkipsWhenDisabled(t *testing.T) { //nolint:paralle } } +func TestSignCommitBestEffort_SkipsWhenContextDisabled(t *testing.T) { //nolint:paralleltest // t.Chdir requires non-parallel + setupSigningEnv(t, false) + + objectSignerLoader = func(context.Context) (plugin.Signer, bool) { + t.Fatal("signer should not be called when commit signing is disabled by context") + return nil, true + } + + commit := newTestCommit() + SignCommitBestEffort(WithCommitSigningDisabled(context.Background()), commit) + + if commit.Signature != "" { + t.Errorf("expected empty signature, got 
%q", commit.Signature) + } +} + func TestSignCommitBestEffort_ErrorIsBestEffort(t *testing.T) { //nolint:paralleltest // t.Chdir requires non-parallel setupSigningEnv(t, false) diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index fbb4d4277..9ac00b0dc 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -433,6 +433,7 @@ func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { require.Equal(t, v2AuthorName, commit.Author.Name) require.Equal(t, v2AuthorEmail, commit.Author.Email) require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) + require.Empty(t, commit.Signature) } func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { @@ -476,6 +477,7 @@ func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { require.Equal(t, v2AuthorName, commit.Author.Name) require.Equal(t, v2AuthorEmail, commit.Author.Email) require.True(t, commit.Author.When.Equal(v2AuthorWhen), "author time = %s, want %s", commit.Author.When, v2AuthorWhen) + require.Empty(t, commit.Signature) stdout.Reset() err = run(context.Background(), []string{ diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 409cea0ee..6c843ee68 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -147,7 +147,8 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, sessionIndex, err) } writeOpts := writeOptionsFromV2Content(content, summary, author) - if err := m.v1Store.WriteCommitted(ctx, writeOpts); err != nil { + writeCtx := checkpoint.WithCommitSigningDisabled(ctx) + if err := m.v1Store.WriteCommitted(writeCtx, writeOpts); err != nil { return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session 
%d: %w", discovered.ID, sessionIndex, err) } m.report.MigratedSessions++ From 118296fe27254f019f92fd83c446902a7e89a23e Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 15:55:31 -0700 Subject: [PATCH 29/35] Document unsigned migration commits Entire-Checkpoint: 54aa5a6f6fb1 --- cmd/migrate-v2-checkpoints/VALIDATION.md | 28 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index cb8ceb730..54da9e647 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -628,6 +628,11 @@ git -C "$REPO" log --format='%h %ci %s' \ and v2 attributes the same checkpoint session to the same author. §5.6 treats the `author` header in `entire explain` as a required check; a mismatch is a regression, not an accepted divergence. +- **Migration commits are unsigned.** The tool disables checkpoint commit + signing for migrated writes, even if normal checkpoint signing is enabled + in the repo. The v1 author line is replayed from v2 history; adding a local + operator signature to that replayed author would be misleading. Any signed + migrated v1 commit is a bug. - **Roll back** by resetting v1 back to `$PRE_APPLY_TIP`: ```sh @@ -875,20 +880,26 @@ git -C "$REPO" cat-file -p "entire/checkpoints/v1:$SHARD/$V1_SLOT/metadata.json" > /dev/null && echo OK ``` -Author parity for the metadata-branch commit: +Author and signature status for the metadata-branch commit: ```sh V2_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ refs/entire/checkpoints/v2/main -- "$SHARD/$V2_SLOT/metadata.json") V1_AUTHOR=$(git -C "$REPO" log -1 --format='%an <%ae> %aI' \ entire/checkpoints/v1 -- "$SHARD/$V1_SLOT/metadata.json") +V1_SIGNATURE_STATUS=$(git -C "$REPO" log -1 --format='%G?' 
\ + entire/checkpoints/v1 -- "$SHARD/$V1_SLOT/metadata.json") echo "v2: $V2_AUTHOR" echo "v1: $V1_AUTHOR" [ "$V1_AUTHOR" = "$V2_AUTHOR" ] && echo OK || echo MISMATCH + +echo "v1 signature status: $V1_SIGNATURE_STATUS" +[ "$V1_SIGNATURE_STATUS" = "N" ] && echo OK || echo MISMATCH ``` -Expected: exact match. For orphan candidates this is still valid: the v2 +Expected: exact author match, and v1 signature status `N` (`%G? = N` means +no signature). For orphan candidates the author check is still valid: the v2 `/main` path history is the source of the author line even though no user commit trailer exists. @@ -1257,6 +1268,7 @@ from §4's "Behavior notes" — cheap and local, because you did not push. | `sessions=N` for a candidate doesn't match the §3.5 expected | Either v1 already has the session (so report should have lower N), or session IDs are non-unique within v2. | Inspect; non-unique session IDs are a v2 corruption. | | Post-apply, `content_hash.txt` ≠ recomputed SHA-256 | Codex agent + ours-vs-original sanitization difference, OR a bug. Confirm `agent` field on the session. | If non-Codex, file a bug with chunk listing + bytes. | | Post-apply, `content_hash.txt` matches but v2's `raw_transcript_hash.txt` doesn't | Codex sanitization (expected) OR transcript was rewritten in transit. Confirm agent first. | If non-Codex, file a bug. | +| Post-apply, migrated v1 commit has signature status other than `N` | The migration signed a replayed-author commit. This should not happen. | File a bug and do not publish the migrated v1 branch until re-run with an unsigned tool. | | Re-running `--dry-run` after `--apply` still lists the same candidates | Apply failed silently or didn't get pushed before re-fetch. Look at the `migrated sessions` count. | Re-run with verbose logging; check that v1 branch actually advanced. | The report does not enumerate the exact checkpoint/session IDs behind `M1`, @@ -1301,15 +1313,17 @@ refs as shown in §3.3 and §3.5. `fetchV1Ref` (line 70). 
- Migration loop: `cmd/migrate-v2-checkpoints/migration.go` — `migrateDiscoveredCheckpoints` (line 53), `migrateCheckpoint` - (line 96), `writeOptionsFromV2Content` (line 216), - `writeMigrationReport` (line 251), `candidateCommitLabel` (line 290, + (line 96, disables commit signing before v1 writes), + `writeOptionsFromV2Content` (line 217), + `writeMigrationReport` (line 252), `candidateCommitLabel` (line 291, emits `(orphan)`). - v2 session author lookup: `cmd/migrate-v2-checkpoints/v2_author.go` — `findV2SessionAuthor` (line 19), `v2SessionMetadataPath` (line 59). - v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` - (line 58), `writeStandardCheckpointEntries` (line 310), - `writeSessionToSubdirectory` (line 404), `writeTranscript` (line 720), - `findSessionIndex` (line 610). + (line 72), `WithCommitSigningDisabled` (line 57), + `writeStandardCheckpointEntries` (line 324), + `writeSessionToSubdirectory` (line 418), `writeTranscript` (line 741), + `findSessionIndex` (line 631), `SignCommitBestEffort` (line 2001). 
- v2 read: `cmd/entire/cli/checkpoint/v2_read.go` — `ReadCommitted` (line 26), `ReadSessionMetadataAndPrompts` (line 205), `ReadSessionContent` (line 274), `readTranscriptFromFullRefs` From 0b2a750eb88dad310b4b81deef14bc4083a42d02 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 16:54:42 -0700 Subject: [PATCH 30/35] Ignore v2 merge commits for session authors Entire-Checkpoint: 912a3eb7de7e --- cmd/migrate-v2-checkpoints/v2_author.go | 8 + cmd/migrate-v2-checkpoints/v2_author_test.go | 171 +++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 cmd/migrate-v2-checkpoints/v2_author_test.go diff --git a/cmd/migrate-v2-checkpoints/v2_author.go b/cmd/migrate-v2-checkpoints/v2_author.go index bacf40172..bcb95ede0 100644 --- a/cmd/migrate-v2-checkpoints/v2_author.go +++ b/cmd/migrate-v2-checkpoints/v2_author.go @@ -44,6 +44,14 @@ func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpo if err := ctx.Err(); err != nil { return err //nolint:wrapcheck // Propagating context cancellation } + if commit.NumParents() > 1 { + return nil + } + if _, err := commit.File(metadataPath); errors.Is(err, object.ErrFileNotFound) { + return nil + } else if err != nil { + return fmt.Errorf("read %s from %s: %w", metadataPath, commit.Hash, err) + } author = commit.Author return errFoundV2SessionAuthor }) diff --git a/cmd/migrate-v2-checkpoints/v2_author_test.go b/cmd/migrate-v2-checkpoints/v2_author_test.go new file mode 100644 index 000000000..06fb301ac --- /dev/null +++ b/cmd/migrate-v2-checkpoints/v2_author_test.go @@ -0,0 +1,171 @@ +package main + +import ( + "context" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/checkpoint" + checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/cmd/entire/cli/testutil" + "github.com/stretchr/testify/require" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" 
+ "github.com/go-git/go-git/v6/plumbing/filemode" + "github.com/go-git/go-git/v6/plumbing/object" +) + +func TestFindV2SessionAuthorSkipsV2MainMergeCommits(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("aaaaaaaaaaaa") + checkpointAuthor := object.Signature{ + Name: "Checkpoint Author", + Email: "checkpoint@example.com", + When: time.Date(2024, 5, 11, 17, 19, 31, 0, time.UTC), + } + baseAuthor := object.Signature{ + Name: "Base Author", + Email: "base@example.com", + When: checkpointAuthor.When.Add(-48 * time.Hour), + } + baseHash := writeTestEmptyV2MainCommit(t, repo, nil, baseAuthor, "base v2/main") + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: time.Date(2024, 5, 20, 16, 0, 6, 0, time.UTC), + } + writeTestV2Checkpoint(t, repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-with-merge-history", + AuthorName: checkpointAuthor.Name, + AuthorEmail: checkpointAuthor.Email, + AuthorWhen: checkpointAuthor.When, + }) + ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) + require.NoError(t, err) + writeTestV2MergeCommitWithCheckpointParent(t, repo, baseHash, ref.Hash(), mergeAuthor) + + author, err := findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.NoError(t, err) + require.Equal(t, checkpointAuthor.Name, author.Name) + require.Equal(t, checkpointAuthor.Email, author.Email) + require.True(t, author.When.Equal(checkpointAuthor.When), "author time = %s, want %s", author.When, checkpointAuthor.When) +} + +func TestFindV2SessionAuthorReturnsNotFoundWhenOnlyMergeTouchedPath(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("bbbbbbbbbbbb") + metadataPath := v2SessionMetadataPath(cpID, 0) + metadataBlob, err := checkpoint.CreateBlobFromContent(repo, []byte("{}\n")) + require.NoError(t, err) + treeWithMetadata, err := 
checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{ + metadataPath: { + Name: metadataPath, + Mode: filemode.Regular, + Hash: metadataBlob, + }, + }) + require.NoError(t, err) + + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{}) + require.NoError(t, err) + parentAuthor := object.Signature{ + Name: "Parent Author", + Email: "parent@example.com", + When: time.Date(2024, 5, 10, 10, 0, 0, 0, time.UTC), + } + firstParent := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: parentAuthor, + Committer: parentAuthor, + Message: "parent one", + }) + secondParent := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: parentAuthor, + Committer: parentAuthor, + Message: "parent two", + }) + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: time.Date(2024, 5, 20, 16, 0, 6, 0, time.UTC), + } + mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: treeWithMetadata, + ParentHashes: []plumbing.Hash{firstParent, secondParent}, + Author: mergeAuthor, + Committer: mergeAuthor, + Message: "Merge remote v2/main", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), mergeHash))) + + _, err = findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.ErrorContains(t, err, metadataPath+" not found in "+paths.V2MainRefName+" history") +} + +func setupV2AuthorRepo(t *testing.T) *git.Repository { + t.Helper() + + dir := t.TempDir() + testutil.InitRepo(t, dir) + repo, err := git.PlainOpen(dir) + require.NoError(t, err) + return repo +} + +func writeTestEmptyV2MainCommit(t *testing.T, repo *git.Repository, parentHashes []plumbing.Hash, author object.Signature, message string) plumbing.Hash { + t.Helper() + + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, 
map[string]object.TreeEntry{}) + require.NoError(t, err) + hash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + ParentHashes: parentHashes, + Author: author, + Committer: author, + Message: message, + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), hash))) + return hash +} + +func writeTestV2MergeCommitWithCheckpointParent(t *testing.T, repo *git.Repository, baseHash, checkpointParent plumbing.Hash, author object.Signature) plumbing.Hash { + t.Helper() + + checkpointCommit, err := repo.CommitObject(checkpointParent) + require.NoError(t, err) + mainParentAuthor := object.Signature{ + Name: "Main Parent", + Email: "main-parent@example.com", + When: author.When.Add(-24 * time.Hour), + } + mainParent := writeTestEmptyV2MainCommit(t, repo, []plumbing.Hash{baseHash}, mainParentAuthor, "main-side parent") + mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: checkpointCommit.TreeHash, + ParentHashes: []plumbing.Hash{mainParent, checkpointParent}, + Author: author, + Committer: author, + Message: "Merge remote v2/main", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), mergeHash))) + return mergeHash +} + +func writeTestCommitObject(t *testing.T, repo *git.Repository, commit *object.Commit) plumbing.Hash { + t.Helper() + + encoded := repo.Storer.NewEncodedObject() + require.NoError(t, commit.Encode(encoded)) + hash, err := repo.Storer.SetEncodedObject(encoded) + require.NoError(t, err) + return hash +} From 685057192ef1ee164743e995ce7958040167f824 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 17:15:34 -0700 Subject: [PATCH 31/35] Fix migration data preservation gaps Entire-Checkpoint: 70b0acc0e628 --- cmd/entire/cli/checkpoint/committed.go | 104 +++++++++++++++--- cmd/entire/cli/checkpoint/v2_store.go | 16 ++- cmd/entire/cli/checkpoint/v2_store_test.go | 7 +- 
cmd/migrate-v2-checkpoints/main_test.go | 95 ++++++++++++++++ cmd/migrate-v2-checkpoints/migration.go | 38 +++++-- cmd/migrate-v2-checkpoints/v2_fixture_test.go | 6 + 6 files changed, 235 insertions(+), 31 deletions(-) diff --git a/cmd/entire/cli/checkpoint/committed.go b/cmd/entire/cli/checkpoint/committed.go index 6b5a9b887..e5c8b9ba5 100644 --- a/cmd/entire/cli/checkpoint/committed.go +++ b/cmd/entire/cli/checkpoint/committed.go @@ -381,12 +381,16 @@ func (s *GitStore) writeStandardCheckpointEntries(ctx context.Context, opts Writ // Build the sessions array var sessions []SessionFilePaths if existingSummary != nil { - sessions = make([]SessionFilePaths, max(len(existingSummary.Sessions), sessionIndex+1)) - copy(sessions, existingSummary.Sessions) + sessions = append([]SessionFilePaths(nil), existingSummary.Sessions...) } else { - sessions = make([]SessionFilePaths, 1) + sessions = []SessionFilePaths{} } - sessions[sessionIndex] = sessionFilePaths + if position := sessionFilePathsPosition(basePath, sessions, sessionIndex); position >= 0 { + sessions[position] = sessionFilePaths + } else { + sessions = append(sessions, sessionFilePaths) + } + sortSessionFilePaths(basePath, sessions) // Tripwire: an unreproduced production report had session 0 silently // replaced with a different sessionID's data. The symptom was @@ -502,7 +506,7 @@ func (s *GitStore) writeSessionToSubdirectory(ctx context.Context, opts WriteCom // writeCheckpointSummary writes the root-level CheckpointSummary with aggregated statistics. // sessions is the complete sessions array (already built by the caller). 
func (s *GitStore) writeCheckpointSummary(opts WriteCommittedOptions, basePath string, entries map[string]object.TreeEntry, sessions []SessionFilePaths) error { - checkpointsCount, filesTouched, tokenUsage, err := s.reaggregateFromEntries(basePath, len(sessions), entries) + checkpointsCount, filesTouched, tokenUsage, err := s.reaggregateFromEntries(basePath, sessions, entries) if err != nil { return fmt.Errorf("failed to aggregate session stats: %w", err) } @@ -632,8 +636,15 @@ func (s *GitStore) findSessionIndex(ctx context.Context, basePath string, existi if existingSummary == nil { return 0 } - for i := range len(existingSummary.Sessions) { - path := fmt.Sprintf("%s%d/%s", basePath, i, paths.MetadataFileName) + usedIndexes := make(map[int]struct{}, len(existingSummary.Sessions)) + for summaryIndex, sessionPaths := range existingSummary.Sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if !ok { + sessionIndex = summaryIndex + } + usedIndexes[sessionIndex] = struct{}{} + + path := fmt.Sprintf("%s%d/%s", basePath, sessionIndex, paths.MetadataFileName) entry, exists := entries[path] if !exists { continue @@ -641,35 +652,47 @@ func (s *GitStore) findSessionIndex(ctx context.Context, basePath string, existi meta, err := s.readMetadataFromBlob(entry.Hash) if err != nil { logging.Warn(ctx, "failed to read session metadata during dedup check", - slog.Int("session_index", i), + slog.Int("session_index", sessionIndex), slog.String("session_id", sessionID), slog.String("error", err.Error()), ) continue } if meta.SessionID == sessionID { - return i + return sessionIndex } } - return len(existingSummary.Sessions) + for sessionIndex := 0; ; sessionIndex++ { + if _, used := usedIndexes[sessionIndex]; used { + continue + } + if sessionPathHasEntries(basePath, sessionIndex, entries) { + continue + } + return sessionIndex + } } // reaggregateFromEntries reads all session metadata from the entries map and // reaggregates CheckpointsCount, 
FilesTouched, and TokenUsage. -func (s *GitStore) reaggregateFromEntries(basePath string, sessionCount int, entries map[string]object.TreeEntry) (int, []string, *agent.TokenUsage, error) { +func (s *GitStore) reaggregateFromEntries(basePath string, sessions []SessionFilePaths, entries map[string]object.TreeEntry) (int, []string, *agent.TokenUsage, error) { var totalCount int var allFiles []string var totalTokens *agent.TokenUsage - for i := range sessionCount { - path := fmt.Sprintf("%s%d/%s", basePath, i, paths.MetadataFileName) + for summaryIndex, sessionPaths := range sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if !ok { + return 0, nil, nil, fmt.Errorf("session %d metadata path %q is invalid", summaryIndex, sessionPaths.Metadata) + } + path := fmt.Sprintf("%s%d/%s", basePath, sessionIndex, paths.MetadataFileName) entry, exists := entries[path] if !exists { - return 0, nil, nil, fmt.Errorf("session %d metadata not found at %s", i, path) + return 0, nil, nil, fmt.Errorf("session %d metadata not found at %s", summaryIndex, path) } meta, err := s.readMetadataFromBlob(entry.Hash) if err != nil { - return 0, nil, nil, fmt.Errorf("failed to read session %d metadata: %w", i, err) + return 0, nil, nil, fmt.Errorf("failed to read session %d metadata: %w", summaryIndex, err) } totalCount += meta.CheckpointsCount allFiles = mergeFilesTouched(allFiles, meta.FilesTouched) @@ -679,6 +702,57 @@ func (s *GitStore) reaggregateFromEntries(basePath string, sessionCount int, ent return totalCount, allFiles, totalTokens, nil } +func sessionFilePathsPosition(basePath string, sessions []SessionFilePaths, targetIndex int) int { + for i, sessionPaths := range sessions { + sessionIndex, ok := sessionIndexFromFilePaths(basePath, sessionPaths) + if ok && sessionIndex == targetIndex { + return i + } + } + return -1 +} + +func sortSessionFilePaths(basePath string, sessions []SessionFilePaths) { + sort.SliceStable(sessions, func(i, j int) bool { + left, 
leftOK := sessionIndexFromFilePaths(basePath, sessions[i]) + right, rightOK := sessionIndexFromFilePaths(basePath, sessions[j]) + if !leftOK || !rightOK { + return leftOK + } + return left < right + }) +} + +func sessionIndexFromFilePaths(basePath string, sessionPaths SessionFilePaths) (int, bool) { + if sessionPaths.Metadata == "" { + return 0, false + } + metadataPath := strings.TrimPrefix(sessionPaths.Metadata, "/") + relativePath, ok := strings.CutPrefix(metadataPath, basePath) + if !ok { + return 0, false + } + sessionDir, fileName, ok := strings.Cut(relativePath, "/") + if !ok || fileName != paths.MetadataFileName { + return 0, false + } + sessionIndex, err := strconv.Atoi(sessionDir) + if err != nil || sessionIndex < 0 { + return 0, false + } + return sessionIndex, true +} + +func sessionPathHasEntries(basePath string, sessionIndex int, entries map[string]object.TreeEntry) bool { + prefix := fmt.Sprintf("%s%d/", basePath, sessionIndex) + for path := range entries { + if strings.HasPrefix(path, prefix) { + return true + } + } + return false +} + func checkpointCreatedAt(opts WriteCommittedOptions) time.Time { if opts.CreatedAt.IsZero() { return time.Now().UTC() diff --git a/cmd/entire/cli/checkpoint/v2_store.go b/cmd/entire/cli/checkpoint/v2_store.go index 0c64581bd..c17700517 100644 --- a/cmd/entire/cli/checkpoint/v2_store.go +++ b/cmd/entire/cli/checkpoint/v2_store.go @@ -23,8 +23,9 @@ import ( // V2GitStore is separate from GitStore (v1) to keep concerns isolated // and simplify future v1 removal. type V2GitStore struct { - repo *git.Repository - gs *GitStore // shared entry-building helpers (same package) + repo *git.Repository + repoRoot string + gs *GitStore // shared entry-building helpers (same package) // blobFetcher fetches missing blobs by hash. 
When set, read paths wrap // trees with FetchingTree so missing blobs are auto-recovered (and the @@ -35,10 +36,14 @@ type V2GitStore struct { // NewV2GitStore creates a new v2 checkpoint store backed by the given git repository. func NewV2GitStore(repo *git.Repository) *V2GitStore { - return &V2GitStore{ + store := &V2GitStore{ repo: repo, gs: &GitStore{repo: repo}, } + if worktree, err := repo.Worktree(); err == nil { + store.repoRoot = worktree.Filesystem().Root() + } + return store } // SetBlobFetcher configures the store to automatically fetch missing blobs @@ -69,7 +74,7 @@ func (s *V2GitStore) GetRefState(refName plumbing.ReferenceName) (parentHash, tr commit, err := s.repo.CommitObject(ref.Hash()) if err != nil { - cliTreeHash, cliErr := commitTreeHashViaCLI(context.Background(), ref.Hash()) + cliTreeHash, cliErr := commitTreeHashViaCLI(context.Background(), s.repoRoot, ref.Hash()) if cliErr != nil { return plumbing.ZeroHash, plumbing.ZeroHash, fmt.Errorf("failed to get commit for ref %s: %w", refName, errors.Join(err, cliErr)) } @@ -86,8 +91,9 @@ func (s *V2GitStore) GetRefState(refName plumbing.ReferenceName) (parentHash, tr // commitTreeHashViaCLI resolves the tree hash of a commit via // `git rev-parse ^{tree}`. See GetRefState for the rationale. 
-func commitTreeHashViaCLI(ctx context.Context, commitHash plumbing.Hash) (plumbing.Hash, error) { +func commitTreeHashViaCLI(ctx context.Context, repoRoot string, commitHash plumbing.Hash) (plumbing.Hash, error) { cmd := exec.CommandContext(ctx, "git", "rev-parse", commitHash.String()+"^{tree}") + cmd.Dir = repoRoot output, err := cmd.Output() if err != nil { return plumbing.ZeroHash, fmt.Errorf("git rev-parse %s^{tree}: %w", commitHash.String()[:12], err) diff --git a/cmd/entire/cli/checkpoint/v2_store_test.go b/cmd/entire/cli/checkpoint/v2_store_test.go index d964e8fe9..712996dc9 100644 --- a/cmd/entire/cli/checkpoint/v2_store_test.go +++ b/cmd/entire/cli/checkpoint/v2_store_test.go @@ -64,7 +64,7 @@ func TestV2GitStore_GetRefState_FallsBackToGitCLIWhenCommitObjectMissing(t *test testutil.WriteFile(t, dir, "README.md", "init") testutil.GitAdd(t, dir, "README.md") testutil.GitCommit(t, dir, "initial") - t.Chdir(dir) + t.Chdir(t.TempDir()) repo, err := git.PlainOpen(dir) require.NoError(t, err) @@ -81,12 +81,13 @@ func TestV2GitStore_GetRefState_FallsBackToGitCLIWhenCommitObjectMissing(t *test commit, err := repo.CommitObject(ref.Hash()) require.NoError(t, err) - store := NewV2GitStore(&git.Repository{ + store := NewV2GitStore(repo) + store.repo = &git.Repository{ Storer: commitObjectMissingStorer{ Storer: repo.Storer, missing: ref.Hash(), }, - }) + } parentHash, treeHash, err := store.GetRefState(refName) require.NoError(t, err) require.Equal(t, ref.Hash(), parentHash) diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 9ac00b0dc..7363b9be1 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -491,6 +491,36 @@ func TestRunApplyMigratesV2OrphanCheckpointAndIsIdempotent(t *testing.T) { require.NotContains(t, stdout.String(), cpID.String()+" sessions=1 commits=(orphan)") } +func TestRunApplyMigratesInvestigationMetadata(t *testing.T) { + t.Parallel() + + fixture := 
setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "investigation-session", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"investigate\"}\n")), + Kind: string(session.KindAgentInvestigate), + InvestigateRunID: "0123456789ab", + InvestigateTopic: "Why is checkout flaky?", + HasInvestigation: true, + }) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "migrated sessions: 1") + + v1Store := checkpoint.NewGitStore(fixture.repo) + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.True(t, summary.HasInvestigation) + + content, err := v1Store.ReadSessionContent(context.Background(), cpID, 0) + require.NoError(t, err) + require.Equal(t, string(session.KindAgentInvestigate), content.Metadata.Kind) + require.Equal(t, "0123456789ab", content.Metadata.InvestigateRunID) + require.Equal(t, "Why is checkout flaky?", content.Metadata.InvestigateTopic) +} + func TestRunDryRunPlansWithoutWritingV1(t *testing.T) { t.Parallel() @@ -633,6 +663,65 @@ func TestRunDryRunReadsSparseExistingV1SessionPaths(t *testing.T) { require.Contains(t, stdout, "checkpoints eligible for migration: 0") } +func TestRunApplyAppendsSparseExistingV1SessionWithoutOverwriting(t *testing.T) { + t.Parallel() + + fixture := setupMigrationHistoryRepo(t) + cpID := id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 existing\"}\n")), + }) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-v2-new", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"from v2 new\"}\n")), + }) + + v1Store := checkpoint.NewGitStore(fixture.repo) + 
require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-zero", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 zero\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + require.NoError(t, v1Store.WriteCommitted(context.Background(), checkpoint.WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-existing-two", + Strategy: testStrategy, + Branch: testBranchName, + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"already v1 two\"}\n")), + AuthorName: testAuthorName, + AuthorEmail: testAuthorEmail, + })) + rewriteV1SecondSessionToSparseSlot(t, fixture.repo, cpID) + + stdout := runMigrationCommand(t, fixture, fixture.mainHash, "--apply") + require.Contains(t, stdout, "already present v1 sessions: 1") + require.Contains(t, stdout, "migrated sessions: 1") + + preserved, err := v1Store.ReadSessionContent(context.Background(), cpID, 2) + require.NoError(t, err) + require.Equal(t, "session-existing-two", preserved.Metadata.SessionID) + require.JSONEq(t, `{"message":"already v1 two"}`, string(preserved.Transcript)) + + migrated, err := v1Store.ReadSessionContent(context.Background(), cpID, 1) + require.NoError(t, err) + require.Equal(t, "session-v2-new", migrated.Metadata.SessionID) + require.JSONEq(t, `{"message":"from v2 new"}`, string(migrated.Transcript)) + + summary, err := v1Store.ReadCommitted(context.Background(), cpID) + require.NoError(t, err) + require.Len(t, summary.Sessions, 3) + require.Equal(t, "/"+cpID.Path()+"/0/metadata.json", summary.Sessions[0].Metadata) + require.Equal(t, "/"+cpID.Path()+"/1/metadata.json", summary.Sessions[1].Metadata) + require.Equal(t, "/"+cpID.Path()+"/2/metadata.json", summary.Sessions[2].Metadata) +} + func TestRunApplyMigratesTaskMetadata(t *testing.T) { t.Parallel() @@ -674,6 +763,11 @@ func 
TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { CheckpointID: cpID, SessionID: "normal-session", Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"normal\"}\n")), + CombinedAttribution: &checkpoint.InitialAttribution{ + AgentLines: 12, + TotalLinesChanged: 12, + AgentPercentage: 100, + }, }) writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ CheckpointID: cpID, @@ -694,6 +788,7 @@ func TestRunApplyHasReviewReflectsOnlyMigratedSessions(t *testing.T) { require.NoError(t, err) require.NotNil(t, summary) require.False(t, summary.HasReview) + require.Nil(t, summary.CombinedAttribution) require.Len(t, summary.Sessions, 1) } diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 6c843ee68..5bc8b3545 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -113,17 +113,24 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di } eligibleSessions := 0 + canPreserveCombinedAttribution := true + var eligibleContents []struct { + sessionIndex int + content *checkpoint.SessionContent + } for sessionIndex := range summary.Sessions { metadataContent, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) if err != nil { if errors.Is(err, checkpoint.ErrCheckpointNotFound) { m.report.MissingV2SessionMetadata++ + canPreserveCombinedAttribution = false continue } return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d metadata: %w", discovered.ID, sessionIndex, err) } if !hasRequiredV2Metadata(metadataContent) { m.report.MissingV2SessionMetadata++ + canPreserveCombinedAttribution = false continue } if _, exists := existingSessionIDs[metadataContent.Metadata.SessionID]; exists { @@ -135,25 +142,37 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di if err != nil { if errors.Is(err, checkpoint.ErrNoTranscript) { m.report.MissingRawTranscripts++ + 
canPreserveCombinedAttribution = false continue } return eligibleSessions, fmt.Errorf("read v2 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) } m.report.EligibleSessions++ - if m.opts.apply { - author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, sessionIndex) + eligibleContents = append(eligibleContents, struct { + sessionIndex int + content *checkpoint.SessionContent + }{sessionIndex: sessionIndex, content: content}) + eligibleSessions++ + } + + if m.opts.apply { + combinedAttribution := summary.CombinedAttribution + if !canPreserveCombinedAttribution { + combinedAttribution = nil + } + for _, eligibleContent := range eligibleContents { + author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, eligibleContent.sessionIndex) if err != nil { - return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, sessionIndex, err) + return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, eligibleContent.sessionIndex, err) } - writeOpts := writeOptionsFromV2Content(content, summary, author) + writeOpts := writeOptionsFromV2Content(eligibleContent.content, combinedAttribution, author) writeCtx := checkpoint.WithCommitSigningDisabled(ctx) if err := m.v1Store.WriteCommitted(writeCtx, writeOpts); err != nil { - return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, sessionIndex, err) + return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, eligibleContent.sessionIndex, err) } m.report.MigratedSessions++ } - eligibleSessions++ } return eligibleSessions, nil } @@ -214,7 +233,7 @@ func hasRequiredV2Metadata(content *checkpoint.SessionContent) bool { return !content.Metadata.CheckpointID.IsEmpty() && content.Metadata.SessionID != "" } -func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *checkpoint.CheckpointSummary, author object.Signature) 
checkpoint.WriteCommittedOptions { +func writeOptionsFromV2Content(content *checkpoint.SessionContent, combinedAttribution *checkpoint.InitialAttribution, author object.Signature) checkpoint.WriteCommittedOptions { meta := content.Metadata return checkpoint.WriteCommittedOptions{ CheckpointID: meta.CheckpointID, @@ -240,12 +259,15 @@ func writeOptionsFromV2Content(content *checkpoint.SessionContent, summary *chec SessionMetrics: meta.SessionMetrics, InitialAttribution: meta.InitialAttribution, PromptAttributionsJSON: meta.PromptAttributions, - CombinedAttribution: summary.CombinedAttribution, + CombinedAttribution: combinedAttribution, Summary: meta.Summary, Kind: meta.Kind, ReviewSkills: meta.ReviewSkills, ReviewPrompt: meta.ReviewPrompt, HasReview: session.Kind(meta.Kind).IsReview(), + InvestigateRunID: meta.InvestigateRunID, + InvestigateTopic: meta.InvestigateTopic, + HasInvestigation: session.Kind(meta.Kind).IsInvestigate(), } } diff --git a/cmd/migrate-v2-checkpoints/v2_fixture_test.go b/cmd/migrate-v2-checkpoints/v2_fixture_test.go index ae8356c05..a23de0bef 100644 --- a/cmd/migrate-v2-checkpoints/v2_fixture_test.go +++ b/cmd/migrate-v2-checkpoints/v2_fixture_test.go @@ -57,6 +57,9 @@ type testV2CheckpointOptions struct { ReviewSkills []string ReviewPrompt string HasReview bool + InvestigateRunID string + InvestigateTopic string + HasInvestigation bool } func writeTestV2Checkpoint(t *testing.T, repo *git.Repository, opts testV2CheckpointOptions) { @@ -104,6 +107,7 @@ func writeTestV2MainCheckpoint(t *testing.T, repo *git.Repository, opts testV2Ch TokenUsage: opts.TokenUsage, CombinedAttribution: opts.CombinedAttribution, HasReview: opts.HasReview, + HasInvestigation: opts.HasInvestigation, } if entry, ok := entries[basePath+paths.MetadataFileName]; ok { existing := readTestJSONFromBlob[checkpoint.CheckpointSummary](t, repo, entry.Hash) @@ -179,6 +183,8 @@ func writeTestV2MainCheckpoint(t *testing.T, repo *git.Repository, opts testV2Ch Kind: opts.Kind, 
ReviewSkills: opts.ReviewSkills, ReviewPrompt: opts.ReviewPrompt, + InvestigateRunID: opts.InvestigateRunID, + InvestigateTopic: opts.InvestigateTopic, } metadataJSON, err := jsonutil.MarshalIndentWithNewline(metadata, "", " ") require.NoError(t, err) From 092de100b8f72f8b310bb4ec2328ed0916df1291 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Thu, 28 May 2026 17:22:50 -0700 Subject: [PATCH 32/35] Name eligible v2 session struct Entire-Checkpoint: eedb9b4ae0cc --- cmd/migrate-v2-checkpoints/migration.go | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 5bc8b3545..5a777ae40 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -114,10 +114,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di eligibleSessions := 0 canPreserveCombinedAttribution := true - var eligibleContents []struct { - sessionIndex int - content *checkpoint.SessionContent - } + var eligible []eligibleV2Session for sessionIndex := range summary.Sessions { metadataContent, err := m.v2Store.ReadSessionMetadataAndPrompts(ctx, discovered.ID, sessionIndex) if err != nil { @@ -149,10 +146,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di } m.report.EligibleSessions++ - eligibleContents = append(eligibleContents, struct { - sessionIndex int - content *checkpoint.SessionContent - }{sessionIndex: sessionIndex, content: content}) + eligible = append(eligible, eligibleV2Session{sessionIndex: sessionIndex, content: content}) eligibleSessions++ } @@ -161,15 +155,15 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di if !canPreserveCombinedAttribution { combinedAttribution = nil } - for _, eligibleContent := range eligibleContents { - author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, eligibleContent.sessionIndex) + for 
_, entry := range eligible { + author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, entry.sessionIndex) if err != nil { - return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, eligibleContent.sessionIndex, err) + return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, entry.sessionIndex, err) } - writeOpts := writeOptionsFromV2Content(eligibleContent.content, combinedAttribution, author) + writeOpts := writeOptionsFromV2Content(entry.content, combinedAttribution, author) writeCtx := checkpoint.WithCommitSigningDisabled(ctx) if err := m.v1Store.WriteCommitted(writeCtx, writeOpts); err != nil { - return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, eligibleContent.sessionIndex, err) + return eligibleSessions, fmt.Errorf("write v1 checkpoint %s session %d: %w", discovered.ID, entry.sessionIndex, err) } m.report.MigratedSessions++ } @@ -177,6 +171,11 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return eligibleSessions, nil } +type eligibleV2Session struct { + sessionIndex int + content *checkpoint.SessionContent +} + func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { existing := make(map[string]struct{}) if summary == nil { From be2f7e8679a3cbf6017814afe010da0c0ec5171b Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Fri, 29 May 2026 11:06:04 -0700 Subject: [PATCH 33/35] Fix v2 session author lookup Entire-Checkpoint: 13bdc2a14cbd --- .golangci.yaml | 1 + cmd/entire/cli/logging/logger_test.go | 2 +- cmd/entire/cli/settings/settings_test.go | 2 +- cmd/entire/cli/strategy/hook_managers_test.go | 2 +- cmd/entire/cli/strategy/manual_commit_test.go | 2 +- .../cli/versioncheck/versioncheck_test.go | 3 +- cmd/migrate-v2-checkpoints/VALIDATION.md | 13 +- 
cmd/migrate-v2-checkpoints/main_test.go | 1 - cmd/migrate-v2-checkpoints/v2_author.go | 42 +++++-- cmd/migrate-v2-checkpoints/v2_author_test.go | 112 ++++++++++++++++++ 10 files changed, 160 insertions(+), 20 deletions(-) diff --git a/.golangci.yaml b/.golangci.yaml index cfe0ec092..fe2a72394 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -140,6 +140,7 @@ linters: rules: - path: _test\.go linters: + - goconst - gosec - wrapcheck - forbidigo diff --git a/cmd/entire/cli/logging/logger_test.go b/cmd/entire/cli/logging/logger_test.go index 7ef0dbd38..ff6c2172c 100644 --- a/cmd/entire/cli/logging/logger_test.go +++ b/cmd/entire/cli/logging/logger_test.go @@ -14,7 +14,7 @@ import ( "time" ) -// Test constants to avoid goconst warnings +// Shared test literals. const ( testSessionID = "2025-01-15-test-session" testComponent = "hooks" diff --git a/cmd/entire/cli/settings/settings_test.go b/cmd/entire/cli/settings/settings_test.go index 2782cd9df..54ee21486 100644 --- a/cmd/entire/cli/settings/settings_test.go +++ b/cmd/entire/cli/settings/settings_test.go @@ -172,7 +172,7 @@ func TestLoad_AcceptsValidKeys(t *testing.T) { if settings.SummaryGeneration.Provider != "claude-code" { t.Errorf("expected summary_generation.provider 'claude-code', got %q", settings.SummaryGeneration.Provider) } - if settings.SummaryGeneration.Model != "sonnet" { //nolint:goconst // test literal + if settings.SummaryGeneration.Model != "sonnet" { t.Errorf("expected summary_generation.model 'sonnet', got %q", settings.SummaryGeneration.Model) } if settings.Redaction == nil { diff --git a/cmd/entire/cli/strategy/hook_managers_test.go b/cmd/entire/cli/strategy/hook_managers_test.go index 877bb62e5..b60fa25c2 100644 --- a/cmd/entire/cli/strategy/hook_managers_test.go +++ b/cmd/entire/cli/strategy/hook_managers_test.go @@ -55,7 +55,7 @@ func TestDetectHookManagers_Lefthook(t *testing.T) { if len(managers) != 1 { t.Fatalf("expected 1 manager, got %d", len(managers)) } - if managers[0].Name != 
"Lefthook" { //nolint:goconst // test assertion, not a magic string + if managers[0].Name != "Lefthook" { t.Errorf("expected Lefthook, got %s", managers[0].Name) } if managers[0].ConfigPath != "lefthook.yml" { diff --git a/cmd/entire/cli/strategy/manual_commit_test.go b/cmd/entire/cli/strategy/manual_commit_test.go index 39d92ef09..65af4f0ee 100644 --- a/cmd/entire/cli/strategy/manual_commit_test.go +++ b/cmd/entire/cli/strategy/manual_commit_test.go @@ -893,7 +893,7 @@ func TestShadowStrategy_PrepareCommitMsg_SkipsSessionWhenContentCheckFails(t *te func TestAddCheckpointTrailer_NoComment(t *testing.T) { // Test that addCheckpointTrailer adds trailer without any comment lines - message := "Test commit message\n" //nolint:goconst // already present in codebase + message := "Test commit message\n" result := addCheckpointTrailer(message, testTrailerCheckpointID) diff --git a/cmd/entire/cli/versioncheck/versioncheck_test.go b/cmd/entire/cli/versioncheck/versioncheck_test.go index 33afe4c60..85fa5f7fd 100644 --- a/cmd/entire/cli/versioncheck/versioncheck_test.go +++ b/cmd/entire/cli/versioncheck/versioncheck_test.go @@ -341,8 +341,7 @@ func TestParseGitHubRelease(t *testing.T) { } // brewUpgradeCmd is the install command produced for any brew-installed -// binary on a stable channel. Hoisted to a const so tests can reference -// it without tripping goconst on repeated string literals. +// binary on a stable channel. const brewUpgradeCmd = "brew upgrade entire" func TestUpdateCommand(t *testing.T) { diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 54da9e647..56804f2be 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -623,11 +623,11 @@ git -C "$REPO" log --format='%h %ci %s' \ through a checkpoint with multiple eligible sessions, earlier sessions remain written and later sessions reappear on the next run. 
- **v1 commit author matches v2.** Each new v1 commit is authored with - the same name, email, and author timestamp as the v2 `/main` commit that - wrote the migrated session's `metadata.json`, so `git log` against v1 - and v2 attributes the same checkpoint session to the same author. §5.6 - treats the `author` header in `entire explain` as a required check; a - mismatch is a regression, not an accepted divergence. + the same name, email, and author timestamp as the v2 `/main` non-merge + commit that actually changed the migrated session's `metadata.json`. + Later commits that merely carry that path through their tree do not + count. §5.6 treats the `author` header in `entire explain` as a required + check; a mismatch is a regression, not an accepted divergence. - **Migration commits are unsigned.** The tool disables checkpoint commit signing for migrated writes, even if normal checkpoint signing is enabled in the repo. The v1 author line is replayed from v2 history; adding a local @@ -1318,7 +1318,8 @@ refs as shown in §3.3 and §3.5. `writeMigrationReport` (line 252), `candidateCommitLabel` (line 291, emits `(orphan)`). - v2 session author lookup: `cmd/migrate-v2-checkpoints/v2_author.go` — - `findV2SessionAuthor` (line 19), `v2SessionMetadataPath` (line 59). + `findV2SessionAuthor` (line 19), `commitChangedPath` (line 66), + `v2SessionMetadataPath` (line 95). - v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` (line 72), `WithCommitSigningDisabled` (line 57), `writeStandardCheckpointEntries` (line 324), diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 7363b9be1..98cd723c8 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -1,4 +1,3 @@ -//nolint:goconst // Repeated CLI flag literals keep argument-list tests readable. 
package main import ( diff --git a/cmd/migrate-v2-checkpoints/v2_author.go b/cmd/migrate-v2-checkpoints/v2_author.go index bcb95ede0..dac5b1d82 100644 --- a/cmd/migrate-v2-checkpoints/v2_author.go +++ b/cmd/migrate-v2-checkpoints/v2_author.go @@ -30,12 +30,9 @@ func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpo iter, err := repo.Log(&git.LogOptions{ From: ref.Hash(), Order: git.LogOrderCommitterTime, - PathFilter: func(path string) bool { - return path == metadataPath - }, }) if err != nil { - return object.Signature{}, fmt.Errorf("read %s history for %s: %w", paths.V2MainRefName, metadataPath, err) + return object.Signature{}, fmt.Errorf("read %s history: %w", paths.V2MainRefName, err) } defer iter.Close() @@ -47,10 +44,12 @@ func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpo if commit.NumParents() > 1 { return nil } - if _, err := commit.File(metadataPath); errors.Is(err, object.ErrFileNotFound) { + changed, err := commitChangedPath(commit, metadataPath) + if err != nil { + return fmt.Errorf("check %s change in %s: %w", metadataPath, commit.Hash, err) + } + if !changed { return nil - } else if err != nil { - return fmt.Errorf("read %s from %s: %w", metadataPath, commit.Hash, err) } author = commit.Author return errFoundV2SessionAuthor @@ -64,6 +63,35 @@ func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpo return object.Signature{}, fmt.Errorf("%s not found in %s history", metadataPath, paths.V2MainRefName) } +func commitChangedPath(commit *object.Commit, path string) (bool, error) { + file, err := commit.File(path) + if errors.Is(err, object.ErrFileNotFound) { + return false, nil + } + if err != nil { + return false, fmt.Errorf("read file: %w", err) + } + if commit.NumParents() == 0 { + return true, nil + } + if commit.NumParents() > 1 { + return false, nil + } + + parent, err := commit.Parent(0) + if err != nil { + return false, fmt.Errorf("read parent: %w", err) + } + 
parentFile, err := parent.File(path) + if errors.Is(err, object.ErrFileNotFound) { + return true, nil + } + if err != nil { + return false, fmt.Errorf("read parent file: %w", err) + } + return file.Hash != parentFile.Hash || file.Mode != parentFile.Mode, nil +} + func v2SessionMetadataPath(cpID checkpointID.CheckpointID, sessionIndex int) string { return cpID.Path() + "/" + strconv.Itoa(sessionIndex) + "/" + paths.MetadataFileName } diff --git a/cmd/migrate-v2-checkpoints/v2_author_test.go b/cmd/migrate-v2-checkpoints/v2_author_test.go index 06fb301ac..a9c79e3b0 100644 --- a/cmd/migrate-v2-checkpoints/v2_author_test.go +++ b/cmd/migrate-v2-checkpoints/v2_author_test.go @@ -57,6 +57,118 @@ func TestFindV2SessionAuthorSkipsV2MainMergeCommits(t *testing.T) { require.True(t, author.When.Equal(checkpointAuthor.When), "author time = %s, want %s", author.When, checkpointAuthor.When) } +func TestFindV2SessionAuthorSkipsLaterCheckpointCommitsThatOnlyCarryPath(t *testing.T) { + t.Parallel() + + repo := setupV2AuthorRepo(t) + cpID := checkpointID.MustCheckpointID("0b0206eed178") + metadataPath := v2SessionMetadataPath(cpID, 0) + metadataBlob, err := checkpoint.CreateBlobFromContent(repo, []byte(`{"session_id":"original"}`+"\n")) + require.NoError(t, err) + metadataEntries := map[string]object.TreeEntry{ + metadataPath: { + Name: metadataPath, + Mode: filemode.Regular, + Hash: metadataBlob, + }, + } + metadataTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, metadataEntries) + require.NoError(t, err) + emptyTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, map[string]object.TreeEntry{}) + require.NoError(t, err) + + checkpointAuthor := object.Signature{ + Name: "Checkpoint Author", + Email: "checkpoint@example.com", + When: time.Date(2024, 5, 11, 17, 19, 31, 0, time.UTC), + } + baseAuthor := object.Signature{ + Name: "Base Author", + Email: "base@example.com", + When: checkpointAuthor.When.Add(-24 * time.Hour), + } + baseHash := 
writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + Author: baseAuthor, + Committer: baseAuthor, + Message: "base v2/main", + }) + checkpointHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: metadataTree, + ParentHashes: []plumbing.Hash{baseHash}, + Author: checkpointAuthor, + Committer: checkpointAuthor, + Message: "Checkpoint: 0b0206eed178", + }) + checkpointCommit, err := repo.CommitObject(checkpointHash) + require.NoError(t, err) + changed, err := commitChangedPath(checkpointCommit, metadataPath) + require.NoError(t, err) + require.True(t, changed) + + sideAuthor := object.Signature{ + Name: "Side Author", + Email: "side@example.com", + When: checkpointAuthor.When.Add(47 * time.Hour), + } + sideHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: emptyTree, + ParentHashes: []plumbing.Hash{baseHash}, + Author: sideAuthor, + Committer: sideAuthor, + Message: "side commit without metadata", + }) + mergeAuthor := object.Signature{ + Name: "Merge Author", + Email: "merge@example.com", + When: checkpointAuthor.When.Add(24 * time.Hour), + } + mergeHash := writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: metadataTree, + ParentHashes: []plumbing.Hash{checkpointHash, sideHash}, + Author: mergeAuthor, + Committer: mergeAuthor, + Message: "Merge remote v2/main", + }) + laterAuthor := object.Signature{ + Name: "Later Checkpoint Author", + Email: "later@example.com", + When: checkpointAuthor.When.Add(48 * time.Hour), + } + laterBlob, err := checkpoint.CreateBlobFromContent(repo, []byte(`{"session_id":"later"}`+"\n")) + require.NoError(t, err) + laterEntries := map[string]object.TreeEntry{ + metadataPath: metadataEntries[metadataPath], + "68/0da8552908/0/metadata.json": { + Name: "68/0da8552908/0/metadata.json", + Mode: filemode.Regular, + Hash: laterBlob, + }, + } + laterTree, err := checkpoint.BuildTreeFromEntries(context.Background(), repo, laterEntries) + require.NoError(t, err) + laterHash := 
writeTestCommitObject(t, repo, &object.Commit{ + TreeHash: laterTree, + ParentHashes: []plumbing.Hash{mergeHash}, + Author: laterAuthor, + Committer: laterAuthor, + Message: "Checkpoint: 680da8552908", + }) + require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), laterHash))) + laterCommit, err := repo.CommitObject(laterHash) + require.NoError(t, err) + changed, err = commitChangedPath(laterCommit, metadataPath) + require.NoError(t, err) + require.False(t, changed) + + author, err := findV2SessionAuthor(context.Background(), repo, cpID, 0) + + require.NoError(t, err) + require.Equal(t, checkpointAuthor.Name, author.Name) + require.Equal(t, checkpointAuthor.Email, author.Email) + require.True(t, author.When.Equal(checkpointAuthor.When), "author time = %s, want %s", author.When, checkpointAuthor.When) +} + func TestFindV2SessionAuthorReturnsNotFoundWhenOnlyMergeTouchedPath(t *testing.T) { t.Parallel() From 8616ee20128820924f618695cfb2f0379d13d134 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Fri, 29 May 2026 11:26:20 -0700 Subject: [PATCH 34/35] Disable signing for migration tool runs Entire-Checkpoint: c58d83f4bb9a --- cmd/migrate-v2-checkpoints/VALIDATION.md | 7 ++- cmd/migrate-v2-checkpoints/main.go | 2 + cmd/migrate-v2-checkpoints/main_test.go | 75 ++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 3 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 56804f2be..48df16b52 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -628,11 +628,12 @@ git -C "$REPO" log --format='%h %ci %s' \ Later commits that merely carry that path through their tree do not count. §5.6 treats the `author` header in `entire explain` as a required check; a mismatch is a regression, not an accepted divergence. 
-- **Migration commits are unsigned.** The tool disables checkpoint commit - signing for migrated writes, even if normal checkpoint signing is enabled +- **Migration-run commits are unsigned.** The tool disables checkpoint commit + signing for the whole migration run, including the v1/v2 preflight ref + refresh and the migrated writes, even if normal checkpoint signing is enabled in the repo. The v1 author line is replayed from v2 history; adding a local operator signature to that replayed author would be misleading. Any signed - migrated v1 commit is a bug. + commit created by this tool is a bug. - **Roll back** by resetting v1 back to `$PRE_APPLY_TIP`: ```sh diff --git a/cmd/migrate-v2-checkpoints/main.go b/cmd/migrate-v2-checkpoints/main.go index a9e6889d3..ac246a23c 100644 --- a/cmd/migrate-v2-checkpoints/main.go +++ b/cmd/migrate-v2-checkpoints/main.go @@ -7,6 +7,7 @@ import ( "io" "os" + "github.com/entireio/cli/cmd/entire/cli/checkpoint" "github.com/entireio/cli/cmd/entire/cli/gitrepo" "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/cmd/entire/cli/settings" @@ -54,6 +55,7 @@ func run(ctx context.Context, args []string, stdout io.Writer) error { return err } ctx = settings.WithWorktreeRoot(ctx, repoRoot) + ctx = checkpoint.WithCommitSigningDisabled(ctx) if shouldEnsureCheckpointRefs(opts) { if err := ensureLatestV1Ref(ctx, repoRoot, repo); err != nil { diff --git a/cmd/migrate-v2-checkpoints/main_test.go b/cmd/migrate-v2-checkpoints/main_test.go index 98cd723c8..b4c272566 100644 --- a/cmd/migrate-v2-checkpoints/main_test.go +++ b/cmd/migrate-v2-checkpoints/main_test.go @@ -355,6 +355,58 @@ func TestRunApplySeedsLocalV1FromRemoteBeforeMigrating(t *testing.T) { require.Len(t, summary.Sessions, 1) } +func TestRunApplyDisablesSigningDuringPreflightRefReplay(t *testing.T) { //nolint:paralleltest // mutates git config env + markerPath := configureFailingCheckpointSigner(t) + fixture := setupMigrationHistoryRepo(t) + cpID := 
id.MustCheckpointID(mainCheckpointID) + writeTestV2Checkpoint(t, fixture.repo, testV2CheckpointOptions{ + CheckpointID: cpID, + SessionID: "session-signing-disabled", + Transcript: redact.AlreadyRedacted([]byte("{\"message\":\"signing disabled\"}\n")), + }) + cloneDir := cloneMigrationRepoWithOrigin(t, fixture) + cloneRepo, err := git.PlainOpen(cloneDir) + require.NoError(t, err) + + localV1RefName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) + localOnlyBlob, err := checkpoint.CreateBlobFromContent(cloneRepo, []byte("local-only\n")) + require.NoError(t, err) + writeTestV2RefEntriesWithAuthor(t, cloneRepo, localV1RefName, plumbing.ZeroHash, map[string]object.TreeEntry{ + "local-only.txt": { + Name: "local-only.txt", + Mode: 0o100644, + Hash: localOnlyBlob, + }, + }, "local v1 commit", object.Signature{ + Name: testAuthorName, + Email: testAuthorEmail, + When: time.Date(2024, 8, 9, 10, 11, 12, 0, time.UTC), + }) + + var stdout bytes.Buffer + err = run(context.Background(), []string{ + "--repo", cloneDir, + "--since", fixture.baseHash.String(), + "--head", fixture.mainHash.String(), + "--apply", + }, &stdout) + require.NoError(t, err) + require.Contains(t, stdout.String(), "migrated sessions: 1") + require.NoFileExists(t, markerPath) + + cloneRepo, err = git.PlainOpen(cloneDir) + require.NoError(t, err) + ref, err := cloneRepo.Reference(localV1RefName, true) + require.NoError(t, err) + migrationCommit, err := cloneRepo.CommitObject(ref.Hash()) + require.NoError(t, err) + require.Empty(t, migrationCommit.Signature) + require.NotEmpty(t, migrationCommit.ParentHashes) + replayedCommit, err := cloneRepo.CommitObject(migrationCommit.ParentHashes[0]) + require.NoError(t, err) + require.Empty(t, replayedCommit.Signature) +} + func TestRunApplyMigratesV2CheckpointToV1(t *testing.T) { t.Parallel() @@ -1020,6 +1072,29 @@ func runMigrationGit(t *testing.T, dir string, args ...string) { require.NoError(t, err, "git %s failed: %s", strings.Join(args, " "), output) 
} +func configureFailingCheckpointSigner(t *testing.T) string { + t.Helper() + + testutil.IsolateGitConfigEnv(t) + dir := t.TempDir() + markerPath := filepath.Join(dir, "signer-called") + signerPath := filepath.Join(dir, "fake-gpg") + script := fmt.Sprintf("#!/bin/sh\nprintf called > %q\nexit 1\n", markerPath) + require.NoError(t, os.WriteFile(signerPath, []byte(script), 0o755)) + + globalConfig := fmt.Sprintf(`[commit] + gpgsign = true +[user] + signingkey = TESTKEY +[gpg] + program = %s +`, signerPath) + globalConfigPath := filepath.Join(dir, "gitconfig") + require.NoError(t, os.WriteFile(globalConfigPath, []byte(globalConfig), 0o644)) + t.Setenv("GIT_CONFIG_GLOBAL", globalConfigPath) + return markerPath +} + func runMigrationCommand(t *testing.T, fixture migrationHistoryFixture, head plumbing.Hash, mode string) string { t.Helper() From d6e4e40bf1d1d14f25edc05a5fc3a7ec2de41ef3 Mon Sep 17 00:00:00 2001 From: Sven Pfleiderer Date: Fri, 29 May 2026 13:04:33 -0700 Subject: [PATCH 35/35] Index v2 session authors during migration Entire-Checkpoint: 77778444950d --- cmd/migrate-v2-checkpoints/VALIDATION.md | 5 +- cmd/migrate-v2-checkpoints/migration.go | 28 ++-- cmd/migrate-v2-checkpoints/v2_author.go | 127 ++++++++++++++----- cmd/migrate-v2-checkpoints/v2_author_test.go | 10 -- 4 files changed, 115 insertions(+), 55 deletions(-) diff --git a/cmd/migrate-v2-checkpoints/VALIDATION.md b/cmd/migrate-v2-checkpoints/VALIDATION.md index 48df16b52..1994c8a39 100644 --- a/cmd/migrate-v2-checkpoints/VALIDATION.md +++ b/cmd/migrate-v2-checkpoints/VALIDATION.md @@ -1319,8 +1319,9 @@ refs as shown in §3.3 and §3.5. `writeMigrationReport` (line 252), `candidateCommitLabel` (line 291, emits `(orphan)`). - v2 session author lookup: `cmd/migrate-v2-checkpoints/v2_author.go` — - `findV2SessionAuthor` (line 19), `commitChangedPath` (line 66), - `v2SessionMetadataPath` (line 95). 
+ `buildV2SessionAuthorIndex` (line 30), + `changedV2SessionMetadataPaths` (line 79), + `v2SessionMetadataPath` (line 152). - v1 write: `cmd/entire/cli/checkpoint/committed.go` — `WriteCommitted` (line 72), `WithCommitSigningDisabled` (line 57), `writeStandardCheckpointEntries` (line 324), diff --git a/cmd/migrate-v2-checkpoints/migration.go b/cmd/migrate-v2-checkpoints/migration.go index 5a777ae40..f3d696573 100644 --- a/cmd/migrate-v2-checkpoints/migration.go +++ b/cmd/migrate-v2-checkpoints/migration.go @@ -43,11 +43,12 @@ type migrationCandidate struct { } type checkpointMigrator struct { - repo *git.Repository - v1Store *checkpoint.GitStore - v2Store *checkpoint.V2GitStore - opts migrationOptions - report *migrationReport + repo *git.Repository + v1Store *checkpoint.GitStore + v2Store *checkpoint.V2GitStore + authorIndex *v2SessionAuthorIndex + opts migrationOptions + report *migrationReport } func migrateDiscoveredCheckpoints(ctx context.Context, repo *git.Repository, discovered []discoveredCheckpoint, opts migrationOptions) (migrationReport, error) { @@ -93,7 +94,7 @@ func migrationCandidateFromDiscovered(discovered discoveredCheckpoint, sessionCo } } -func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered discoveredCheckpoint) (int, error) { +func (m *checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered discoveredCheckpoint) (int, error) { existing, err := m.v1Store.ReadCommitted(ctx, discovered.ID) if err != nil { return 0, fmt.Errorf("read v1 checkpoint %s: %w", discovered.ID, err) @@ -156,7 +157,7 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di combinedAttribution = nil } for _, entry := range eligible { - author, err := findV2SessionAuthor(ctx, m.repo, discovered.ID, entry.sessionIndex) + author, err := m.findV2SessionAuthor(ctx, discovered.ID, entry.sessionIndex) if err != nil { return eligibleSessions, fmt.Errorf("resolve v2 checkpoint %s session %d author: %w", discovered.ID, 
entry.sessionIndex, err) } @@ -171,12 +172,23 @@ func (m checkpointMigrator) migrateCheckpoint(ctx context.Context, discovered di return eligibleSessions, nil } +func (m *checkpointMigrator) findV2SessionAuthor(ctx context.Context, cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + if m.authorIndex == nil { + authorIndex, err := buildV2SessionAuthorIndex(ctx, m.repo) + if err != nil { + return object.Signature{}, err + } + m.authorIndex = authorIndex + } + return m.authorIndex.find(cpID, sessionIndex) +} + type eligibleV2Session struct { sessionIndex int content *checkpoint.SessionContent } -func (m checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { +func (m *checkpointMigrator) existingV1SessionIDs(ctx context.Context, discovered discoveredCheckpoint, summary *checkpoint.CheckpointSummary) (map[string]struct{}, error) { existing := make(map[string]struct{}) if summary == nil { return existing, nil diff --git a/cmd/migrate-v2-checkpoints/v2_author.go b/cmd/migrate-v2-checkpoints/v2_author.go index dac5b1d82..8fe3d43d5 100644 --- a/cmd/migrate-v2-checkpoints/v2_author.go +++ b/cmd/migrate-v2-checkpoints/v2_author.go @@ -2,9 +2,9 @@ package main import ( "context" - "errors" "fmt" "strconv" + "strings" checkpointID "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" "github.com/entireio/cli/cmd/entire/cli/paths" @@ -12,31 +12,37 @@ import ( "github.com/go-git/go-git/v6" "github.com/go-git/go-git/v6/plumbing" "github.com/go-git/go-git/v6/plumbing/object" + "github.com/go-git/go-git/v6/utils/merkletrie" ) -var errFoundV2SessionAuthor = errors.New("found v2 session author") +type v2SessionAuthorIndex struct { + authors map[string]object.Signature +} func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { - if err := ctx.Err(); err != nil 
{ - return object.Signature{}, err //nolint:wrapcheck // Propagating context cancellation + index, err := buildV2SessionAuthorIndex(ctx, repo) + if err != nil { + return object.Signature{}, err } + return index.find(cpID, sessionIndex) +} +func buildV2SessionAuthorIndex(ctx context.Context, repo *git.Repository) (*v2SessionAuthorIndex, error) { ref, err := repo.Reference(plumbing.ReferenceName(paths.V2MainRefName), true) if err != nil { - return object.Signature{}, fmt.Errorf("resolve %s: %w", paths.V2MainRefName, err) + return nil, fmt.Errorf("resolve %s: %w", paths.V2MainRefName, err) } - metadataPath := v2SessionMetadataPath(cpID, sessionIndex) iter, err := repo.Log(&git.LogOptions{ From: ref.Hash(), Order: git.LogOrderCommitterTime, }) if err != nil { - return object.Signature{}, fmt.Errorf("read %s history: %w", paths.V2MainRefName, err) + return nil, fmt.Errorf("read %s history: %w", paths.V2MainRefName, err) } defer iter.Close() - var author object.Signature + index := &v2SessionAuthorIndex{authors: make(map[string]object.Signature)} err = iter.ForEach(func(commit *object.Commit) error { if err := ctx.Err(); err != nil { return err //nolint:wrapcheck // Propagating context cancellation @@ -44,52 +50,103 @@ func findV2SessionAuthor(ctx context.Context, repo *git.Repository, cpID checkpo if commit.NumParents() > 1 { return nil } - changed, err := commitChangedPath(commit, metadataPath) + paths, err := changedV2SessionMetadataPaths(ctx, commit) if err != nil { - return fmt.Errorf("check %s change in %s: %w", metadataPath, commit.Hash, err) + return fmt.Errorf("read changed v2 session metadata paths in %s: %w", commit.Hash, err) } - if !changed { - return nil + for _, path := range paths { + if _, exists := index.authors[path]; !exists { + index.authors[path] = commit.Author + } } - author = commit.Author - return errFoundV2SessionAuthor + return nil }) - if errors.Is(err, errFoundV2SessionAuthor) { - return author, nil - } if err != nil { - return 
object.Signature{}, fmt.Errorf("walk %s history for %s: %w", paths.V2MainRefName, metadataPath, err) + return nil, fmt.Errorf("walk %s history: %w", paths.V2MainRefName, err) } - return object.Signature{}, fmt.Errorf("%s not found in %s history", metadataPath, paths.V2MainRefName) + return index, nil } -func commitChangedPath(commit *object.Commit, path string) (bool, error) { - file, err := commit.File(path) - if errors.Is(err, object.ErrFileNotFound) { - return false, nil +func (index *v2SessionAuthorIndex) find(cpID checkpointID.CheckpointID, sessionIndex int) (object.Signature, error) { + metadataPath := v2SessionMetadataPath(cpID, sessionIndex) + author, ok := index.authors[metadataPath] + if !ok { + return object.Signature{}, fmt.Errorf("%s not found in %s history", metadataPath, paths.V2MainRefName) } + return author, nil +} + +func changedV2SessionMetadataPaths(ctx context.Context, commit *object.Commit) ([]string, error) { + commitTree, err := commit.Tree() if err != nil { - return false, fmt.Errorf("read file: %w", err) + return nil, fmt.Errorf("read commit tree: %w", err) } - if commit.NumParents() == 0 { - return true, nil + + var parentTree *object.Tree + if commit.NumParents() > 0 { + parent, err := commit.Parent(0) + if err != nil { + return nil, fmt.Errorf("read parent: %w", err) + } + parentTree, err = parent.Tree() + if err != nil { + return nil, fmt.Errorf("read parent tree: %w", err) + } } - if commit.NumParents() > 1 { - return false, nil + + changes, err := object.DiffTreeContext(ctx, parentTree, commitTree) + if err != nil { + return nil, fmt.Errorf("diff commit tree: %w", err) } - parent, err := commit.Parent(0) + var paths []string + for _, change := range changes { + path, ok, err := v2SessionMetadataPathFromChange(change) + if err != nil { + return nil, err + } + if ok { + paths = append(paths, path) + } + } + return paths, nil +} + +func v2SessionMetadataPathFromChange(change *object.Change) (string, bool, error) { + action, err := 
change.Action() if err != nil { - return false, fmt.Errorf("read parent: %w", err) + return "", false, fmt.Errorf("read change action: %w", err) + } + if action != merkletrie.Insert && action != merkletrie.Modify { + return "", false, nil + } + if !isV2SessionMetadataPath(change.To.Name) { + return "", false, nil + } + return change.To.Name, true, nil +} + +func isV2SessionMetadataPath(path string) bool { + shard, rest, ok := strings.Cut(path, "/") + if !ok || len(shard) != 2 { + return false + } + suffix, rest, ok := strings.Cut(rest, "/") + if !ok || len(suffix) != 10 { + return false + } + if _, err := checkpointID.NewCheckpointID(shard + suffix); err != nil { + return false } - parentFile, err := parent.File(path) - if errors.Is(err, object.ErrFileNotFound) { - return true, nil + sessionDir, fileName, ok := strings.Cut(rest, "/") + if !ok || fileName != paths.MetadataFileName { + return false } + sessionIndex, err := strconv.Atoi(sessionDir) if err != nil { - return false, fmt.Errorf("read parent file: %w", err) + return false } - return file.Hash != parentFile.Hash || file.Mode != parentFile.Mode, nil + return sessionIndex >= 0 } func v2SessionMetadataPath(cpID checkpointID.CheckpointID, sessionIndex int) string { diff --git a/cmd/migrate-v2-checkpoints/v2_author_test.go b/cmd/migrate-v2-checkpoints/v2_author_test.go index a9c79e3b0..70cb5a573 100644 --- a/cmd/migrate-v2-checkpoints/v2_author_test.go +++ b/cmd/migrate-v2-checkpoints/v2_author_test.go @@ -100,11 +100,6 @@ func TestFindV2SessionAuthorSkipsLaterCheckpointCommitsThatOnlyCarryPath(t *test Committer: checkpointAuthor, Message: "Checkpoint: 0b0206eed178", }) - checkpointCommit, err := repo.CommitObject(checkpointHash) - require.NoError(t, err) - changed, err := commitChangedPath(checkpointCommit, metadataPath) - require.NoError(t, err) - require.True(t, changed) sideAuthor := object.Signature{ Name: "Side Author", @@ -155,11 +150,6 @@ func 
TestFindV2SessionAuthorSkipsLaterCheckpointCommitsThatOnlyCarryPath(t *test Message: "Checkpoint: 680da8552908", }) require.NoError(t, repo.Storer.SetReference(plumbing.NewHashReference(plumbing.ReferenceName(paths.V2MainRefName), laterHash))) - laterCommit, err := repo.CommitObject(laterHash) - require.NoError(t, err) - changed, err = commitChangedPath(laterCommit, metadataPath) - require.NoError(t, err) - require.False(t, changed) author, err := findV2SessionAuthor(context.Background(), repo, cpID, 0)