From 2abdba744dd093731ea39a55d7fe080c881c0c75 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:13:36 +0200 Subject: [PATCH 1/6] feat(agents): add agent-autofinish-watch.sh stalled-lane watcher The SessionStart shim scripts/agent-stalled-report.sh has always wrapped scripts/agent-autofinish-watch.sh, but that watcher was never authored, so the hook soft-exited 0 and merged-PR worktrees (the 'retained for now' path in agent-branch-finish.sh) were never reaped. Write it as a thin orchestrator over the existing 'gx worktree prune' primitive: scans agent/* worktrees, reports stalled lanes (work present, no open PR, past idle gate) and merged-but-retained lanes, and under --auto-merge reaps merged lanes. Resolves the primary checkout via the git common dir so it works from inside any worktree. Healthy in-flight lanes (open PR / live process) stay silent. --- scripts/agent-autofinish-watch.sh | 272 ++++++++++++++++++++++++++++ test/agent-autofinish-watch.test.js | 117 ++++++++++++ 2 files changed, 389 insertions(+) create mode 100755 scripts/agent-autofinish-watch.sh create mode 100644 test/agent-autofinish-watch.test.js diff --git a/scripts/agent-autofinish-watch.sh b/scripts/agent-autofinish-watch.sh new file mode 100755 index 0000000..c9f42b3 --- /dev/null +++ b/scripts/agent-autofinish-watch.sh @@ -0,0 +1,272 @@ +#!/usr/bin/env bash +# Detect stalled agent/* worktrees and (optionally) reap lanes whose PR already +# merged but whose worktree was retained on disk. +# +# This is the watcher that scripts/agent-stalled-report.sh (the SessionStart +# hook) expects. Without it, that shim soft-exits 0 and merged-PR worktrees are +# never cleaned up (the "retained for now" path in agent-branch-finish.sh). +# +# It does NOT reinvent cleanup: reaping delegates to `gx worktree prune` +# (scripts/agent-worktree-prune.sh), the existing, tested primitive. 
+# +# Per-lane status lines use the prefix the report shim greps: +# [agent-autofinish-watch] agent/NAME: STATUS +# A line is emitted ONLY for actionable lanes (merged-but-retained, or stalled +# with no open PR after the idle gate). Healthy in-flight lanes stay silent. +# +# Exit codes: 0 for every scan outcome (informational); reaping failures warn but do not fail. An unknown argument exits 1. + +set -euo pipefail + +MODE="once" # once | daemon +DRY_RUN=0 +AUTO_MERGE=0 +INTERVAL=300 +IDLE_MINUTES="${GUARDEX_AUTOFINISH_IDLE_MINUTES:-60}" +BASE_BRANCH="${GUARDEX_BASE_BRANCH:-}" +GH_BIN="${GUARDEX_GH_BIN:-gh}" +NOW_EPOCH_OVERRIDE="${GUARDEX_AUTOFINISH_NOW_EPOCH:-}" + +WORKTREE_ROOT_RELS=( + ".omx/agent-worktrees" + ".omx/.tmp-worktrees" + ".omc/agent-worktrees" + ".omc/.tmp-worktrees" +) +LOCK_FILE_REL=".omx/state/agent-file-locks.json" + +while [[ $# -gt 0 ]]; do + case "$1" in + --once) MODE="once"; shift ;; + --daemon) MODE="daemon"; shift ;; + --dry-run) DRY_RUN=1; shift ;; + --auto-merge) AUTO_MERGE=1; shift ;; + --interval) INTERVAL="${2:-300}"; shift 2 ;; + --idle-minutes) IDLE_MINUTES="${2:-60}"; shift 2 ;; + --base) BASE_BRANCH="${2:-}"; shift 2 ;; + -h|--help) + echo "Usage: $0 [--once|--daemon] [--dry-run] [--auto-merge] [--interval SEC] [--idle-minutes MIN] [--base BRANCH]" + exit 0 + ;; + *) + echo "[agent-autofinish-watch] Unknown argument: $1" >&2 + exit 1 + ;; + esac +done + +if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then + echo "[agent-autofinish-watch] Not inside a git repository." >&2 + exit 0 +fi + +# Resolve the PRIMARY checkout root, not the current worktree: the managed +# worktree roots (.omc/agent-worktrees, ...) live under the primary checkout, +# and refs/reflogs are shared via the common git dir. Running from inside an +# agent worktree must still see every sibling lane. 
+git_common_dir="$(git rev-parse --git-common-dir 2>/dev/null)" +case "$git_common_dir" in + /*) ;; + *) git_common_dir="$(git rev-parse --show-toplevel)/${git_common_dir}" ;; +esac +repo_root="$(cd "$(dirname "$git_common_dir")" && pwd)" + +resolve_base_branch() { + [[ -n "$BASE_BRANCH" ]] && return 0 + local head_ref + head_ref="$(git -C "$repo_root" symbolic-ref --quiet --short refs/remotes/origin/HEAD 2>/dev/null || true)" + if [[ -n "$head_ref" ]]; then + BASE_BRANCH="${head_ref#origin/}" + return 0 + fi + for cand in main master dev; do + if git -C "$repo_root" show-ref --verify --quiet "refs/heads/${cand}"; then + BASE_BRANCH="$cand" + return 0 + fi + done + BASE_BRANCH="main" +} + +is_managed_worktree_path() { + local entry="$1" rel + for rel in "${WORKTREE_ROOT_RELS[@]}"; do + [[ "$entry" == "${repo_root}/${rel}"/* ]] && return 0 + done + return 1 +} + +is_temporary_worktree_path() { + local name + name="$(basename "$1")" + [[ "$name" == __agent_integrate-* || "$name" == __source-probe-* ]] +} + +now_epoch() { + if [[ -n "$NOW_EPOCH_OVERRIDE" ]]; then + printf '%s' "$NOW_EPOCH_OVERRIDE" + else + date +%s + fi +} + +has_live_process_in_worktree() { + local wt="$1" proc_cwd live_cwd + [[ -d /proc ]] || return 1 + for proc_cwd in /proc/[0-9]*/cwd; do + [[ -e "$proc_cwd" ]] || continue + live_cwd="$(readlink "$proc_cwd" 2>/dev/null || true)" + [[ -n "$live_cwd" ]] || continue + live_cwd="${live_cwd% (deleted)}" + if [[ "$live_cwd" == "$wt" || "$live_cwd" == "${wt}"/* ]]; then + return 0 + fi + done + return 1 +} + +branch_idle_minutes() { + local branch="$1" wt="$2" activity_epoch="" lock_mtime now + activity_epoch="$(git -C "$repo_root" reflog show --format='%ct' -n 1 "refs/heads/${branch}" 2>/dev/null | head -n1 | tr -d '[:space:]')" + if [[ -z "$activity_epoch" ]]; then + activity_epoch="$(git -C "$repo_root" log -1 --format='%ct' "$branch" 2>/dev/null | head -n1 | tr -d '[:space:]')" + fi + if [[ -n "$wt" && -f "${wt}/${LOCK_FILE_REL}" ]]; then + 
lock_mtime="$(stat -c %Y "${wt}/${LOCK_FILE_REL}" 2>/dev/null || stat -f %m "${wt}/${LOCK_FILE_REL}" 2>/dev/null || true)" + if [[ "$lock_mtime" =~ ^[0-9]+$ && ( -z "$activity_epoch" || "$lock_mtime" -gt "$activity_epoch" ) ]]; then + activity_epoch="$lock_mtime" + fi + fi + [[ "$activity_epoch" =~ ^[0-9]+$ ]] || { printf '%s' 999999; return; } + now="$(now_epoch)" + printf '%s' $(( (now - activity_epoch) / 60 )) +} + +# Count uncommitted changes, ignoring lock-file churn. +dirty_count() { + local wt="$1" + git -C "$wt" status --porcelain -- . ":(exclude)${LOCK_FILE_REL}" 2>/dev/null | grep -c . || true +} + +commits_ahead() { + local branch="$1" + git -C "$repo_root" rev-list --count "${BASE_BRANCH}..${branch}" 2>/dev/null || printf '0' +} + +# Prefer the gx CLI; fall back to the bundled prune script. +run_prune() { + if command -v gx >/dev/null 2>&1; then + gx worktree prune "$@" + else + bash "${repo_root}/scripts/agent-worktree-prune.sh" "$@" + fi +} + +declare -A MERGED_BRANCHES=() +declare -A OPEN_BRANCHES=() + +load_pr_state() { + command -v "$GH_BIN" >/dev/null 2>&1 || return 0 + local line + while IFS= read -r line; do + [[ -n "$line" ]] && MERGED_BRANCHES["$line"]=1 + done < <("$GH_BIN" pr list --state merged --base "$BASE_BRANCH" --limit 200 --json headRefName --jq '.[].headRefName' 2>/dev/null || true) + while IFS= read -r line; do + [[ -n "$line" ]] && OPEN_BRANCHES["$line"]=1 + done < <("$GH_BIN" pr list --state open --base "$BASE_BRANCH" --limit 200 --json headRefName --jq '.[].headRefName' 2>/dev/null || true) +} + +run_once() { + resolve_base_branch + MERGED_BRANCHES=() + OPEN_BRANCHES=() + load_pr_state + + local scanned=0 stalled=0 merged=0 + local cur_wt="" cur_branch="" + + while IFS= read -r line; do + if [[ "$line" == worktree\ * ]]; then + cur_wt="${line#worktree }" + cur_branch="" + elif [[ "$line" == branch\ refs/heads/* ]]; then + cur_branch="${line#branch refs/heads/}" + elif [[ -z "$line" ]]; then + process_lane "$cur_wt" "$cur_branch" 
+ cur_wt=""; cur_branch="" + fi + done < <(git -C "$repo_root" worktree list --porcelain; printf '\n') + + # Reap merged-but-retained lanes before the summary so reaped= is accurate. + if [[ "$merged" -gt 0 ]]; then + reap_merged + fi + + printf '[agent-autofinish-watch] scanned=%s stalled=%s merged=%s reaped=%s\n' \ + "$scanned" "$stalled" "$merged" "$reaped" +} + +# process_lane mutates scanned/stalled/merged/reaped/merged_lanes in the caller +# scope (bash dynamic scope via run_once locals). +process_lane() { + local wt="$1" branch="$2" + [[ -n "$wt" && -n "$branch" ]] || return 0 + [[ "$branch" == agent/* ]] || return 0 + is_managed_worktree_path "$wt" || return 0 + is_temporary_worktree_path "$wt" && return 0 + scanned=$((scanned + 1)) + + if [[ -n "${MERGED_BRANCHES[$branch]:-}" && -d "$wt" ]]; then + merged=$((merged + 1)) + echo "[agent-autofinish-watch] ${branch}: merged PR, worktree retained -> prunable" + return 0 + fi + + # Open PR or live process => healthy in-flight, stay silent. 
+ [[ -n "${OPEN_BRANCHES[$branch]:-}" ]] && return 0 + has_live_process_in_worktree "$wt" && return 0 + + local idle dirty ahead + idle="$(branch_idle_minutes "$branch" "$wt")" + [[ "$idle" -ge "$IDLE_MINUTES" ]] || return 0 + + dirty="$(dirty_count "$wt")" + if [[ "$dirty" -gt 0 ]]; then + stalled=$((stalled + 1)) + echo "[agent-autofinish-watch] ${branch}: ${dirty} uncommitted change(s), idle ${idle}m -> needs commit + finish" + return 0 + fi + + ahead="$(commits_ahead "$branch")" + if [[ "$ahead" -gt 0 ]]; then + stalled=$((stalled + 1)) + echo "[agent-autofinish-watch] ${branch}: ${ahead} commit(s) ahead of ${BASE_BRANCH}, no PR, idle ${idle}m -> needs finish" + fi +} + +reaped=0 + +reap_merged() { + [[ "$AUTO_MERGE" -eq 1 ]] || return 0 + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "[agent-autofinish-watch] [dry-run] would prune merged lanes: gx worktree prune --include-pr-merged --delete-branches --base ${BASE_BRANCH}" + return 0 + fi + local out="" + out="$(run_prune --include-pr-merged --delete-branches --base "$BASE_BRANCH" 2>&1 || true)" + printf '%s\n' "$out" + local removed + removed="$(printf '%s\n' "$out" | sed -n 's/.*removed_worktrees=\([0-9]*\).*/\1/p' | head -n1)" + [[ "$removed" =~ ^[0-9]+$ ]] && reaped="$removed" +} + +if [[ "$MODE" == "daemon" ]]; then + while true; do + reaped=0 + run_once + sleep "$INTERVAL" + done +else + reaped=0 + run_once +fi diff --git a/test/agent-autofinish-watch.test.js b/test/agent-autofinish-watch.test.js new file mode 100644 index 0000000..5180725 --- /dev/null +++ b/test/agent-autofinish-watch.test.js @@ -0,0 +1,117 @@ +// The autofinish watcher (scripts/agent-autofinish-watch.sh) is what the +// SessionStart shim (scripts/agent-stalled-report.sh) expects. It must: +// - flag agent/* worktrees that are stalled (work present, no open PR), and +// - report merged-but-retained lanes as prunable (the post-merge cleanup gap). +// Healthy in-flight lanes (open PR) stay silent so the shim shows nothing. 
+ +const { + test, + assert, + fs, + os, + path, + cp, + initRepo, + seedCommit, + runHumanCmd, + createFakeGhScript, + defineSpawnSuite, +} = require('./helpers/install-test-helpers'); + +const WATCHER = path.resolve(__dirname, '..', 'scripts', 'agent-autofinish-watch.sh'); + +// gh stub: report `mergedBranch` for `pr list --state merged`, nothing for open. +function fakeGh(mergedBranch = '') { + const body = [ + 'state=""', + 'for a in "$@"; do', + ' case "$prev" in --state) state="$a";; esac', + ' prev="$a"', + 'done', + `if [[ "$state" == "merged" && -n "${mergedBranch}" ]]; then echo "${mergedBranch}"; fi`, + 'exit 0', + ].join('\n'); + return createFakeGhScript(body).fakePath; +} + +function makeLane(repoDir, branch, { commitAhead = false, dirty = false } = {}) { + const wt = path.join(repoDir, '.omc', 'agent-worktrees', branch.replace(/\//g, '__')); + const add = runHumanCmd('git', ['worktree', 'add', '-b', branch, wt, 'main'], repoDir); + assert.equal(add.status, 0, add.stderr || add.stdout); + if (commitAhead) { + fs.writeFileSync(path.join(wt, 'work.txt'), 'change\n'); + assert.equal(runHumanCmd('git', ['add', '-A'], wt).status, 0); + assert.equal(runHumanCmd('git', ['commit', '-m', 'lane work'], wt).status, 0); + } + if (dirty) { + fs.writeFileSync(path.join(wt, 'dirty.txt'), 'uncommitted\n'); + } + return wt; +} + +function runWatcher(repoDir, ghBin, extraArgs = []) { + return cp.spawnSync( + 'bash', + [WATCHER, '--once', '--idle-minutes', '0', ...extraArgs], + { cwd: repoDir, encoding: 'utf8', env: { ...process.env, GUARDEX_GH_BIN: ghBin, GUARDEX_BASE_BRANCH: 'main' } }, + ); +} + +defineSpawnSuite('agent-autofinish-watch', () => { + test('flags a stalled lane (commit ahead, no PR) for finish', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + makeLane(repoDir, 'agent/test/stalled', { commitAhead: true }); + + const res = runWatcher(repoDir, fakeGh()); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, 
/agent\/test\/stalled: 1 commit\(s\) ahead of main, no PR/); + assert.match(res.stdout, /scanned=1 stalled=1 merged=0/); + }); + + test('flags an uncommitted lane as needing commit + finish', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + makeLane(repoDir, 'agent/test/dirty', { dirty: true }); + + const res = runWatcher(repoDir, fakeGh()); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /agent\/test\/dirty: 1 uncommitted change\(s\).*needs commit \+ finish/); + assert.match(res.stdout, /stalled=1/); + }); + + test('reports a merged-but-retained lane as prunable', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + makeLane(repoDir, 'agent/test/merged', { commitAhead: true }); + + const res = runWatcher(repoDir, fakeGh('agent/test/merged')); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /agent\/test\/merged: merged PR, worktree retained -> prunable/); + assert.match(res.stdout, /merged=1/); + // Without --auto-merge the worktree is reported, never removed. 
+ assert.ok(fs.existsSync(path.join(repoDir, '.omc', 'agent-worktrees', 'agent__test__merged'))); + }); + + test('--auto-merge --dry-run announces the prune without removing the lane', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + const wt = makeLane(repoDir, 'agent/test/merged', { commitAhead: true }); + + const res = runWatcher(repoDir, fakeGh('agent/test/merged'), ['--auto-merge', '--dry-run']); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /\[dry-run\] would prune merged lanes/); + assert.ok(fs.existsSync(wt), 'dry-run must not remove the worktree'); + }); + + test('healthy lane with no work and no PR is silent', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + makeLane(repoDir, 'agent/test/idle'); // no commit ahead, no dirt + + const res = runWatcher(repoDir, fakeGh()); + assert.equal(res.status, 0, res.stderr); + assert.doesNotMatch(res.stdout, /agent\/test\/idle:/); + assert.match(res.stdout, /scanned=1 stalled=0 merged=0/); + }); +}); From 1cb8defbae2c27ba0c425e909a771b0a1657b94c Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:20:23 +0200 Subject: [PATCH 2/6] feat(locks): add 'gx locks reap' for stale locks + claim staleness hint A crashed or abandoned agent worktree keeps holding its file locks forever: claimed_at was recorded but never acted on, and (unlike a pruned worktree, whose lock file vanishes with it) a lingering-but-idle lane stays on disk so its locks keep blocking other agents. Add a 'reap' subcommand that clears locks from worktrees idle past a TTL (default 7d / GUARDEX_LOCK_TTL_HOURS / --ttl-hours) that have no live process inside them. The caller's own worktree is always live, so reap never clears active locks; --dry-run reports without removing. A blocked 'claim' against a past-TTL lock now prints a hint pointing at 'gx locks reap'. 
now_iso() routes through now_epoch() so GUARDEX_LOCK_NOW_EPOCH makes claim ages deterministic in tests. --- templates/scripts/agent-file-locks.py | 140 +++++++++++++++++++++++++- test/agent-file-locks-reap.test.js | 101 +++++++++++++++++++ 2 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 test/agent-file-locks-reap.test.js diff --git a/templates/scripts/agent-file-locks.py b/templates/scripts/agent-file-locks.py index b078210..0b7d477 100755 --- a/templates/scripts/agent-file-locks.py +++ b/templates/scripts/agent-file-locks.py @@ -48,6 +48,11 @@ 'scripts/guardex-env.sh', } ALLOW_GUARDRAIL_DELETE_ENV = 'AGENT_ALLOW_GUARDRAIL_DELETE' +LOCK_TTL_HOURS_ENV = 'GUARDEX_LOCK_TTL_HOURS' +LOCK_NOW_EPOCH_ENV = 'GUARDEX_LOCK_NOW_EPOCH' +# Generous default so an active long-running lane is never reaped; `reap` is an +# explicit, opt-in maintenance command, not an automatic background sweep. +DEFAULT_LOCK_TTL_HOURS = 168.0 # 7 days @dataclass @@ -174,7 +179,7 @@ def write_state(repo_root: Path, state: dict[str, Any]) -> None: def now_iso() -> str: - return datetime.now(timezone.utc).isoformat() + return datetime.fromtimestamp(now_epoch(), tz=timezone.utc).isoformat() def env_truthy(value: str | None) -> bool: @@ -183,6 +188,71 @@ def env_truthy(value: str | None) -> bool: return value.strip().lower() in {'1', 'true', 'yes', 'on'} +def now_epoch() -> float: + """Current unix time, overridable via GUARDEX_LOCK_NOW_EPOCH for tests.""" + override = os.environ.get(LOCK_NOW_EPOCH_ENV) + if override: + try: + return float(override) + except ValueError: + pass + return datetime.now(timezone.utc).timestamp() + + +def parse_iso_epoch(value: str) -> float | None: + """Parse a `claimed_at` ISO-8601 stamp to a unix epoch, or None if unset + or unparseable. 
Naive stamps are assumed UTC (claims are written in UTC).""" + if not value: + return None + try: + dt = datetime.fromisoformat(value) + except ValueError: + return None + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.timestamp() + + +def has_live_process_in_worktree(worktree: Path) -> bool: + """True when any running process has its cwd inside `worktree` (Linux /proc + only; best-effort no elsewhere). Guards reap from touching an active lane.""" + proc = Path('/proc') + if not proc.is_dir(): + return False + wt = str(worktree) + try: + entries = list(proc.iterdir()) + except OSError: + return False + for entry in entries: + if not entry.name.isdigit(): + continue + try: + target = os.readlink(entry / 'cwd') + except OSError: + continue + if target.endswith(' (deleted)'): + target = target[: -len(' (deleted)')] + if target == wt or target.startswith(wt + os.sep): + return True + return False + + +def resolve_ttl_hours(args: argparse.Namespace) -> float: + """Reap TTL in hours: --ttl-hours wins, then GUARDEX_LOCK_TTL_HOURS, then + the 7-day default.""" + explicit = getattr(args, 'ttl_hours', None) + if explicit is not None: + return float(explicit) + env = os.environ.get(LOCK_TTL_HOURS_ENV) + if env: + try: + return float(env) + except ValueError: + pass + return DEFAULT_LOCK_TTL_HOURS + + def staged_changes(repo_root: Path) -> list[tuple[str, str]]: out = run_git(['diff', '--cached', '--name-status', '--diff-filter=ACMRDTUXB'], cwd=repo_root) if not out: @@ -304,15 +374,27 @@ def cmd_claim(args: argparse.Namespace, repo_root: Path) -> int: # claim (each worktree keeps a separate lock file). The write below still # records the claim in THIS worktree's lock file. 
all_locks = load_all_locks(repo_root) + ttl_seconds = resolve_ttl_hours(args) * 3600.0 + now = now_epoch() + any_stale = False for file_path in files: foreign = [e for e in all_locks.get(file_path, []) if not owner_matches(e, args.branch, claim_agent)] if foreign: conflicts.append((file_path, owner_label(foreign[0]))) + claimed = parse_iso_epoch(str(foreign[0].get('claimed_at', ''))) + if claimed is not None and (now - claimed) >= ttl_seconds: + any_stale = True if conflicts: print('[agent-file-locks] Cannot claim files already locked by another owner:', file=sys.stderr) for file_path, owner in conflicts: print(f' - {file_path} (locked by {owner})', file=sys.stderr) + if any_stale: + print( + '[agent-file-locks] Some blocking locks are past the staleness TTL; if their ' + 'lane is abandoned, run `gx locks reap` to clear them.', + file=sys.stderr, + ) return 1 for file_path in files: @@ -394,6 +476,49 @@ def cmd_release(args: argparse.Namespace, repo_root: Path) -> int: return 0 +def cmd_reap(args: argparse.Namespace, repo_root: Path) -> int: + # Clear locks held by ABANDONED worktrees: present on disk, idle past the + # TTL, and with no live process inside. Dead worktrees self-clean (their lock + # file lives inside them), so this targets the lingering-but-idle case where + # a crashed or forgotten lane keeps blocking a file forever. The caller's own + # worktree always has a live process, so reap never clears its active locks. 
+ ttl_hours = resolve_ttl_hours(args) + ttl_seconds = ttl_hours * 3600.0 + now = now_epoch() + roots = list_worktree_roots(repo_root) + reaped: list[tuple[str, str, str, int]] = [] # worktree, file, branch, age_hours + for root in roots: + try: + state = load_state(root) + except LockError: + continue + locks = state['locks'] + if not locks: + continue + if has_live_process_in_worktree(root): + continue + survivors: dict[str, Any] = {} + changed = False + for file_path, entry in locks.items(): + claimed = parse_iso_epoch(str(entry.get('claimed_at', ''))) + if claimed is not None and (now - claimed) >= ttl_seconds: + reaped.append((str(root), file_path, str(entry.get('branch', '')), int((now - claimed) // 3600))) + changed = True + else: + survivors[file_path] = entry + if changed and not args.dry_run: + write_state(root, {**state, 'locks': survivors}) + + if not reaped: + print(f'[agent-file-locks] reap: no stale locks (ttl={int(ttl_hours)}h, scanned {len(roots)} worktree(s)).') + return 0 + label = '[agent-file-locks] [dry-run] would reap' if args.dry_run else '[agent-file-locks] reaped' + print(f'{label} {len(reaped)} stale lock(s) (ttl={int(ttl_hours)}h):') + for root, file_path, branch, age_hours in reaped: + print(f' - {file_path} | {branch} | idle {age_hours}h | {root}') + return 0 + + def cmd_status(args: argparse.Namespace, repo_root: Path) -> int: # Union across worktrees so status reflects what claim/validate now enforce — # a sibling worktree's claims would otherwise be invisible here. 
@@ -555,6 +680,15 @@ def build_parser() -> argparse.ArgumentParser: status.add_argument('--branch', help='Filter by branch') add_agent_arg(status) + reap = sub.add_parser('reap', help='Clear stale locks from abandoned (idle past TTL, no live process) worktrees') + reap.add_argument( + '--ttl-hours', + type=float, + default=None, + help=f'Idle hours before a lock is stale (default {int(DEFAULT_LOCK_TTL_HOURS)}h or ${LOCK_TTL_HOURS_ENV})', + ) + reap.add_argument('--dry-run', action='store_true', help='Report stale locks without removing them') + validate = sub.add_parser('validate', help='Validate staged files are locked by branch') validate.add_argument('--branch', required=True, help='Owner branch name') add_agent_arg(validate) @@ -571,6 +705,8 @@ def dispatch_command(args: argparse.Namespace, repo_root: Path) -> int: return cmd_allow_delete(args, repo_root) if args.command == 'release': return cmd_release(args, repo_root) + if args.command == 'reap': + return cmd_reap(args, repo_root) if args.command == 'status': return cmd_status(args, repo_root) if args.command == 'validate': @@ -589,7 +725,7 @@ def main() -> int: # Serialize state-changing commands (and validate's snapshot read) across # ALL worktrees with one shared lock, so concurrent runs can't clobber # each other or both win the same file. status is a pure read -> unlocked. - if args.command in {'claim', 'allow-delete', 'release', 'validate'}: + if args.command in {'claim', 'allow-delete', 'release', 'reap', 'validate'}: with cross_worktree_lock(repo_root): return dispatch_command(args, repo_root) return dispatch_command(args, repo_root) diff --git a/test/agent-file-locks-reap.test.js b/test/agent-file-locks-reap.test.js new file mode 100644 index 0000000..a972a18 --- /dev/null +++ b/test/agent-file-locks-reap.test.js @@ -0,0 +1,101 @@ +// `gx locks reap` clears locks held by ABANDONED worktrees (present on disk, +// idle past the TTL, no live process inside). 
Dead worktrees self-clean because +// their lock file lives inside them; this targets the lingering-but-idle lane +// that otherwise blocks a file forever. The caller's own worktree is always +// "live" (a running process sits in it), so reap never clears active locks. + +const { + test, + assert, + fs, + path, + cp, + initRepo, + seedCommit, + runHumanCmd, + defineSpawnSuite, +} = require('./helpers/install-test-helpers'); + +const LOCK_PY = path.resolve(__dirname, '..', 'scripts', 'agent-file-locks.py'); +const T0 = 1_700_000_000; // fixed base epoch for deterministic claim ages + +function lockTool(args, cwd, nowEpoch) { + return cp.spawnSync('python3', [LOCK_PY, ...args], { + cwd, + encoding: 'utf8', + env: { ...process.env, GUARDEX_LOCK_NOW_EPOCH: String(nowEpoch) }, + }); +} + +// Create a managed worktree and claim a file from INSIDE it, stamped at `atEpoch`. +function makeLaneWithClaim(repoDir, branch, file, atEpoch) { + const wt = path.join(repoDir, '.omc', 'agent-worktrees', branch.replace(/\//g, '__')); + assert.equal(runHumanCmd('git', ['worktree', 'add', '-b', branch, wt, 'main'], repoDir).status, 0); + fs.writeFileSync(path.join(wt, file), 'x\n'); + const claim = lockTool(['claim', '--branch', branch, file], wt, atEpoch); + assert.equal(claim.status, 0, claim.stderr || claim.stdout); + return wt; +} + +function lockEntries(wt) { + const p = path.join(wt, '.omx', 'state', 'agent-file-locks.json'); + if (!fs.existsSync(p)) return {}; + return JSON.parse(fs.readFileSync(p, 'utf8')).locks || {}; +} + +defineSpawnSuite('agent-file-locks reap', () => { + test('reaps a stale lock from an idle sibling worktree', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + const wt = makeLaneWithClaim(repoDir, 'agent/test/stale', 'fileA.txt', T0); + assert.ok(lockEntries(wt)['fileA.txt'], 'precondition: lock recorded'); + + // Reap from the PRIMARY repo 2h later, ttl 1h => the sibling lane is idle, + // has no live process, and is past TTL 
=> its lock is cleared. + const res = lockTool(['reap', '--ttl-hours', '1'], repoDir, T0 + 2 * 3600); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /reaped 1 stale lock\(s\)/); + assert.equal(lockEntries(wt)['fileA.txt'], undefined, 'stale lock should be removed'); + }); + + test('does not reap a lock that is still within TTL', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + const wt = makeLaneWithClaim(repoDir, 'agent/test/fresh', 'fileA.txt', T0); + + // Only 30 min later with a 1h TTL => not stale yet. + const res = lockTool(['reap', '--ttl-hours', '1'], repoDir, T0 + 1800); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /no stale locks/); + assert.ok(lockEntries(wt)['fileA.txt'], 'fresh lock should survive'); + }); + + test('--dry-run reports but never removes', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + const wt = makeLaneWithClaim(repoDir, 'agent/test/stale', 'fileA.txt', T0); + + const res = lockTool(['reap', '--ttl-hours', '1', '--dry-run'], repoDir, T0 + 2 * 3600); + assert.equal(res.status, 0, res.stderr); + assert.match(res.stdout, /\[dry-run\] would reap 1 stale lock/); + assert.ok(lockEntries(wt)['fileA.txt'], 'dry-run must not remove the lock'); + }); + + test('a blocked claim against a stale lock surfaces the reap hint', () => { + const repoDir = initRepo({ branch: 'main' }); + seedCommit(repoDir); + makeLaneWithClaim(repoDir, 'agent/test/owner', 'shared.txt', T0); + + // A different branch tries to claim the same file long after the TTL. + // claim has no --ttl-hours flag; it reads GUARDEX_LOCK_TTL_HOURS instead. 
+ fs.writeFileSync(path.join(repoDir, 'shared.txt'), 'x\n'); + const res = cp.spawnSync('python3', [LOCK_PY, 'claim', '--branch', 'agent/test/newcomer', 'shared.txt'], { + cwd: repoDir, + encoding: 'utf8', + env: { ...process.env, GUARDEX_LOCK_NOW_EPOCH: String(T0 + 5 * 3600), GUARDEX_LOCK_TTL_HOURS: '1' }, + }); + assert.equal(res.status, 1, 'conflicting claim must fail'); + assert.match(res.stderr, /locked by/); + assert.match(res.stderr, /gx locks reap/, 'should hint reap for a stale blocking lock'); + }); +}); From 680e6d81b81e1cb6e88102aab382ce16dff92bde Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:27:39 +0200 Subject: [PATCH 3/6] feat(finish): sweep merged-but-stranded worktrees after 'gx finish --all' After a bulk --all finish, reap merged-but-stranded worktree dirs whose branch merged out-of-band and was never cleaned (the post-merge 'retained for now' gap). Gated by a pure shouldSweepOrphans predicate: only --all, only on full success (failed===0), never on a dry run, opt-out via --no-sweep-orphans. The sweep delegates to the existing 'gx worktree prune --include-pr-merged --delete-branches' primitive and is best-effort (a failure warns, never fails the finish). Guard is unit-tested directly since the full PR finish flow needs a GitHub host unavailable in unit runs. --- src/cli/args.js | 12 +++++++++++ src/finish/index.js | 37 ++++++++++++++++++++++++++++++++ test/cli-args-dispatch.test.js | 9 ++++++++ test/finish-orphan-sweep.test.js | 31 ++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) create mode 100644 test/finish-orphan-sweep.test.js diff --git a/src/cli/args.js b/src/cli/args.js index d4424fe..5dcea8a 100644 --- a/src/cli/args.js +++ b/src/cli/args.js @@ -1171,6 +1171,10 @@ function parseFinishArgs(rawArgs, defaults = {}) { gateReview: defaults.gateReview ?? 
autoShip, reviewProvider: defaults.reviewProvider || 'codex', allowNoChecks: false, + // After a bulk `--all` finish, sweep merged-but-stranded worktree dirs whose + // branch was merged out-of-band and never reaped (the post-merge "retained" + // gap). Only fires for `--all`; opt out with --no-sweep-orphans. + sweepOrphans: defaults.sweepOrphans ?? true, }; for (let index = 0; index < rawArgs.length; index += 1) { @@ -1215,6 +1219,14 @@ function parseFinishArgs(rawArgs, defaults = {}) { options.all = true; continue; } + if (arg === '--no-sweep-orphans') { + options.sweepOrphans = false; + continue; + } + if (arg === '--sweep-orphans') { + options.sweepOrphans = true; + continue; + } if (arg === '--dry-run') { options.dryRun = true; continue; diff --git a/src/finish/index.js b/src/finish/index.js index cd9bb0b..417e35a 100644 --- a/src/finish/index.js +++ b/src/finish/index.js @@ -324,6 +324,20 @@ function merge(rawArgs) { * @returns {void} * @throws {Error} When `--branch` references an unknown ref, or when any branch fails to finish (after the loop completes). */ +/** + * Decide whether a finish run should sweep merged-but-stranded worktree dirs + * after the per-lane loop. Only for bulk `--all`, never on a dry run, only when + * every lane succeeded (failed === 0), and honoring the --no-sweep-orphans + * opt-out. Pure so the guard can be tested without the gh/PR finish flow. + * + * @param {{all?: boolean, sweepOrphans?: boolean, dryRun?: boolean}} options + * @param {number} failed Count of lanes that failed to finish. 
+ * @returns {boolean} + */ +function shouldSweepOrphans(options, failed) { + return Boolean(options.all && options.sweepOrphans && !options.dryRun && failed === 0); +} + function finish(rawArgs, defaults = {}) { const activeCwd = process.cwd(); const options = parseFinishArgs(rawArgs, defaults); @@ -518,6 +532,28 @@ function finish(rawArgs, defaults = {}) { `[${TOOL_NAME}] Finish summary: total=${candidates.length}, success=${succeeded}, failed=${failed}, autoCommitted=${autoCommitted}`, ); + // Bulk `--all` finish self-cleans: sweep merged-but-stranded worktree dirs + // whose branch was merged out-of-band and never reaped (the post-merge + // "retained for now" gap in agent-branch-finish.sh). Only when every lane + // succeeded, never on a dry run, and opt-out via --no-sweep-orphans. The + // sweep is best-effort: a failure warns but does not fail the finish. + if (shouldSweepOrphans(options, failed)) { + const baseForSweep = options.base || candidates[0].baseBranch; + const sweepArgs = ['--include-pr-merged', '--delete-branches']; + if (baseForSweep) { + sweepArgs.push('--base', baseForSweep); + } + console.log(`[${TOOL_NAME}] Sweeping merged-but-stranded worktrees...`); + const sweep = runPackageAsset('worktreePrune', sweepArgs, { + cwd: repoRoot, + stdio: 'inherit', + env: { GUARDEX_PRUNE_ACTIVE_CWD: activeCwd }, + }); + if (sweep.status !== 0) { + console.error(`[${TOOL_NAME}] Warning: orphan sweep failed (non-fatal).`); + } + } + if (failed > 0) { throw new Error('finish command failed for one or more agent branches'); } @@ -701,4 +737,5 @@ module.exports = { finish, sync, autoCommitWorktreeForFinish, + shouldSweepOrphans, }; diff --git a/test/cli-args-dispatch.test.js b/test/cli-args-dispatch.test.js index adb79ab..11dc513 100644 --- a/test/cli-args-dispatch.test.js +++ b/test/cli-args-dispatch.test.js @@ -300,6 +300,15 @@ test('parseFinishArgs rejects non-agent branches and preserves explicit override assert.equal(options.commitMessage, 'Finish the active 
lane'); }); +test('parseFinishArgs sweepOrphans defaults on and toggles', () => { + assert.equal(parseFinishArgs([]).sweepOrphans, true); + assert.equal(parseFinishArgs(['--all']).sweepOrphans, true); + assert.equal(parseFinishArgs(['--all', '--no-sweep-orphans']).sweepOrphans, false); + assert.equal(parseFinishArgs(['--no-sweep-orphans', '--sweep-orphans']).sweepOrphans, true); + // Caller defaults can opt out without a flag. + assert.equal(parseFinishArgs([], { sweepOrphans: false }).sweepOrphans, false); +}); + test('dispatch helpers preserve suggestion, alias, deprecation, and flag extraction behavior', () => { assert.equal(maybeSuggestCommand('docto'), 'doctor'); diff --git a/test/finish-orphan-sweep.test.js b/test/finish-orphan-sweep.test.js new file mode 100644 index 0000000..2259522 --- /dev/null +++ b/test/finish-orphan-sweep.test.js @@ -0,0 +1,31 @@ +// `gx finish --all` self-cleans: after every lane finishes, it sweeps +// merged-but-stranded worktree dirs (branch merged out-of-band, never reaped — +// the post-merge "retained for now" gap). The sweep only fires for --all, only +// on full success, never on a dry run, and is opt-out via --no-sweep-orphans. +// +// The guard is extracted as a pure predicate so it can be tested without the +// gh/PR finish flow (which needs a real GitHub host, unavailable in unit runs — +// mirrors test/auto-finish-sweep-gate.test.js). 
+ +const test = require('node:test'); +const assert = require('node:assert/strict'); + +const { shouldSweepOrphans } = require('../src/finish/index'); +const { parseFinishArgs } = require('../src/cli/args'); + +const ALL = { all: true, sweepOrphans: true, dryRun: false }; + +test('sweep fires only for --all, on full success, not on dry-run', () => { + assert.equal(shouldSweepOrphans(ALL, 0), true); + assert.equal(shouldSweepOrphans({ ...ALL, all: false }, 0), false, 'single-branch finish never sweeps'); + assert.equal(shouldSweepOrphans({ ...ALL, dryRun: true }, 0), false, 'dry-run never sweeps'); + assert.equal(shouldSweepOrphans({ ...ALL, sweepOrphans: false }, 0), false, '--no-sweep-orphans opts out'); + assert.equal(shouldSweepOrphans(ALL, 1), false, 'a failed lane skips the sweep'); +}); + +test('parseFinishArgs feeds the guard: --all sweeps, --no-sweep-orphans does not', () => { + assert.equal(shouldSweepOrphans(parseFinishArgs(['--all']), 0), true); + assert.equal(shouldSweepOrphans(parseFinishArgs(['--all', '--no-sweep-orphans']), 0), false); + // Without --all the option is moot. + assert.equal(shouldSweepOrphans(parseFinishArgs([]), 0), false); +}); From e2ccdf977e363161bc3e62a646b0c6166d66a619 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:30:03 +0200 Subject: [PATCH 4/6] docs(openspec): spec + recovery doc for watcher, lock reap, orphan sweep Fill the OpenSpec proposal/spec/tasks (validates --strict, 133 specs pass) and correct .agent/STALLED-WORKTREE-RECOVERY.md to match the conservative watcher: --auto-merge reaps merged lanes only (never auto-commits/pushes/PRs un-reviewed work), real flags are --idle-minutes/--interval/--base, and 'gx locks reap' clears locks held by abandoned lanes. 
--- .agent/STALLED-WORKTREE-RECOVERY.md | 10 ++++-- .../.openspec.yaml | 2 ++ .../proposal.md | 19 +++++++++++ .../spec.md | 34 +++++++++++++++++++ .../tasks.md | 34 +++++++++++++++++++ 5 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/.openspec.yaml create mode 100644 openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/proposal.md create mode 100644 openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/specs/autofinish-watcher-lock-ttl-orphan-sweep-for-multi-agent-worktrees/spec.md create mode 100644 openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md diff --git a/.agent/STALLED-WORKTREE-RECOVERY.md b/.agent/STALLED-WORKTREE-RECOVERY.md index f5599cc..5d970a3 100644 --- a/.agent/STALLED-WORKTREE-RECOVERY.md +++ b/.agent/STALLED-WORKTREE-RECOVERY.md @@ -7,10 +7,14 @@ The Guardex Codex launcher auto-finishes a branch only when the codex CLI exits To act on the report: - Inspect: `bash scripts/agent-autofinish-watch.sh --once --dry-run` -- Auto-finish once (commit dirty changes, push, create PR, attempt merge): `bash scripts/agent-autofinish-watch.sh --once --auto-merge` -- Run the daemon (poll forever, auto-finish after `--idle-seconds`): `bash scripts/agent-autofinish-watch.sh --daemon --auto-merge` +- Reap merged lanes (prune worktrees whose PR already merged): `bash scripts/agent-autofinish-watch.sh --once --auto-merge` +- Run the daemon (poll forever, reaping merged lanes each cycle): `bash scripts/agent-autofinish-watch.sh --daemon --auto-merge --interval 300` -Defaults: `--idle-seconds=900` (15 min of file silence before auto-commit) and `--branch-prefix=agent/`. The watcher is conservative — it never touches branches outside the configured prefix and only commits worktrees whose files have stopped changing. 
+Flags: `--idle-minutes` (default 60, or `GUARDEX_AUTOFINISH_IDLE_MINUTES`) gates how long a lane must be quiet before it counts as stalled; `--interval` sets the daemon poll seconds; `--base` overrides the inferred base branch. + +The watcher is deliberately conservative. It only ever **reports** agent worktrees with unmerged work (committed-no-PR or uncommitted) — it never auto-commits, pushes, or opens a PR for un-reviewed work. `--auto-merge` only reaps lanes whose PR has already **merged** (delegating to `gx worktree prune --include-pr-merged --delete-branches`), which is what fixes the post-merge "retained for now" gap. Finishing an un-PR'd lane stays a manual `gx branch finish`. Healthy in-flight lanes (open PR, or a live process in the worktree) produce no output. + +A stalled lane that holds file locks can keep blocking other agents; clear those with `gx locks reap` (removes locks from worktrees idle past `--ttl-hours` / `GUARDEX_LOCK_TTL_HOURS`, default 7 days, with no live process inside). 
## Source-probe temp worktree cleanup diff --git a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/.openspec.yaml b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/.openspec.yaml new file mode 100644 index 0000000..d6b53de --- /dev/null +++ b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-30 diff --git a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/proposal.md b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/proposal.md new file mode 100644 index 0000000..4dae5cd --- /dev/null +++ b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/proposal.md @@ -0,0 +1,19 @@ +## Why + +Multi-agent worktree recovery had three gaps that strand state and block agents: + +- `scripts/agent-stalled-report.sh` (a `SessionStart` hook) wrapped `scripts/agent-autofinish-watch.sh`, but that watcher was **never authored**, so the hook soft-exited 0 and merged-PR worktrees (the "retained for now" path in `agent-branch-finish.sh`) were never reaped. +- File locks recorded `claimed_at` but had no expiry. A lingering-but-idle worktree (crashed or forgotten lane) keeps blocking other agents on its files forever. +- `gx finish --all` finished each lane but never swept merged-but-stranded worktree dirs whose branch merged out-of-band. + +## What Changes + +- Add `scripts/agent-autofinish-watch.sh`: scans agent worktrees, reports stalled lanes (work present, no open PR, past idle gate) and merged-but-retained lanes, and under `--auto-merge` reaps merged lanes via the existing `gx worktree prune` primitive. Resolves the primary checkout via the git common dir; healthy in-flight lanes stay silent. 
+- Add `gx locks reap`: clears locks from worktrees idle past a TTL (`--ttl-hours` / `GUARDEX_LOCK_TTL_HOURS`, default 7d) with no live process inside. A blocked `claim` against a past-TTL lock now hints at `reap`. +- `gx finish --all` sweeps merged orphans after a fully-successful run (opt-out `--no-sweep-orphans`, never on dry-run), gated by the pure `shouldSweepOrphans` predicate. + +## Impact + +- Affected surfaces: `scripts/agent-autofinish-watch.sh` (new), `templates/scripts/agent-file-locks.py`, `src/finish/index.js`, `src/cli/args.js`, `.agent/STALLED-WORKTREE-RECOVERY.md`. +- Conservative by design: `--auto-merge` only reaps **merged** lanes; it does not auto-commit/push/PR un-reviewed work. Finishing un-PR'd lanes stays a reported manual action. +- Follow-up (out of scope here, blocked by a foreign lock on `src/cli/commands/claude.js`): distribute the watcher + `agent-stalled-report.sh` to target repos (pair into `templates/scripts/`, register in `MANAGED_TEMPLATE_DESTINATIONS`, add the report to `MANAGED_HOOK_FILES`). The new `gx locks reap` can clear that very stale lock once it ages out. diff --git a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/specs/autofinish-watcher-lock-ttl-orphan-sweep-for-multi-agent-worktrees/spec.md b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/specs/autofinish-watcher-lock-ttl-orphan-sweep-for-multi-agent-worktrees/spec.md new file mode 100644 index 0000000..a157b28 --- /dev/null +++ b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/specs/autofinish-watcher-lock-ttl-orphan-sweep-for-multi-agent-worktrees/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: Stalled-lane watcher +The system SHALL provide `scripts/agent-autofinish-watch.sh`, which the `SessionStart` shim `scripts/agent-stalled-report.sh` invokes, to detect stalled agent worktrees and reap merged-but-retained lanes. 
It SHALL resolve the primary checkout via the git common dir so it operates correctly from inside any worktree, and SHALL emit a `[agent-autofinish-watch] agent/<lane>: <status>` line only for actionable lanes. + +#### Scenario: Stalled lane is reported +- **WHEN** an agent worktree has committed or uncommitted work, no open PR, and is idle past the idle gate +- **THEN** the watcher prints an actionable `agent/<lane>: ... -> needs finish` line +- **AND** a healthy in-flight lane (open PR or a live process) produces no line. + +#### Scenario: Merged lane is reaped under --auto-merge +- **WHEN** an agent branch's PR has merged but its worktree is still on disk +- **THEN** the watcher reports the lane as `prunable` +- **AND** with `--auto-merge` (and not `--dry-run`) it delegates to `gx worktree prune --include-pr-merged --delete-branches` to remove it. + +### Requirement: Stale lock reaping +The `gx locks` tool SHALL provide a `reap` subcommand that clears file locks held by abandoned worktrees: present on disk, idle beyond a TTL (`--ttl-hours`, `GUARDEX_LOCK_TTL_HOURS`, default 7 days), and with no live process inside. It SHALL never clear locks from a worktree that has a live process, and a blocked `claim` against a past-TTL lock SHALL surface a hint pointing at `gx locks reap`. + +#### Scenario: Abandoned lock is reaped +- **WHEN** `gx locks reap` runs and a sibling worktree holds a lock older than the TTL with no live process +- **THEN** that lock entry is removed from the sibling worktree's lock file +- **AND** `--dry-run` reports the same lock without removing it. + +#### Scenario: Active lock is preserved +- **WHEN** a lock is within the TTL, or its worktree has a live process +- **THEN** `reap` leaves the lock in place. + +### Requirement: Bulk-finish orphan sweep +`gx finish --all` SHALL sweep merged-but-stranded worktree dirs after the per-lane loop, but only when every lane succeeded, the run is not a dry run, and `--no-sweep-orphans` was not passed.
The sweep SHALL be best-effort: a sweep failure warns but does not fail the finish. + +#### Scenario: Sweep fires after a successful bulk finish +- **WHEN** `gx finish --all` completes with no failed lanes and `--no-sweep-orphans` is not set +- **THEN** it runs `gx worktree prune --include-pr-merged --delete-branches` +- **AND** with `--no-sweep-orphans`, `--dry-run`, a single-branch finish, or any failed lane, the sweep does not run. diff --git a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md new file mode 100644 index 0000000..70e27c9 --- /dev/null +++ b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md @@ -0,0 +1,34 @@ +## Definition of Done + +This change is complete only when **all** of the following are true: + +- Every checkbox below is checked. +- The agent branch reaches `MERGED` state on `origin` and the PR URL + state are recorded in the completion handoff. +- If any step blocks (test failure, conflict, ambiguous result), append a `BLOCKED:` line under section 4 explaining the blocker and **STOP**. Do not tick remaining cleanup boxes; do not silently skip the cleanup pipeline. + +## Handoff + +- Handoff: change=`agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05`; branch=`agent//`; scope=`TODO`; action=`continue this sandbox or finish cleanup after a usage-limit/manual takeover`. +- Copy prompt: Continue `agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05` on branch `agent//`. Work inside the existing sandbox, review `openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md`, continue from the current state instead of creating a new sandbox, and when the work is done run `gx branch finish --branch agent// --base dev --via-pr --wait-for-merge --cleanup`. + +## 1. 
Specification + +- [x] 1.1 Finalize proposal scope and acceptance criteria for `agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05`. +- [x] 1.2 Define normative requirements in `specs/autofinish-watcher-lock-ttl-orphan-sweep-for-multi-agent-worktrees/spec.md`. + +## 2. Implementation + +- [x] 2.1 Implement scoped behavior changes. +- [x] 2.2 Add/update focused regression coverage. + +## 3. Verification + +- [ ] 3.1 Run targeted project verification commands. +- [ ] 3.2 Run `openspec validate agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05 --type change --strict`. +- [ ] 3.3 Run `openspec validate --specs`. + +## 4. Cleanup (mandatory; run before claiming completion) + +- [ ] 4.1 Run the cleanup pipeline: `gx branch finish --branch agent// --base dev --via-pr --wait-for-merge --cleanup`. This handles commit -> push -> PR create -> merge wait -> worktree prune in one invocation. +- [ ] 4.2 Record the PR URL and final merge state (`MERGED`) in the completion handoff. +- [ ] 4.3 Confirm the sandbox worktree is gone (`git worktree list` no longer shows the agent path; `git branch -a` shows no surviving local/remote refs for the branch). From c02eb0b1638d779c649864900e84602d8859de37 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:45:47 +0200 Subject: [PATCH 5/6] fix(agents): guard value-taking watcher flags against shift-2 crash Review found that --interval/--idle-minutes/--base ran `shift 2` even when the flag was the last argument; under `set -euo pipefail` that exits 1 and silently kills the script (the daemon would die on startup with no output). Guard each with `[[ $# -ge 2 ]]` and a clear error. Also correct the dynamic-scope comment and document the 200-PR detection cap in --help. 
--- scripts/agent-autofinish-watch.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/agent-autofinish-watch.sh b/scripts/agent-autofinish-watch.sh index c9f42b3..d4d4103 100755 --- a/scripts/agent-autofinish-watch.sh +++ b/scripts/agent-autofinish-watch.sh @@ -41,11 +41,19 @@ while [[ $# -gt 0 ]]; do --daemon) MODE="daemon"; shift ;; --dry-run) DRY_RUN=1; shift ;; --auto-merge) AUTO_MERGE=1; shift ;; - --interval) INTERVAL="${2:-300}"; shift 2 ;; - --idle-minutes) IDLE_MINUTES="${2:-60}"; shift 2 ;; - --base) BASE_BRANCH="${2:-}"; shift 2 ;; + --interval) + [[ $# -ge 2 ]] || { echo "[agent-autofinish-watch] --interval requires a value" >&2; exit 1; } + INTERVAL="$2"; shift 2 ;; + --idle-minutes) + [[ $# -ge 2 ]] || { echo "[agent-autofinish-watch] --idle-minutes requires a value" >&2; exit 1; } + IDLE_MINUTES="$2"; shift 2 ;; + --base) + [[ $# -ge 2 ]] || { echo "[agent-autofinish-watch] --base requires a value" >&2; exit 1; } + BASE_BRANCH="$2"; shift 2 ;; -h|--help) echo "Usage: $0 [--once|--daemon] [--dry-run] [--auto-merge] [--interval SEC] [--idle-minutes MIN] [--base BRANCH]" + echo "Note: merged/open PR detection reads the most recent 200 PRs per state; a" + echo " branch whose merged PR is older than that will not be auto-reaped." exit 0 ;; *) @@ -206,8 +214,8 @@ run_once() { "$scanned" "$stalled" "$merged" "$reaped" } -# process_lane mutates scanned/stalled/merged/reaped/merged_lanes in the caller -# scope (bash dynamic scope via run_once locals). +# process_lane mutates scanned/stalled/merged in the caller scope (bash dynamic +# scope via run_once's locals); reaped is a top-level global set by reap_merged. 
process_lane() { local wt="$1" branch="$2" [[ -n "$wt" && -n "$branch" ]] || return 0 From 64b615da65fdba4f48116f15b4aa3ded34e116a7 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Tue, 30 Jun 2026 02:46:12 +0200 Subject: [PATCH 6/6] chore(openspec): tick verification tasks for multi-agent worktree change --- .../tasks.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md index 70e27c9..534cbc5 100644 --- a/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md +++ b/openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md @@ -8,7 +8,7 @@ This change is complete only when **all** of the following are true: ## Handoff -- Handoff: change=`agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05`; branch=`agent//`; scope=`TODO`; action=`continue this sandbox or finish cleanup after a usage-limit/manual takeover`. +- Handoff: change=`agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05`; branch=`agent//`; scope=`watcher + lock reap + finish --all orphan sweep; distribution to target repos is a follow-up (blocked by foreign lock on src/cli/commands/claude.js)`; action=`continue this sandbox or finish cleanup after a usage-limit/manual takeover`. - Copy prompt: Continue `agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05` on branch `agent//`. Work inside the existing sandbox, review `openspec/changes/agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05/tasks.md`, continue from the current state instead of creating a new sandbox, and when the work is done run `gx branch finish --branch agent// --base dev --via-pr --wait-for-merge --cleanup`. ## 1. 
Specification @@ -23,9 +23,9 @@ This change is complete only when **all** of the following are true: ## 3. Verification -- [ ] 3.1 Run targeted project verification commands. -- [ ] 3.2 Run `openspec validate agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05 --type change --strict`. -- [ ] 3.3 Run `openspec validate --specs`. +- [x] 3.1 Run targeted project verification commands. +- [x] 3.2 Run `openspec validate agent-claude-autofinish-watcher-lock-ttl-orphan-sweep-2026-06-30-02-05 --type change --strict`. +- [x] 3.3 Run `openspec validate --specs`. ## 4. Cleanup (mandatory; run before claiming completion)