From f39e62b51b07fa00f02771fc98a88eeb3afa8f60 Mon Sep 17 00:00:00 2001 From: Alex Fedotyev <61838744+alex-fedotyev@users.noreply.github.com> Date: Tue, 5 May 2026 13:04:30 +0000 Subject: [PATCH] agent/engine: guard against stale sdk_session_id when .jsonl is missing When a Nerve container is restarted without persisting /root/.claude, the .jsonl conversation files are wiped but the DB still holds the old sdk_session_id. On the next turn the CLI is invoked with --resume and immediately exits 1 with "No conversation found with session ID", which hangs the session mid-turn. Two fixes: 1. Engine guard (_sdk_resume_file_exists): before passing --resume to the CLI, check that the .jsonl exists. If it is missing, clear the stale sdk_session_id from the DB and start a fresh conversation instead of crashing. Forks are exempt because their context is owned by the source session. 2. Bootstrap (.claude mount): add ~/.nerve/claude:/root/.claude to the generated docker-compose so conversation history survives restarts. The mount is siloed under ~/.nerve to keep the agent's CLI state separate from the host user's personal ~/.claude directory. Tests cover file-present, file-absent, fail-open on exception, and workspace-slash encoding. Co-Authored-By: Claude Opus --- nerve/agent/engine.py | 60 +++++++++++++++++++++++++++++++++++++++++ nerve/bootstrap.py | 18 ++++++++++--- tests/test_engine.py | 62 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 3 deletions(-) diff --git a/nerve/agent/engine.py b/nerve/agent/engine.py index f48c948..2f0b371 100644 --- a/nerve/agent/engine.py +++ b/nerve/agent/engine.py @@ -1092,6 +1092,39 @@ def _is_client_dead(client: ClaudeSDKClient) -> bool: return True return process.returncode is not None + def _sdk_resume_file_exists(self, sdk_session_id: str) -> bool: + """Check whether Claude Code still has the conversation .jsonl + for the given SDK session ID on this filesystem. 
+ + The CLI stores history at:: + + ~/.claude/projects/{encoded-cwd}/{session-id}.jsonl + + where {encoded-cwd} is the absolute cwd path with every '/' + replaced by '-'. If the file is gone (typically because the + container's /root/.claude was not bind-mounted and got wiped on + restart), passing --resume to the CLI fails with exit 1. + + Best-effort check: any unexpected error returns True so we still + attempt the resume and let the CLI surface the real error, + rather than masking unrelated bugs. + """ + try: + cwd = str(self.config.workspace) + encoded = cwd.replace("/", "-") + jsonl = ( + os.path.expanduser("~/.claude/projects") + + "/" + encoded + + "/" + sdk_session_id + ".jsonl" + ) + return os.path.isfile(jsonl) + except Exception as e: + logger.debug( + "Could not stat resume jsonl for %s: %s, assuming present", + sdk_session_id[:12], e, + ) + return True + async def _get_or_create_client( self, session_id: str, source: str, model: str | None, fork_from: str | None = None, @@ -1130,6 +1163,33 @@ async def _get_or_create_client( if fork_from and not sdk_resume_id: sdk_resume_id = fork_from + # Defensive: verify the resume target's conversation .jsonl + # actually exists before passing it to the CLI. Claude Code + # stores conversation history in /root/.claude/projects/ + # {encoded-cwd}/{session-id}.jsonl. If that directory is + # not bind-mounted from the host, a container restart wipes + # the .jsonl files but the Nerve DB still holds the stale + # sdk_session_id; the CLI dies with "No conversation + # found with session ID" exit 1. + # + # When the file is missing, clear the stale id and start a + # fresh conversation rather than crashing the turn. Forks + # are exempt: the source session's context lives in the + # source's row, and a fresh fork has nothing to recover to. + if sdk_resume_id and not fork_from: + if not self._sdk_resume_file_exists(sdk_resume_id): + logger.warning( + "Session %s resume target %s.jsonl is missing; " + "starting a fresh CLI conversation. 
This usually " + "means /root/.claude was not persisted across a " + "container restart.", + session_id, sdk_resume_id[:12], + ) + await self.db.update_session_fields( + session_id, {"sdk_session_id": None}, + ) + sdk_resume_id = None + if sdk_resume_id: logger.info( "Resuming session %s with SDK session %s", diff --git a/nerve/bootstrap.py b/nerve/bootstrap.py index 8ec567f..8f0c05f 100644 --- a/nerve/bootstrap.py +++ b/nerve/bootstrap.py @@ -1881,18 +1881,22 @@ def _build_docker_compose( extra_mounts: Additional host:container mount pairs (e.g. ["~/code:/code"]). """ # Required mounts (always present) + # ~/.nerve/claude:/root/.claude persists Claude Code's conversation + # .jsonl files across container restarts. Without this mount the files + # are wiped on every `docker compose down/up` and the Nerve DB's stale + # sdk_session_id rows fail every --resume with "No conversation found" + # exit 1. Siloed under ~/.nerve so the agent's CLI is isolated from + # the host user's personal ~/.claude (macOS keeps OAuth tokens in Keychain). volumes = [ ".:/nerve", "~/.nerve:/root/.nerve", + "~/.nerve/claude:/root/.claude", f"{workspace_path}:/root/nerve-workspace", ] # Optional auth mounts — only include if the host directory exists. # Docker would create missing dirs as root-owned empties, which # confuses the tools and pollutes the host filesystem. - # Note: ~/.claude is NOT mounted — macOS stores OAuth tokens in the - # system Keychain, not on disk. The entrypoint exports ANTHROPIC_API_KEY - # from config.local.yaml instead, which the claude CLI picks up. _optional_mounts = [ ("~/.config/gh", "/root/.config/gh", "gh CLI auth"), ("~/.config/gog", "/root/.config/gog", "gog CLI auth"), @@ -1960,6 +1964,14 @@ def _build_docker_compose( [ -n "$_gh" ] && export GH_TOKEN="$_gh" fi +# Ensure the persisted Claude Code state dir exists and is writable +# before any tool that touches /root/.claude runs. 
The bind mount in +# docker-compose creates it as an empty dir on first boot if missing; +# we only set mode 0700 here (ownership is left as-is) so the CLI can drop its config +# file and projects/ tree there without ENOENT or EACCES. +mkdir -p /root/.claude +chmod 700 /root/.claude + # Clean up stale PID file from previous container runs rm -f ~/.nerve/nerve.pid diff --git a/tests/test_engine.py b/tests/test_engine.py index d18a9db..92d992a 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,5 +1,9 @@ """Tests for nerve.agent.engine — pure helpers (no SDK state).""" +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import patch + import pytest from nerve.agent.engine import AgentEngine @@ -41,3 +45,61 @@ def test_effective_effort(value, model, expected): def test_effective_effort_model_default_none(): # Signature symmetry with _parse_thinking_config assert AgentEngine._effective_effort("max") == "max" + + +# --------------------------------------------------------------------------- +# _sdk_resume_file_exists +# --------------------------------------------------------------------------- + +def _make_engine(workspace: str = "/root/nerve-workspace") -> AgentEngine: + """Minimal AgentEngine stub (only config.workspace is needed).""" + engine = AgentEngine.__new__(AgentEngine) + engine.config = SimpleNamespace(workspace=Path(workspace)) + return engine + + +class TestSdkResumeFileExists: + def test_returns_true_when_file_present(self): + engine = _make_engine() + with patch("nerve.agent.engine.os.path.isfile", return_value=True): + assert engine._sdk_resume_file_exists("some-session-id") is True + + def test_returns_false_when_file_missing(self): + engine = _make_engine() + with patch("nerve.agent.engine.os.path.isfile", return_value=False): + assert engine._sdk_resume_file_exists("some-session-id") is False + + def test_fail_open_on_exception(self): + """Any unexpected error returns True rather than crashing the turn.""" + engine = 
_make_engine() + with patch("nerve.agent.engine.os.path.isfile", side_effect=OSError("denied")): + assert engine._sdk_resume_file_exists("some-session-id") is True + + def test_path_encodes_workspace_slashes(self): + """'/' in the workspace path are replaced with '-' in the projects subdir.""" + engine = _make_engine("/root/nerve-workspace") + captured: dict = {} + + def _capture(path: str) -> bool: + captured["path"] = path + return True + + with patch("nerve.agent.engine.os.path.isfile", side_effect=_capture): + engine._sdk_resume_file_exists("sid-abc") + + assert "-root-nerve-workspace" in captured["path"] + assert "sid-abc.jsonl" in captured["path"] + + def test_path_ends_with_jsonl(self): + """The constructed path always ends with .jsonl.""" + engine = _make_engine("/workspace") + captured: dict = {} + + def _capture(path: str) -> bool: + captured["path"] = path + return True + + with patch("nerve.agent.engine.os.path.isfile", side_effect=_capture): + engine._sdk_resume_file_exists("myid") + + assert captured["path"].endswith("myid.jsonl")