From ac55f03adc60262a1ae8fde199cd1f9a521565d1 Mon Sep 17 00:00:00 2001 From: Alex Fedotyev <61838744+alex-fedotyev@users.noreply.github.com> Date: Wed, 6 May 2026 03:45:54 +0000 Subject: [PATCH] agent/sessions: keep channel sticky to active sessions on hang Symptom: a chat that hangs gets orphaned. The user types "continue" to nudge the stuck session and the next message lands in a brand-new, empty session instead. All context from the original session is gone. Root cause: get_active_session checks _is_within_sticky_period before returning the channel's mapped session, and _is_within_sticky_period only compares last_activity_at (falling back to updated_at when it is unset) against the 120-min cutoff. last_activity_at is written by mark_active(), which fires from engine.run at two moments: when a fresh client is created, and after a turn completes successfully. A hung turn never reaches the post-turn mark_active, so last_activity_at freezes at turn-start. Once 120 min elapse from that frozen timestamp, the cutoff trips, _is_within_sticky_period returns False, and get_active_session mints a new session id and overwrites the channel_sessions row. The hung session is now orphaned. Web sessions don't see this because gateway/server.py routes through get_last_session, which has no sticky check. Fix: treat any session with status=active as sticky regardless of the timestamp. An active session is by definition the channel's current owner; either it's making progress (and the next mark_active will refresh the timestamp) or it's hung. The engine's idle-message timeout in receive_response (cli_idle_timeout_seconds, default 15 min, shipped in #66) bounds how long a truly hung session can hold the channel: when the timeout fires the engine flips the session to idle/error in its exception path, and the next inbound message after that point falls through to the time-based check and rolls over to a fresh session normally. 
Tests: - test_auto_session_reused_when_active_despite_old_timestamp: an active session with a back-dated last_activity_at is still reused. - test_auto_session_rotated_when_idle_after_sticky_period: once the hung session has been recovered to idle, the time-based cutoff applies and the channel rolls over. The existing test_auto_session_rotated_after_sticky_period continues to cover the rotation path for sessions that were never marked active in the first place. --- nerve/agent/sessions.py | 26 ++++++++++++++++++---- tests/test_sessions.py | 49 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/nerve/agent/sessions.py b/nerve/agent/sessions.py index aa6734f..74bd07b 100644 --- a/nerve/agent/sessions.py +++ b/nerve/agent/sessions.py @@ -205,9 +205,9 @@ async def get_active_session( ) -> str: """Get or create the active session for a channel. - If the channel has a mapped session with activity within the sticky - period, reuse it. Otherwise, create a fresh session and map the - channel to it. + Reuses the channel's mapped session when it is currently active + (a turn is in flight) or when its last activity falls within the + sticky period. Otherwise creates a fresh session and remaps. """ row = await self.db.get_channel_session(channel_key) if row: @@ -222,7 +222,25 @@ async def get_active_session( return session_id def _is_within_sticky_period(self, session: dict) -> bool: - """Check if a session had activity within the sticky period.""" + """Check whether a session is still the channel's owner. + + Active sessions are always sticky regardless of the timestamp. + A turn that hangs never reaches mark_active() at engine.run's + end, so last_activity_at freezes at turn-start; without this + carve-out, a hang lasting longer than sticky_period_minutes + would orphan the session and route the user's follow-up + message into a fresh, empty one. 
The engine's idle-message + timeout in receive_response (cli_idle_timeout_seconds) bounds + how long a truly hung session can hold the channel before it + flips to idle/error and the next message can mint a fresh + session. + + Idle sessions fall back to the time-based check so channels + that have been quiet for longer than sticky_period_minutes + roll over to a new session as before. + """ + if session.get("status") == SessionStatus.ACTIVE.value: + return True ts = session.get("last_activity_at") or session.get("updated_at") if not ts: return False diff --git a/tests/test_sessions.py b/tests/test_sessions.py index 48295a7..59d7a8d 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -127,6 +127,55 @@ async def test_auto_session_rotated_after_sticky_period(self, sm: SessionManager sid2 = await sm.get_active_session("telegram:222", source="telegram") assert sid1 != sid2 + async def test_auto_session_reused_when_active_despite_old_timestamp( + self, sm: SessionManager, db: Database, + ): + """An active session keeps the channel even if last_activity_at is stale. + + A hung turn never reaches mark_active() at engine.run's end, so + last_activity_at freezes at turn-start. Without the active-status + carve-out in _is_within_sticky_period, a hang lasting longer than + sticky_period_minutes would orphan the session and route the + user's follow-up message into a fresh, empty one. + """ + sid1 = await sm.get_active_session("telegram:333", source="telegram") + # Mark active and back-date timestamps to look like a hung turn that + # started long before the sticky-period cutoff. 
+ await sm.mark_active(sid1, sdk_session_id="sdk-stuck") + await db.update_session_fields(sid1, { + "last_activity_at": "2020-01-01T00:00:00+00:00", + }) + await db.db.execute( + "UPDATE sessions SET updated_at = '2020-01-01T00:00:00' WHERE id = ?", + (sid1,), + ) + await db.db.commit() + sid2 = await sm.get_active_session("telegram:333", source="telegram") + assert sid1 == sid2 + + async def test_auto_session_rotated_when_idle_after_sticky_period( + self, sm: SessionManager, db: Database, + ): + """Idle sessions still roll over after the sticky period. + + Once a hung session has been recovered (status flipped to idle by + the engine's exception path), the time-based cutoff applies again + and a new follow-up message mints a fresh session. + """ + sid1 = await sm.get_active_session("telegram:444", source="telegram") + await sm.mark_active(sid1, sdk_session_id="sdk-x") + await sm.mark_idle(sid1) + await db.update_session_fields(sid1, { + "last_activity_at": "2020-01-01T00:00:00+00:00", + }) + await db.db.execute( + "UPDATE sessions SET updated_at = '2020-01-01T00:00:00' WHERE id = ?", + (sid1,), + ) + await db.db.commit() + sid2 = await sm.get_active_session("telegram:444", source="telegram") + assert sid1 != sid2 + async def test_set_and_get_active_session(self, sm: SessionManager, db: Database): await sm.get_or_create("ch-1") await sm.set_active_session("telegram:123", "ch-1")