Skip to content

Commit d36bc1d

Browse files
RichardAtCT and haripatel07
authored and committed
feat: add exponential backoff retry for transient SDK errors (rebased RichardAtCT#127) (RichardAtCT#170)
* feat: add exponential backoff retry for transient SDK errors

Closes RichardAtCT#60 - adds configurable retry logic to ClaudeSDKManager.execute_command() for transient CLIConnectionError failures (non-MCP).

Changes:
- src/utils/constants.py: 4 new retry default constants
- src/config/settings.py: 4 new settings fields (claude_retry_max_attempts, claude_retry_base_delay, claude_retry_backoff_factor, claude_retry_max_delay)
- src/claude/sdk_integration.py: _is_retryable_error() helper + retry loop wrapping asyncio.wait_for() in execute_command()

Retry decision:
- CLIConnectionError (non-MCP): retried with exponential backoff
- asyncio.TimeoutError: not retried (user-configured timeout, intentional)
- CLINotFoundError, ProcessError, CLIJSONDecodeError: not retried

Default backoff: 1s → 3s → 9s, capped at 30s (CLAUDE_RETRY_MAX_ATTEMPTS=0 disables)

Tests: 491 passed, 0 failed

* fix: address review nits - clean retry loop, tighter MCP filter, field validation, retry tests
* fix: reset messages per retry, document timeout bypass and ge=0 delay semantics
* fix: add missing last_exc variable in retry loop

---------

Co-authored-by: Hari Patel <patelhariv18@gmail.com>
1 parent 7724c78 commit d36bc1d

4 files changed

Lines changed: 204 additions & 31 deletions

File tree

src/claude/sdk_integration.py

Lines changed: 82 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,16 @@ def __init__(
259259
else:
260260
logger.info("No API key provided, using existing Claude CLI authentication")
261261

262+
def _is_retryable_error(self, exc: BaseException) -> bool:
    """Decide whether a failed SDK call is worth another attempt.

    Only a ``CLIConnectionError`` whose message does not mention MCP is
    treated as transient. Everything else is reported as non-retryable,
    including ``asyncio.TimeoutError``: that one reflects the
    user-configured timeout and is deliberately left alone.

    Args:
        exc: The exception raised by the failed attempt.

    Returns:
        True if the error is considered transient, False otherwise.
    """
    if not isinstance(exc, CLIConnectionError):
        return False
    # Match on "mcp" only -- "server" alone is too broad.
    return "mcp" not in str(exc).lower()
271+
262272
async def execute_command(
263273
self,
264274
prompt: str,
@@ -393,43 +403,84 @@ async def _run_client() -> None:
393403
finally:
394404
await client.disconnect()
395405

396-
# Execute: race client against timeout and optional interrupt
397-
run_task = asyncio.create_task(_run_client())
406+
# Execute with timeout and retry, racing against optional interrupt
407+
max_attempts = max(1, self.config.claude_retry_max_attempts)
408+
last_exc: Optional[BaseException] = None
409+
410+
for attempt in range(max_attempts):
411+
# Reset message accumulator each attempt so that a failed attempt
412+
# does not pollute the next one with partial/duplicate messages.
413+
# _run_client() closes over `messages` by reference (late-binding
414+
# closure), so clearing it here is seen by every new call.
415+
messages.clear()
416+
417+
if attempt > 0:
418+
delay = min(
419+
self.config.claude_retry_base_delay
420+
* (self.config.claude_retry_backoff_factor ** (attempt - 1)),
421+
self.config.claude_retry_max_delay,
422+
)
423+
logger.warning(
424+
"Retrying Claude SDK command",
425+
attempt=attempt + 1,
426+
max_attempts=max_attempts,
427+
delay_seconds=delay,
428+
)
429+
await asyncio.sleep(delay)
398430

399-
interrupt_watcher: Optional["asyncio.Task[None]"] = None
400-
if interrupt_event is not None:
431+
run_task = asyncio.create_task(_run_client())
401432

402-
async def _cancel_on_interrupt() -> None:
403-
nonlocal interrupted
404-
await interrupt_event.wait()
405-
interrupted = True
406-
run_task.cancel()
433+
interrupt_watcher: Optional["asyncio.Task[None]"] = None
434+
if interrupt_event is not None:
407435

408-
interrupt_watcher = asyncio.create_task(_cancel_on_interrupt())
436+
async def _cancel_on_interrupt() -> None:
437+
nonlocal interrupted
438+
await interrupt_event.wait()
439+
interrupted = True
440+
run_task.cancel()
409441

410-
try:
411-
await asyncio.wait_for(
412-
asyncio.shield(run_task),
413-
timeout=self.config.claude_timeout_seconds,
414-
)
415-
except asyncio.CancelledError:
416-
if not interrupted:
417-
raise
418-
# Interrupt cancelled the task — wait for cleanup
419-
try:
420-
await run_task
421-
except asyncio.CancelledError:
422-
pass
423-
except asyncio.TimeoutError:
424-
run_task.cancel()
442+
interrupt_watcher = asyncio.create_task(_cancel_on_interrupt())
443+
444+
# Note: asyncio.TimeoutError is intentionally NOT retried —
445+
# it reflects a user-configured hard limit.
425446
try:
426-
await run_task
447+
await asyncio.wait_for(
448+
asyncio.shield(run_task),
449+
timeout=self.config.claude_timeout_seconds,
450+
)
451+
break # success — exit retry loop
427452
except asyncio.CancelledError:
428-
pass
429-
raise
430-
finally:
431-
if interrupt_watcher is not None:
432-
interrupt_watcher.cancel()
453+
if not interrupted:
454+
raise
455+
# Interrupt cancelled the task — wait for cleanup
456+
try:
457+
await run_task
458+
except asyncio.CancelledError:
459+
pass
460+
break # user interrupted — don't retry
461+
except asyncio.TimeoutError:
462+
run_task.cancel()
463+
try:
464+
await run_task
465+
except asyncio.CancelledError:
466+
pass
467+
raise # timeout — don't retry
468+
except CLIConnectionError as exc:
469+
if self._is_retryable_error(exc) and attempt < max_attempts - 1:
470+
last_exc = exc
471+
logger.warning(
472+
"Transient connection error, will retry",
473+
attempt=attempt + 1,
474+
error=str(exc),
475+
)
476+
continue
477+
raise # non-retryable or attempts exhausted
478+
finally:
479+
if interrupt_watcher is not None:
480+
interrupt_watcher.cancel()
481+
else:
482+
if last_exc is not None:
483+
raise last_exc
433484

434485
# Extract cost, tools, and session_id from result message
435486
cost = 0.0

src/config/settings.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
DEFAULT_RATE_LIMIT_BURST,
2727
DEFAULT_RATE_LIMIT_REQUESTS,
2828
DEFAULT_RATE_LIMIT_WINDOW,
29+
DEFAULT_RETRY_BACKOFF_FACTOR,
30+
DEFAULT_RETRY_BASE_DELAY,
31+
DEFAULT_RETRY_MAX_ATTEMPTS,
32+
DEFAULT_RETRY_MAX_DELAY,
2933
DEFAULT_SESSION_TIMEOUT_HOURS,
3034
)
3135

@@ -121,6 +125,34 @@ class Settings(BaseSettings):
121125
description="List of explicitly disallowed Claude tools/commands",
122126
)
123127

128+
# Retry settings
#
# These four fields drive the retry loop in ClaudeSDKManager.execute_command().
# The descriptions below state what that loop actually does with each value.

# Total number of attempts, not the number of *extra* attempts: the loop runs
# `max(1, claude_retry_max_attempts)` times, so 0 and 1 both mean "try once".
claude_retry_max_attempts: int = Field(
    DEFAULT_RETRY_MAX_ATTEMPTS,
    ge=0,
    description=(
        "Max total attempts (initial call plus retries) for transient SDK "
        "errors (0 or 1 = retries disabled)"
    ),
)
# Delay before the first retry; later retries multiply it by the backoff
# factor. ge=0 is intentional so that immediate retries can be configured.
claude_retry_base_delay: float = Field(
    DEFAULT_RETRY_BASE_DELAY,
    ge=0,
    description=(
        "Base delay in seconds between retries. "
        "0 means retries are attempted immediately with no pause."
    ),
)
# Must be strictly positive (gt=0); it is raised to the power of the retry
# index when the delay is computed.
claude_retry_backoff_factor: float = Field(
    DEFAULT_RETRY_BACKOFF_FACTOR,
    gt=0,
    description="Exponential backoff multiplier",
)
# The loop computes `min(base * factor ** n, claude_retry_max_delay)`, so a
# value of 0 does NOT lift the cap -- it forces every delay down to 0.
claude_retry_max_delay: float = Field(
    DEFAULT_RETRY_MAX_DELAY,
    ge=0,
    description=(
        "Maximum delay cap in seconds. "
        "0 clamps every retry delay to 0 (retries are attempted immediately)."
    ),
)
155+
124156
# Sandbox settings
125157
sandbox_enabled: bool = Field(
126158
True,

src/utils/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,5 +85,11 @@
8585
DEFAULT_CLAUDE_BINARY = "claude"
8686
DEFAULT_CLAUDE_OUTPUT_FORMAT = "stream-json"
8787

88+
# Retry defaults
89+
DEFAULT_RETRY_MAX_ATTEMPTS = 3  # upper bound on attempts made by the retry loop
90+
DEFAULT_RETRY_BASE_DELAY = 1.0  # seconds to wait before the first retry
91+
DEFAULT_RETRY_BACKOFF_FACTOR = 3.0  # delay multiplier applied per further retry
92+
DEFAULT_RETRY_MAX_DELAY = 30.0  # ceiling in seconds on the computed delay
93+
8894
# Logging
8995
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

tests/unit/test_claude/test_sdk_integration.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,90 @@ async def test_execute_command_no_resume_for_new_session(self, sdk_manager):
390390
not hasattr(captured_options[0], "resume") or not captured_options[0].resume
391391
)
392392

393+
async def test_retry_on_transient_cli_connection_error(self, sdk_manager):
    """A transient CLIConnectionError on the first attempt triggers a retry.

    The message stream fails once with a non-MCP connection error and then
    completes. The test only checks that the stream was requested twice;
    whatever happens after the second attempt is not asserted.
    """
    from claude_agent_sdk import CLIConnectionError

    receive_calls = 0

    async def fail_once_then_stream():
        nonlocal receive_calls
        receive_calls += 1
        if receive_calls == 1:
            raise CLIConnectionError("connection reset")
        # Later calls do not fail; the bare yield emits a single None item.
        yield

    # Allow exactly one retry (two attempts in total).
    sdk_manager.config.claude_retry_max_attempts = 2

    stream_holder = AsyncMock()
    stream_holder.receive_messages = fail_once_then_stream

    client = AsyncMock()
    client.connect = AsyncMock()
    client.disconnect = AsyncMock()
    client.query = AsyncMock()
    client._query = stream_holder

    sdk_patch = patch(
        "src.claude.sdk_integration.ClaudeSDKClient", return_value=client
    )
    # Backoff sleeps are mocked out so the test does not actually wait.
    sleep_patch = patch("asyncio.sleep", new_callable=AsyncMock)

    with sdk_patch, sleep_patch:
        try:
            await sdk_manager.execute_command(
                prompt="Test",
                working_directory=Path("/test"),
            )
        except Exception:
            # Response parsing may fail - what matters is that a retry happened.
            pass

    assert receive_calls == 2
429+
430+
async def test_no_retry_on_mcp_connection_error(self, sdk_manager):
    """An MCP-related CLIConnectionError must fail without being retried."""
    from claude_agent_sdk import CLIConnectionError

    from src.claude.exceptions import ClaudeMCPError

    mcp_failure = CLIConnectionError("mcp server failed")

    client = AsyncMock()
    client.connect = AsyncMock()
    client.disconnect = AsyncMock()
    client.query = AsyncMock(side_effect=mcp_failure)

    # NOTE(review): if ClaudeMCPError derives from Exception, this tuple
    # accepts any exception, so the raised type is not really checked.
    accepted_errors = (ClaudeMCPError, Exception)

    sdk_patch = patch(
        "src.claude.sdk_integration.ClaudeSDKClient", return_value=client
    )
    with sdk_patch, pytest.raises(accepted_errors):
        await sdk_manager.execute_command(
            prompt="Test",
            working_directory=Path("/test"),
        )

    # A single query call means no retry took place.
    assert client.query.call_count == 1
449+
450+
async def test_retry_disabled_when_max_attempts_zero(self, sdk_manager):
    """Test that setting max_attempts=0 effectively disables retries (1 attempt).

    The previous version only asserted ``max(1, 0) == 1`` and never called
    ``execute_command``. This version drives the real code path with an
    error that would otherwise be retryable (a non-MCP CLIConnectionError)
    and checks that the client was queried exactly once.
    """
    from claude_agent_sdk import CLIConnectionError

    sdk_manager.config.claude_retry_max_attempts = 0

    client = AsyncMock()
    client.connect = AsyncMock()
    client.disconnect = AsyncMock()
    client.query = AsyncMock(side_effect=CLIConnectionError("connection reset"))

    with patch("src.claude.sdk_integration.ClaudeSDKClient", return_value=client):
        # Sleep is mocked so a regression that does retry fails fast
        # instead of waiting out the real backoff delay.
        with patch("asyncio.sleep", new_callable=AsyncMock):
            # The exact exception type surfaced by execute_command is not
            # pinned here; the point of the test is the attempt count.
            with pytest.raises(Exception):
                await sdk_manager.execute_command(
                    prompt="Test",
                    working_directory=Path("/test"),
                )

    # Exactly one attempt - retries are disabled.
    assert client.query.call_count == 1
454+
455+
def test_is_retryable_error_transient(self, sdk_manager):
    """A non-MCP CLIConnectionError is classified as retryable."""
    from claude_agent_sdk import CLIConnectionError

    transient_error = CLIConnectionError("connection reset")
    verdict = sdk_manager._is_retryable_error(transient_error)

    assert verdict is True
463+
464+
def test_is_retryable_error_mcp(self, sdk_manager):
    """A CLIConnectionError that mentions MCP is classified as non-retryable."""
    from claude_agent_sdk import CLIConnectionError

    mcp_error = CLIConnectionError("mcp server failed")
    verdict = sdk_manager._is_retryable_error(mcp_error)

    assert verdict is False
472+
473+
def test_is_retryable_error_timeout(self, sdk_manager):
    """asyncio.TimeoutError is classified as non-retryable."""
    timeout_error = asyncio.TimeoutError()
    verdict = sdk_manager._is_retryable_error(timeout_error)

    assert verdict is False
476+
393477

394478
class TestClaudeSandboxSettings:
395479
"""Test sandbox and system_prompt settings on ClaudeAgentOptions."""

0 commit comments

Comments
 (0)