From c8601456c01e0d23de4a0bd4127d7b34d5410b5a Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Mon, 15 Jun 2026 09:18:04 -0500 Subject: [PATCH 1/3] Bound Claude call to a deadline under the API Gateway 30s limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users report intermittent 503s from the explain endpoint. The Lambda itself is healthy (Throttles=0, Errors=0) and the app only ever logs 200/307. The 503s come from API Gateway: its HTTP API integration timeout is a hard 30s ceiling, but the Lambda timeout was raised to 60s. Requests whose generation runs past 30s (218 over the last 4 days, up to ~40s) are severed by the gateway with an opaque 503 while the Lambda keeps running to completion — billing tokens we never deliver. Bound each Claude call to a wall-clock budget (default 27s, configurable via ANTHROPIC_TIMEOUT_SECONDS) that sits comfortably below the gateway ceiling, and set the SDK per-attempt timeout and retry budget to match. On timeout or a transient upstream failure, return a structured, retryable error response (status="error", ClaudeExplainTimeout metric) instead of letting the gateway emit a 503. Genuine client errors (e.g. a 400) still propagate as real failures. 🤖 Generated by LLM (Claude, via OpenClaw) Co-Authored-By: Claude Opus 4.8 (1M context) --- app/config.py | 9 +++++++ app/explain.py | 62 ++++++++++++++++++++++++++++++++++++++++++--- app/main.py | 7 ++++- app/test_explain.py | 22 ++++++++++++++++ 4 files changed, 96 insertions(+), 4 deletions(-) diff --git a/app/config.py b/app/config.py index 9509310..2359bbd 100644 --- a/app/config.py +++ b/app/config.py @@ -17,6 +17,15 @@ class Settings(BaseSettings): cache_ttl: str = "2d" # HTTP Cache-Control max-age (e.g., "2d", "48h", "172800s") cache_ttl_seconds: int = 172800 # Computed from cache_ttl for Cache-Control header log_level: str = "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR) + # Wall-clock budget for a single Claude API call. Kept comfortably below the + # API Gateway HTTP API integration timeout (a hard 30s ceiling that cannot be + # raised, unlike the Lambda timeout) so we return a clean, handled error + # instead of the gateway severing the connection with an opaque 503. + anthropic_timeout_seconds: float = 27.0 + # SDK auto-retries. Retries share the wall-clock budget above, so a fast + # transient failure (e.g. an overloaded 529) can still be retried while the + # total time can never exceed anthropic_timeout_seconds. + anthropic_max_retries: int = 2 model_config = SettingsConfigDict(env_file=".env") @field_validator("cache_ttl_seconds", mode="before") diff --git a/app/explain.py b/app/explain.py index bcead24..a8e715e 100644 --- a/app/explain.py +++ b/app/explain.py @@ -1,6 +1,7 @@ +import asyncio import logging -from anthropic import AsyncAnthropic +from anthropic import APIConnectionError, APIStatusError, APITimeoutError, AsyncAnthropic from app.cache import CacheProvider, cache_response, get_cached_response from app.explain_api import CostBreakdown, ExplainRequest, ExplainResponse, TokenUsage @@ -16,6 +17,11 @@ MAX_CODE_LENGTH = 10000 # 10K chars should be enough for most source files MAX_ASM_LENGTH = 20000 # 20K chars for assembly output +# Default wall-clock budget for the Claude call. Overridden per request from +# settings.anthropic_timeout_seconds; the default keeps direct callers (tests, +# local server) bounded below the API Gateway 30s integration ceiling too. +DEFAULT_ANTHROPIC_DEADLINE_SECONDS = 27.0 + async def process_request( body: ExplainRequest, @@ -23,6 +29,7 @@ async def process_request( prompt: Prompt, metrics_provider: MetricsProvider, cache_provider: CacheProvider | None = None, + deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS, ) -> ExplainResponse: """Process a request and return the response. @@ -35,6 +42,7 @@ async def process_request( prompt: Prompt instance for generating messages metrics_provider: metrics provider for tracking stats cache_provider: cache provider for storing/retrieving responses + deadline_seconds: wall-clock budget for the Claude call before giving up Returns: An ExplainResponse Pydantic model @@ -57,7 +65,7 @@ async def process_request( return cached_response # Cache miss or no cache - proceed with Anthropic API call - response = await _call_anthropic_api(body, client, prompt, metrics_provider) + response = await _call_anthropic_api(body, client, prompt, metrics_provider, deadline_seconds) # Cache the response (if cache provider is available). Don't cache # error responses — they consume real tokens but produce no useful @@ -69,11 +77,44 @@ async def process_request( return response +def _transient_error_response( + body: ExplainRequest, + model: str, + metrics_provider: MetricsProvider, + error: Exception, +) -> ExplainResponse: + """Build a structured error response for a timed-out or transiently failed call. + + Returned (rather than raised) so the client gets a clear, retryable message + well within the API Gateway 30s window instead of an opaque 503. No token + usage is available because the call did not complete. + """ + message_text = ( + "Claude Explain could not generate an explanation in time. " + "This usually means the model was busy or the input was very large; " + "please try again in a moment." + ) + LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error) + metrics_provider.set_property("language", body.language) + metrics_provider.set_property("compiler", body.compiler) + metrics_provider.set_property("instructionSet", body.instructionSet or "unknown") + metrics_provider.set_property("cached", "false") + metrics_provider.put_metric("ClaudeExplainRequest", 1) + metrics_provider.put_metric("ClaudeExplainTimeout", 1) + return ExplainResponse( + status="error", + message=message_text, + model=model, + usage=TokenUsage(inputTokens=0, outputTokens=0, totalTokens=0), + ) + + async def _call_anthropic_api( body: ExplainRequest, client: AsyncAnthropic, prompt: Prompt, metrics_provider: MetricsProvider, + deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS, ) -> ExplainResponse: """Make the actual call to Anthropic API and create response. @@ -98,7 +139,22 @@ async def _call_anthropic_api( prompt_data["model"], bool(prompt_data.get("thinking")), ) - message = await client.messages.create(**prompt_data) + # Bound the call to a wall-clock budget below the API Gateway HTTP API + # integration timeout (a hard 30s ceiling). Without this, a slow generation + # runs to completion inside the Lambda — billing tokens we never deliver — + # while the gateway has already returned an opaque 503 to the user. Failing + # within the budget lets us surface a clear, retryable message instead. + try: + async with asyncio.timeout(deadline_seconds): + message = await client.messages.create(**prompt_data) + except (TimeoutError, APITimeoutError, APIConnectionError) as e: + return _transient_error_response(body, prompt_data["model"], metrics_provider, e) + except APIStatusError as e: + # Surface only transient upstream failures gracefully; let genuine + # client errors (e.g. a malformed 400) propagate as a real failure. + if e.status_code in (408, 409, 429, 500, 502, 503, 504, 529): + return _transient_error_response(body, prompt_data["model"], metrics_provider, e) + raise # Extract usage information input_tokens = message.usage.input_tokens diff --git a/app/main.py b/app/main.py index 957c066..adbc1f7 100644 --- a/app/main.py +++ b/app/main.py @@ -42,7 +42,11 @@ async def lifespan(app: FastAPI): # Store shared resources in app.state app.state.settings = settings - app.state.anthropic_client = AsyncAnthropic(api_key=settings.anthropic_api_key) + app.state.anthropic_client = AsyncAnthropic( + api_key=settings.anthropic_api_key, + timeout=settings.anthropic_timeout_seconds, + max_retries=settings.anthropic_max_retries, + ) # Load the prompt configuration prompt_config_path = Path(__file__).parent / "prompt.yaml" @@ -133,4 +137,5 @@ async def explain(explain_request: ExplainRequest, request: Request) -> ExplainR request.app.state.prompt, metrics_provider, cache_provider, + deadline_seconds=request.app.state.settings.anthropic_timeout_seconds, ) diff --git a/app/test_explain.py b/app/test_explain.py index b0d6150..2723f17 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -222,6 +222,28 @@ async def test_returns_error_when_no_text_block(self, sample_request, noop_metri assert response.usage.inputTokens == 100 assert response.usage.outputTokens == 50 + @pytest.mark.asyncio + async def test_returns_error_when_call_exceeds_deadline(self, sample_request, noop_metrics): + """A Claude call that overruns the wall-clock budget must return a + structured error well inside the API Gateway 30s window, not hang until + the gateway severs the connection with an opaque 503.""" + import asyncio + + async def slow_create(**_kwargs): + await asyncio.sleep(1.0) + + mock_client = MagicMock() + mock_client.messages.create = AsyncMock(side_effect=slow_create) + + test_prompt = Prompt(Path("app/prompt.yaml")) + response = await process_request(sample_request, mock_client, test_prompt, noop_metrics, deadline_seconds=0.01) + + assert response.status == "error" + assert response.explanation is None + assert "in time" in response.message + assert response.usage is not None + assert response.usage.totalTokens == 0 + class TestPromptValidation: """Validation rules enforced at Prompt construction.""" From 3bb478d63548eb8a024cbde5684d9509abf4cb2b Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Mon, 15 Jun 2026 11:35:05 -0500 Subject: [PATCH 2/3] Tailor transient error message by exception type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review noted the original single message ('could not generate an explanation in time') was inaccurate for non-timeout failures like connection errors or transient 5xx responses. Now returns a timeout-specific message for TimeoutError/APITimeoutError and a generic 'temporarily unavailable' message for everything else. 🤖 Generated by LLM (Claude, via OpenClaw) --- app/explain.py | 12 +++++++----- app/test_explain.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/app/explain.py b/app/explain.py index a8e715e..c7a326d 100644 --- a/app/explain.py +++ b/app/explain.py @@ -89,11 +89,13 @@ def _transient_error_response( well within the API Gateway 30s window instead of an opaque 503. No token usage is available because the call did not complete. """ - message_text = ( - "Claude Explain could not generate an explanation in time. " - "This usually means the model was busy or the input was very large; " - "please try again in a moment." - ) + if isinstance(error, (TimeoutError, APITimeoutError)): + message_text = ( + "Claude Explain took too long to respond — the input may be very large " + "or the model is under heavy load. Please try again in a moment." + ) + else: + message_text = "Claude Explain is temporarily unavailable. Please try again in a moment." LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error) metrics_provider.set_property("language", body.language) metrics_provider.set_property("compiler", body.compiler) diff --git a/app/test_explain.py b/app/test_explain.py index 2723f17..e189ee9 100644 --- a/app/test_explain.py +++ b/app/test_explain.py @@ -240,7 +240,7 @@ async def slow_create(**_kwargs): assert response.status == "error" assert response.explanation is None - assert "in time" in response.message + assert "too long" in response.message assert response.usage is not None assert response.usage.totalTokens == 0 From 157e78ef5a1cd8aef5da360a3efc3b414bc46130 Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Mon, 15 Jun 2026 11:40:29 -0500 Subject: [PATCH 3/3] Emit ClaudeExplainTimeout only for actual timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot review noted ClaudeExplainTimeout was being emitted for all transient failures (connection errors, 5xx, etc.), making it impossible to distinguish real timeout rate from upstream error rate in dashboards. Now emits ClaudeExplainTimeout only for TimeoutError/APITimeoutError, and ClaudeExplainTransientError for everything else. 🤖 Generated by LLM (Claude, via OpenClaw) --- app/explain.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/explain.py b/app/explain.py index c7a326d..ab3b0fc 100644 --- a/app/explain.py +++ b/app/explain.py @@ -102,7 +102,10 @@ def _transient_error_response( metrics_provider.set_property("instructionSet", body.instructionSet or "unknown") metrics_provider.set_property("cached", "false") metrics_provider.put_metric("ClaudeExplainRequest", 1) - metrics_provider.put_metric("ClaudeExplainTimeout", 1) + if isinstance(error, (TimeoutError, APITimeoutError)): + metrics_provider.put_metric("ClaudeExplainTimeout", 1) + else: + metrics_provider.put_metric("ClaudeExplainTransientError", 1) return ExplainResponse( status="error", message=message_text,