From c8601456c01e0d23de4a0bd4127d7b34d5410b5a Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Mon, 15 Jun 2026 09:18:04 -0500
Subject: [PATCH 1/3] Bound Claude call to a deadline under the API Gateway 30s
 limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Users report intermittent 503s from the explain endpoint. The Lambda
itself is healthy (Throttles=0, Errors=0) and the app only ever logs
200/307. The 503s come from API Gateway: its HTTP API integration
timeout is a hard 30s ceiling, but the Lambda timeout was raised to 60s.
Requests whose generation runs past 30s (218 over the last 4 days, up to
~40s) are severed by the gateway with an opaque 503 while the Lambda
keeps running to completion — billing tokens we never deliver.

Bound each Claude call to a wall-clock budget (default 27s, configurable
via ANTHROPIC_TIMEOUT_SECONDS) that sits comfortably below the gateway
ceiling. The SDK per-attempt timeout is set to the same value, and SDK
retries (default 2) run inside the same asyncio.timeout, so retrying can
never push the total time past the budget. On timeout or a transient
upstream failure (connection error, or HTTP 408/409/429/500/502/503/504/
529), return a structured, retryable error response (status="error",
ClaudeExplainTimeout metric) instead of letting the gateway emit a 503.
Genuine client errors (e.g. a 400) still propagate as real failures.

🤖 Generated by LLM (Claude, via OpenClaw)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/config.py       |  9 +++++++
 app/explain.py      | 62 ++++++++++++++++++++++++++++++++++++++++++---
 app/main.py         |  7 ++++-
 app/test_explain.py | 22 ++++++++++++++++
 4 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/app/config.py b/app/config.py
index 9509310..2359bbd 100644
--- a/app/config.py
+++ b/app/config.py
@@ -17,6 +17,15 @@ class Settings(BaseSettings):
     cache_ttl: str = "2d"  # HTTP Cache-Control max-age (e.g., "2d", "48h", "172800s")
     cache_ttl_seconds: int = 172800  # Computed from cache_ttl for Cache-Control header
     log_level: str = "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR)
+    # Wall-clock budget for a single Claude API call. Kept comfortably below the
+    # API Gateway HTTP API integration timeout (a hard 30s ceiling that cannot be
+    # raised, unlike the Lambda timeout) so we return a clean, handled error
+    # instead of the gateway severing the connection with an opaque 503.
+    anthropic_timeout_seconds: float = 27.0
+    # SDK auto-retries. Retries share the wall-clock budget above, so a fast
+    # transient failure (e.g. an overloaded 529) can still be retried while the
+    # total time can never exceed anthropic_timeout_seconds.
+    anthropic_max_retries: int = 2
     model_config = SettingsConfigDict(env_file=".env")
 
     @field_validator("cache_ttl_seconds", mode="before")
diff --git a/app/explain.py b/app/explain.py
index bcead24..a8e715e 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,6 +1,7 @@
+import asyncio
 import logging
 
-from anthropic import AsyncAnthropic
+from anthropic import APIConnectionError, APIStatusError, APITimeoutError, AsyncAnthropic
 
 from app.cache import CacheProvider, cache_response, get_cached_response
 from app.explain_api import CostBreakdown, ExplainRequest, ExplainResponse, TokenUsage
@@ -16,6 +17,11 @@
 MAX_CODE_LENGTH = 10000  # 10K chars should be enough for most source files
 MAX_ASM_LENGTH = 20000  # 20K chars for assembly output
 
+# Default wall-clock budget for the Claude call. Overridden per request from
+# settings.anthropic_timeout_seconds; the default keeps direct callers (tests,
+# local server) bounded below the API Gateway 30s integration ceiling too.
+DEFAULT_ANTHROPIC_DEADLINE_SECONDS = 27.0
+
 
 async def process_request(
     body: ExplainRequest,
@@ -23,6 +29,7 @@ async def process_request(
     prompt: Prompt,
     metrics_provider: MetricsProvider,
     cache_provider: CacheProvider | None = None,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Process a request and return the response.
 
@@ -35,6 +42,7 @@ async def process_request(
         prompt: Prompt instance for generating messages
         metrics_provider: metrics provider for tracking stats
         cache_provider: cache provider for storing/retrieving responses
+        deadline_seconds: wall-clock budget for the Claude call before giving up
 
     Returns:
         An ExplainResponse Pydantic model
@@ -57,7 +65,7 @@ async def process_request(
             return cached_response
 
     # Cache miss or no cache - proceed with Anthropic API call
-    response = await _call_anthropic_api(body, client, prompt, metrics_provider)
+    response = await _call_anthropic_api(body, client, prompt, metrics_provider, deadline_seconds)
 
     # Cache the response (if cache provider is available). Don't cache
     # error responses — they consume real tokens but produce no useful
@@ -69,11 +77,44 @@ async def process_request(
     return response
 
 
+def _transient_error_response(
+    body: ExplainRequest,
+    model: str,
+    metrics_provider: MetricsProvider,
+    error: Exception,
+) -> ExplainResponse:
+    """Build a structured error response for a timed-out or transiently failed call.
+
+    Returned (rather than raised) so the client gets a clear, retryable message
+    well within the API Gateway 30s window instead of an opaque 503. No token
+    usage is available because the call did not complete.
+    """
+    message_text = (
+        "Claude Explain could not generate an explanation in time. "
+        "This usually means the model was busy or the input was very large; "
+        "please try again in a moment."
+    )
+    LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error)
+    metrics_provider.set_property("language", body.language)
+    metrics_provider.set_property("compiler", body.compiler)
+    metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
+    metrics_provider.set_property("cached", "false")
+    metrics_provider.put_metric("ClaudeExplainRequest", 1)
+    metrics_provider.put_metric("ClaudeExplainTimeout", 1)
+    return ExplainResponse(
+        status="error",
+        message=message_text,
+        model=model,
+        usage=TokenUsage(inputTokens=0, outputTokens=0, totalTokens=0),
+    )
+
+
 async def _call_anthropic_api(
     body: ExplainRequest,
     client: AsyncAnthropic,
     prompt: Prompt,
     metrics_provider: MetricsProvider,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Make the actual call to Anthropic API and create response.
 
@@ -98,7 +139,22 @@ async def _call_anthropic_api(
         prompt_data["model"],
         bool(prompt_data.get("thinking")),
     )
-    message = await client.messages.create(**prompt_data)
+    # Bound the call to a wall-clock budget below the API Gateway HTTP API
+    # integration timeout (a hard 30s ceiling). Without this, a slow generation
+    # runs to completion inside the Lambda — billing tokens we never deliver —
+    # while the gateway has already returned an opaque 503 to the user. Failing
+    # within the budget lets us surface a clear, retryable message instead.
+    try:
+        async with asyncio.timeout(deadline_seconds):
+            message = await client.messages.create(**prompt_data)
+    except (TimeoutError, APITimeoutError, APIConnectionError) as e:
+        return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+    except APIStatusError as e:
+        # Surface only transient upstream failures gracefully; let genuine
+        # client errors (e.g. a malformed 400) propagate as a real failure.
+        if e.status_code in (408, 409, 429, 500, 502, 503, 504, 529):
+            return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+        raise
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/main.py b/app/main.py
index 957c066..adbc1f7 100644
--- a/app/main.py
+++ b/app/main.py
@@ -42,7 +42,11 @@ async def lifespan(app: FastAPI):
 
     # Store shared resources in app.state
     app.state.settings = settings
-    app.state.anthropic_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
+    app.state.anthropic_client = AsyncAnthropic(
+        api_key=settings.anthropic_api_key,
+        timeout=settings.anthropic_timeout_seconds,
+        max_retries=settings.anthropic_max_retries,
+    )
 
     # Load the prompt configuration
     prompt_config_path = Path(__file__).parent / "prompt.yaml"
@@ -133,4 +137,5 @@ async def explain(explain_request: ExplainRequest, request: Request) -> ExplainR
             request.app.state.prompt,
             metrics_provider,
             cache_provider,
+            deadline_seconds=request.app.state.settings.anthropic_timeout_seconds,
         )
diff --git a/app/test_explain.py b/app/test_explain.py
index b0d6150..2723f17 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -222,6 +222,28 @@ async def test_returns_error_when_no_text_block(self, sample_request, noop_metri
         assert response.usage.inputTokens == 100
         assert response.usage.outputTokens == 50
 
+    @pytest.mark.asyncio
+    async def test_returns_error_when_call_exceeds_deadline(self, sample_request, noop_metrics):
+        """A Claude call that overruns the wall-clock budget must return a
+        structured error well inside the API Gateway 30s window, not hang until
+        the gateway severs the connection with an opaque 503."""
+        import asyncio
+
+        async def slow_create(**_kwargs):
+            await asyncio.sleep(1.0)
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(side_effect=slow_create)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics, deadline_seconds=0.01)
+
+        assert response.status == "error"
+        assert response.explanation is None
+        assert "in time" in response.message
+        assert response.usage is not None
+        assert response.usage.totalTokens == 0
+
 
 class TestPromptValidation:
     """Validation rules enforced at Prompt construction."""

From 3bb478d63548eb8a024cbde5684d9509abf4cb2b Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Mon, 15 Jun 2026 11:35:05 -0500
Subject: [PATCH 2/3] Tailor transient error message by exception type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot review noted the original single message ('could not generate an
explanation in time') was inaccurate for non-timeout failures like connection
errors or transient 5xx responses. _transient_error_response now returns a
timeout-specific message for TimeoutError/APITimeoutError and a generic
'temporarily unavailable' message for everything else.

🤖 Generated by LLM (Claude, via OpenClaw)
---
 app/explain.py      | 12 +++++++-----
 app/test_explain.py |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/app/explain.py b/app/explain.py
index a8e715e..c7a326d 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -89,11 +89,13 @@ def _transient_error_response(
     well within the API Gateway 30s window instead of an opaque 503. No token
     usage is available because the call did not complete.
     """
-    message_text = (
-        "Claude Explain could not generate an explanation in time. "
-        "This usually means the model was busy or the input was very large; "
-        "please try again in a moment."
-    )
+    if isinstance(error, (TimeoutError, APITimeoutError)):
+        message_text = (
+            "Claude Explain took too long to respond — the input may be very large "
+            "or the model is under heavy load. Please try again in a moment."
+        )
+    else:
+        message_text = "Claude Explain is temporarily unavailable. Please try again in a moment."
     LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error)
     metrics_provider.set_property("language", body.language)
     metrics_provider.set_property("compiler", body.compiler)
diff --git a/app/test_explain.py b/app/test_explain.py
index 2723f17..e189ee9 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -240,7 +240,7 @@ async def slow_create(**_kwargs):
 
         assert response.status == "error"
         assert response.explanation is None
-        assert "in time" in response.message
+        assert "too long" in response.message
         assert response.usage is not None
         assert response.usage.totalTokens == 0
 

From 157e78ef5a1cd8aef5da360a3efc3b414bc46130 Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Mon, 15 Jun 2026 11:40:29 -0500
Subject: [PATCH 3/3] Emit ClaudeExplainTimeout only for actual timeouts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot review noted ClaudeExplainTimeout was being emitted for all
transient failures (connection errors, 5xx, etc.), making it impossible
to distinguish the real timeout rate from the upstream error rate in
dashboards.

_transient_error_response now emits ClaudeExplainTimeout only for
TimeoutError/APITimeoutError, and ClaudeExplainTransientError for
everything else.

🤖 Generated by LLM (Claude, via OpenClaw)
---
 app/explain.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/app/explain.py b/app/explain.py
index c7a326d..ab3b0fc 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -102,7 +102,10 @@ def _transient_error_response(
     metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
     metrics_provider.set_property("cached", "false")
     metrics_provider.put_metric("ClaudeExplainRequest", 1)
-    metrics_provider.put_metric("ClaudeExplainTimeout", 1)
+    if isinstance(error, (TimeoutError, APITimeoutError)):
+        metrics_provider.put_metric("ClaudeExplainTimeout", 1)
+    else:
+        metrics_provider.put_metric("ClaudeExplainTransientError", 1)
     return ExplainResponse(
         status="error",
         message=message_text,