diff --git a/app/config.py b/app/config.py
index 9509310..2359bbd 100644
--- a/app/config.py
+++ b/app/config.py
@@ -17,6 +17,15 @@ class Settings(BaseSettings):
     cache_ttl: str = "2d"  # HTTP Cache-Control max-age (e.g., "2d", "48h", "172800s")
     cache_ttl_seconds: int = 172800  # Computed from cache_ttl for Cache-Control header
     log_level: str = "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR)
+    # Wall-clock budget for a single Claude API call. Kept comfortably below the
+    # API Gateway HTTP API integration timeout (a hard 30s ceiling that cannot be
+    # raised, unlike the Lambda timeout) so we return a clean, handled error
+    # instead of the gateway severing the connection with an opaque 503.
+    anthropic_timeout_seconds: float = 27.0
+    # SDK auto-retries. A fast transient failure (e.g. an overloaded 529) can
+    # still be retried, but the asyncio.timeout deadline in app/explain.py wraps
+    # every attempt, so total time never exceeds anthropic_timeout_seconds.
+    anthropic_max_retries: int = 2
     model_config = SettingsConfigDict(env_file=".env")
 
     @field_validator("cache_ttl_seconds", mode="before")
diff --git a/app/explain.py b/app/explain.py
index bcead24..ab3b0fc 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,6 +1,7 @@
+import asyncio
 import logging
 
-from anthropic import AsyncAnthropic
+from anthropic import APIConnectionError, APIStatusError, APITimeoutError, AsyncAnthropic
 
 from app.cache import CacheProvider, cache_response, get_cached_response
 from app.explain_api import CostBreakdown, ExplainRequest, ExplainResponse, TokenUsage
@@ -16,6 +17,11 @@
 MAX_CODE_LENGTH = 10000  # 10K chars should be enough for most source files
 MAX_ASM_LENGTH = 20000  # 20K chars for assembly output
 
+# Default wall-clock budget for the Claude call. Overridden per request from
+# settings.anthropic_timeout_seconds; the default keeps direct callers (tests,
+# local server) bounded below the API Gateway 30s integration ceiling too.
+DEFAULT_ANTHROPIC_DEADLINE_SECONDS = 27.0
+
 
 async def process_request(
     body: ExplainRequest,
@@ -23,6 +29,7 @@ async def process_request(
     prompt: Prompt,
     metrics_provider: MetricsProvider,
     cache_provider: CacheProvider | None = None,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Process a request and return the response.
 
@@ -35,6 +42,7 @@ async def process_request(
         prompt: Prompt instance for generating messages
         metrics_provider: metrics provider for tracking stats
         cache_provider: cache provider for storing/retrieving responses
+        deadline_seconds: wall-clock budget for the Claude call before giving up
 
     Returns:
         An ExplainResponse Pydantic model
@@ -57,7 +65,7 @@ async def process_request(
             return cached_response
 
     # Cache miss or no cache - proceed with Anthropic API call
-    response = await _call_anthropic_api(body, client, prompt, metrics_provider)
+    response = await _call_anthropic_api(body, client, prompt, metrics_provider, deadline_seconds)
 
     # Cache the response (if cache provider is available). Don't cache
     # error responses — they consume real tokens but produce no useful
@@ -69,11 +77,49 @@ async def process_request(
     return response
 
 
+def _transient_error_response(
+    body: ExplainRequest,
+    model: str,
+    metrics_provider: MetricsProvider,
+    error: Exception,
+) -> ExplainResponse:
+    """Convert a timed-out or transiently failed Claude call into an error response.
+
+    Returned rather than raised, so the client gets a clear, retryable message
+    inside the API Gateway 30s window rather than an opaque 503. Token usage is
+    reported as zero because the call never completed.
+    """
+    # Classify once: the user-facing text and the failure metric both depend on it.
+    if isinstance(error, (TimeoutError, APITimeoutError)):
+        failure_metric = "ClaudeExplainTimeout"
+        user_message = (
+            "Claude Explain took too long to respond — the input may be very large "
+            "or the model is under heavy load. Please try again in a moment."
+        )
+    else:
+        failure_metric = "ClaudeExplainTransientError"
+        user_message = "Claude Explain is temporarily unavailable. Please try again in a moment."
+    LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error)
+    metrics_provider.set_property("language", body.language)
+    metrics_provider.set_property("compiler", body.compiler)
+    metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
+    metrics_provider.set_property("cached", "false")
+    metrics_provider.put_metric("ClaudeExplainRequest", 1)
+    metrics_provider.put_metric(failure_metric, 1)
+    return ExplainResponse(
+        status="error",
+        message=user_message,
+        model=model,
+        usage=TokenUsage(inputTokens=0, outputTokens=0, totalTokens=0),
+    )
+
+
 async def _call_anthropic_api(
     body: ExplainRequest,
     client: AsyncAnthropic,
     prompt: Prompt,
     metrics_provider: MetricsProvider,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Make the actual call to Anthropic API and create response.
 
@@ -98,7 +144,22 @@ async def _call_anthropic_api(
         prompt_data["model"],
         bool(prompt_data.get("thinking")),
     )
-    message = await client.messages.create(**prompt_data)
+    # Bound the call to a wall-clock budget below the API Gateway HTTP API
+    # integration timeout (a hard 30s ceiling). Without this, a slow generation
+    # runs to completion inside the Lambda — billing tokens we never deliver —
+    # while the gateway has already returned an opaque 503 to the user. Failing
+    # within the budget lets us surface a clear, retryable message instead.
+    try:
+        async with asyncio.timeout(deadline_seconds):
+            message = await client.messages.create(**prompt_data)
+    except (TimeoutError, APITimeoutError, APIConnectionError) as e:
+        return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+    except APIStatusError as e:
+        # Surface only transient upstream failures gracefully; let genuine
+        # client errors (e.g. a malformed 400) propagate as a real failure.
+        if e.status_code in (408, 409, 429, 500, 502, 503, 504, 529):
+            return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+        raise
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/main.py b/app/main.py
index 957c066..adbc1f7 100644
--- a/app/main.py
+++ b/app/main.py
@@ -42,7 +42,11 @@ async def lifespan(app: FastAPI):
 
     # Store shared resources in app.state
     app.state.settings = settings
-    app.state.anthropic_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
+    app.state.anthropic_client = AsyncAnthropic(
+        api_key=settings.anthropic_api_key,
+        timeout=settings.anthropic_timeout_seconds,
+        max_retries=settings.anthropic_max_retries,
+    )
 
     # Load the prompt configuration
     prompt_config_path = Path(__file__).parent / "prompt.yaml"
@@ -133,4 +137,5 @@ async def explain(explain_request: ExplainRequest, request: Request) -> ExplainR
             request.app.state.prompt,
             metrics_provider,
             cache_provider,
+            deadline_seconds=request.app.state.settings.anthropic_timeout_seconds,
         )
diff --git a/app/test_explain.py b/app/test_explain.py
index b0d6150..e189ee9 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -222,6 +222,28 @@ async def test_returns_error_when_no_text_block(self, sample_request, noop_metri
         assert response.usage.inputTokens == 100
         assert response.usage.outputTokens == 50
 
+    @pytest.mark.asyncio
+    async def test_returns_error_when_call_exceeds_deadline(self, sample_request, noop_metrics):
+        """An overrunning Claude call must come back as a structured error.
+
+        The stub sleeps far past the 0.01s budget, so the deadline fires first."""
+        import asyncio
+
+        async def stalled_create(**_ignored):
+            await asyncio.sleep(1.0)
+
+        stub_client = MagicMock()
+        stub_client.messages.create = AsyncMock(side_effect=stalled_create)
+        prompt = Prompt(Path("app/prompt.yaml"))
+
+        result = await process_request(sample_request, stub_client, prompt, noop_metrics, deadline_seconds=0.01)
+
+        assert result.status == "error"
+        assert result.explanation is None
+        assert "too long" in result.message
+        assert result.usage is not None
+        assert result.usage.totalTokens == 0
+
 
 class TestPromptValidation:
     """Validation rules enforced at Prompt construction."""