diff --git a/app/config.py b/app/config.py
index 9509310..2359bbd 100644
--- a/app/config.py
+++ b/app/config.py
@@ -17,6 +17,15 @@ class Settings(BaseSettings):
     cache_ttl: str = "2d"  # HTTP Cache-Control max-age (e.g., "2d", "48h", "172800s")
     cache_ttl_seconds: int = 172800  # Computed from cache_ttl for Cache-Control header
     log_level: str = "INFO"  # Logging level (DEBUG, INFO, WARNING, ERROR)
+    # Wall-clock budget for a single Claude API call. Kept comfortably below the
+    # API Gateway HTTP API integration timeout (a hard 30s ceiling that cannot be
+    # raised, unlike the Lambda timeout) so we return a clean, handled error
+    # instead of the gateway severing the connection with an opaque 503.
+    anthropic_timeout_seconds: float = 27.0
+    # SDK auto-retries. Retries share the wall-clock budget above, so a fast
+    # transient failure (e.g. an overloaded 529) can still be retried while the
+    # total time can never exceed anthropic_timeout_seconds.
+    anthropic_max_retries: int = 2
     model_config = SettingsConfigDict(env_file=".env")
 
     @field_validator("cache_ttl_seconds", mode="before")
diff --git a/app/explain.py b/app/explain.py
index bcead24..ab3b0fc 100644
--- a/app/explain.py
+++ b/app/explain.py
@@ -1,6 +1,7 @@
+import asyncio
 import logging
 
-from anthropic import AsyncAnthropic
+from anthropic import APIConnectionError, APIStatusError, APITimeoutError, AsyncAnthropic
 
 from app.cache import CacheProvider, cache_response, get_cached_response
 from app.explain_api import CostBreakdown, ExplainRequest, ExplainResponse, TokenUsage
@@ -16,6 +17,11 @@
 MAX_CODE_LENGTH = 10000  # 10K chars should be enough for most source files
 MAX_ASM_LENGTH = 20000  # 20K chars for assembly output
 
+# Default wall-clock budget for the Claude call. Overridden per request from
+# settings.anthropic_timeout_seconds; the default keeps direct callers (tests,
+# local server) bounded below the API Gateway 30s integration ceiling too.
+DEFAULT_ANTHROPIC_DEADLINE_SECONDS = 27.0
+
 
 async def process_request(
     body: ExplainRequest,
@@ -23,6 +29,7 @@ async def process_request(
     prompt: Prompt,
     metrics_provider: MetricsProvider,
     cache_provider: CacheProvider | None = None,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Process a request and return the response.
 
@@ -35,6 +42,7 @@ async def process_request(
         prompt: Prompt instance for generating messages
         metrics_provider: metrics provider for tracking stats
         cache_provider: cache provider for storing/retrieving responses
+        deadline_seconds: wall-clock budget for the Claude call before giving up
 
     Returns:
         An ExplainResponse Pydantic model
@@ -57,7 +65,7 @@ async def process_request(
             return cached_response
 
     # Cache miss or no cache - proceed with Anthropic API call
-    response = await _call_anthropic_api(body, client, prompt, metrics_provider)
+    response = await _call_anthropic_api(body, client, prompt, metrics_provider, deadline_seconds)
 
     # Cache the response (if cache provider is available). Don't cache
     # error responses — they consume real tokens but produce no useful
@@ -69,11 +77,61 @@ async def process_request(
     return response
 
 
+def _transient_error_response(
+    body: ExplainRequest,
+    model: str,
+    metrics_provider: MetricsProvider,
+    error: Exception,
+) -> ExplainResponse:
+    """Build a structured error response for a timed-out or transiently failed call.
+
+    Returned (rather than raised) so the client gets a clear, retryable message
+    well within the API Gateway 30s window instead of an opaque 503. No token
+    usage is available because the call did not complete.
+
+    Args:
+        body: the request being served; its language, compiler and
+            instruction set are recorded as metric properties
+        model: model name to echo back in the response
+        metrics_provider: metrics provider for tracking stats
+        error: the exception that ended the call; a TimeoutError or
+            APITimeoutError is reported as a timeout, anything else as a
+            transient failure
+
+    Returns:
+        An ExplainResponse with status "error" and zeroed token usage
+    """
+    # Classify once so the user-facing message and the metric cannot disagree.
+    if isinstance(error, (TimeoutError, APITimeoutError)):
+        failure_metric = "ClaudeExplainTimeout"
+        message_text = (
+            "Claude Explain took too long to respond — the input may be very large "
+            "or the model is under heavy load. Please try again in a moment."
+        )
+    else:
+        failure_metric = "ClaudeExplainTransientError"
+        message_text = "Claude Explain is temporarily unavailable. Please try again in a moment."
+    LOGGER.warning("Anthropic call failed (%s): %s", type(error).__name__, error)
+    metrics_provider.set_property("language", body.language)
+    metrics_provider.set_property("compiler", body.compiler)
+    metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
+    metrics_provider.set_property("cached", "false")
+    metrics_provider.put_metric("ClaudeExplainRequest", 1)
+    metrics_provider.put_metric(failure_metric, 1)
+    return ExplainResponse(
+        status="error",
+        message=message_text,
+        model=model,
+        usage=TokenUsage(inputTokens=0, outputTokens=0, totalTokens=0),
+    )
+
+
 async def _call_anthropic_api(
     body: ExplainRequest,
     client: AsyncAnthropic,
     prompt: Prompt,
     metrics_provider: MetricsProvider,
+    deadline_seconds: float = DEFAULT_ANTHROPIC_DEADLINE_SECONDS,
 ) -> ExplainResponse:
     """Make the actual call to Anthropic API and create response.
 
@@ -98,7 +156,22 @@ async def _call_anthropic_api(
         prompt_data["model"],
         bool(prompt_data.get("thinking")),
     )
-    message = await client.messages.create(**prompt_data)
+    # Bound the call to a wall-clock budget below the API Gateway HTTP API
+    # integration timeout (a hard 30s ceiling). Without this, a slow generation
+    # runs to completion inside the Lambda — billing tokens we never deliver —
+    # while the gateway has already returned an opaque 503 to the user. Failing
+    # within the budget lets us surface a clear, retryable message instead.
+    try:
+        async with asyncio.timeout(deadline_seconds):
+            message = await client.messages.create(**prompt_data)
+    except (TimeoutError, APITimeoutError, APIConnectionError) as e:
+        return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+    except APIStatusError as e:
+        # Surface only transient upstream failures gracefully; let genuine
+        # client errors (e.g. a malformed 400) propagate as a real failure.
+        if e.status_code in (408, 409, 429, 500, 502, 503, 504, 529):
+            return _transient_error_response(body, prompt_data["model"], metrics_provider, e)
+        raise
 
     # Extract usage information
     input_tokens = message.usage.input_tokens
diff --git a/app/main.py b/app/main.py
index 957c066..adbc1f7 100644
--- a/app/main.py
+++ b/app/main.py
@@ -42,7 +42,11 @@ async def lifespan(app: FastAPI):
 
     # Store shared resources in app.state
     app.state.settings = settings
-    app.state.anthropic_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
+    app.state.anthropic_client = AsyncAnthropic(
+        api_key=settings.anthropic_api_key,
+        timeout=settings.anthropic_timeout_seconds,
+        max_retries=settings.anthropic_max_retries,
+    )
 
     # Load the prompt configuration
     prompt_config_path = Path(__file__).parent / "prompt.yaml"
@@ -133,4 +137,5 @@ async def explain(explain_request: ExplainRequest, request: Request) -> ExplainR
         request.app.state.prompt,
         metrics_provider,
         cache_provider,
+        deadline_seconds=request.app.state.settings.anthropic_timeout_seconds,
     )
diff --git a/app/test_explain.py b/app/test_explain.py
index b0d6150..e189ee9 100644
--- a/app/test_explain.py
+++ b/app/test_explain.py
@@ -222,6 +222,28 @@ async def test_returns_error_when_no_text_block(self, sample_request, noop_metri
         assert response.usage.inputTokens == 100
         assert response.usage.outputTokens == 50
 
+    @pytest.mark.asyncio
+    async def test_returns_error_when_call_exceeds_deadline(self, sample_request, noop_metrics):
+        """A Claude call that overruns the wall-clock budget must return a
+        structured error well inside the API Gateway 30s window, not hang until
+        the gateway severs the connection with an opaque 503."""
+        import asyncio
+
+        async def slow_create(**_kwargs):
+            await asyncio.sleep(1.0)
+
+        mock_client = MagicMock()
+        mock_client.messages.create = AsyncMock(side_effect=slow_create)
+
+        test_prompt = Prompt(Path("app/prompt.yaml"))
+        response = await process_request(sample_request, mock_client, test_prompt, noop_metrics, deadline_seconds=0.01)
+
+        assert response.status == "error"
+        assert response.explanation is None
+        assert "too long" in response.message
+        assert response.usage is not None
+        assert response.usage.totalTokens == 0
+
 
 class TestPromptValidation:
     """Validation rules enforced at Prompt construction."""