From f471335664613b6ccac9932cb55bb8cdfbf270ab Mon Sep 17 00:00:00 2001
From: bjoern
Date: Tue, 10 Feb 2026 17:13:53 +0100
Subject: [PATCH] Handle Ollama offline gracefully with retry and 503 response

Problem:
When the Ollama provider was offline or unreachable, the raw TypeError
from the failed fetch bubbled up to the Express error middleware,
producing an unhelpful 500 error. The retry client also did not
recognize ECONNREFUSED as a retryable error, so connection-refused
failures failed immediately without any backoff attempt.

Changes implemented:

1. Add ECONNREFUSED to retryable errors (src/clients/retry.js)
   - Added ECONNREFUSED to the retryableErrors list alongside
     ECONNRESET, etc.
   - Added check for nested .cause.code on errors, since Node's undici
     wraps connection errors as TypeError with the actual code on .cause
   - Connection-refused errors now get retried with exponential backoff

2. Wrap invokeModel in try/catch (src/orchestrator/index.js)
   - Catches connection errors (ECONNREFUSED, fetch failed) from
     invokeModel
   - Returns a structured 503 response with provider_unreachable error
     type
   - Prevents raw TypeError from bubbling to Express error middleware
   - Non-connection errors are re-thrown unchanged

Testing:
- Stopped Ollama, sent a request: returns clean 503 with
  provider_unreachable
- Previously: raw TypeError crash in Express middleware
- Started Ollama back up: requests resume normally
- Retry behavior verified: ECONNREFUSED triggers backoff retries before
  503
- npm run test:unit passes with no regressions
---
 src/clients/retry.js      |  7 ++++++-
 src/orchestrator/index.js | 28 +++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/clients/retry.js b/src/clients/retry.js
index 2178206..5d90654 100644
--- a/src/clients/retry.js
+++ b/src/clients/retry.js
@@ -10,7 +10,7 @@ const DEFAULT_CONFIG = {
   backoffMultiplier: 2,
   jitterFactor: 0.1, // 10% jitter
   retryableStatuses: [429, 500, 502, 503, 504],
-  retryableErrors: ['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'ENETUNREACH'],
+  retryableErrors: ['ECONNRESET', 'ETIMEDOUT', 'ENOTFOUND', 'ENETUNREACH', 'ECONNREFUSED'],
 };
 
 /**
@@ -44,6 +44,11 @@ function isRetryable(error, response, config) {
     return true;
   }
 
+  // Check nested cause (Node undici wraps connection errors as TypeError)
+  if (error && error.cause?.code && config.retryableErrors.includes(error.cause.code)) {
+    return true;
+  }
+
   // Check for network errors
   if (error && (error.name === 'FetchError' || error.name === 'AbortError')) {
     return true;
diff --git a/src/orchestrator/index.js b/src/orchestrator/index.js
index d553b69..f46f98d 100644
--- a/src/orchestrator/index.js
+++ b/src/orchestrator/index.js
@@ -1694,7 +1694,33 @@ IMPORTANT TOOL USAGE RULES:
       });
     }
 
-    const databricksResponse = await invokeModel(cleanPayload);
+    let databricksResponse;
+    try {
+      databricksResponse = await invokeModel(cleanPayload);
+    } catch (modelError) {
+      const isConnectionError = modelError.cause?.code === 'ECONNREFUSED'
+        || modelError.message?.includes('fetch failed')
+        || modelError.code === 'ECONNREFUSED';
+      if (isConnectionError) {
+        logger.error(`Provider ${providerType} is unreachable (connection refused). Is it running?`);
+        return {
+          response: {
+            status: 503,
+            body: {
+              error: {
+                type: "provider_unreachable",
+                message: `Provider ${providerType} is unreachable. Is the service running?`,
+              },
+            },
+            terminationReason: "provider_unreachable",
+          },
+          steps,
+          durationMs: Date.now() - start,
+          terminationReason: "provider_unreachable",
+        };
+      }
+      throw modelError;
+    }
 
     // Extract and log actual token usage
     const actualUsage = databricksResponse.ok && config.tokenTracking?.enabled !== false