diff --git a/.env.example b/.env.example index 3567be1..63006b5 100644 --- a/.env.example +++ b/.env.example @@ -19,6 +19,9 @@ WS_AUTH_SECRET= # Solana SOLANA_RPC_URL=https://api.mainnet-beta.solana.com +# Fallback RPC — used automatically when primary is down or rate-limited. +# IMPORTANT: defaults to devnet if unset — must be set for mainnet deployments. +FALLBACK_RPC_URL= # Supabase SUPABASE_URL= diff --git a/src/routes/adl.ts b/src/routes/adl.ts index 471953d..609cfa7 100644 --- a/src/routes/adl.ts +++ b/src/routes/adl.ts @@ -50,6 +50,8 @@ import { createLogger, sanitizeSlabAddress, } from "@percolator/shared"; +import { withRpcFallback } from "../utils/rpc-fallback.js"; +import { RpcTimeoutError } from "../utils/rpc-timeout.js"; import { isBlockedSlab } from "../middleware/validateSlab.js"; const logger = createLogger("api:adl"); @@ -170,11 +172,18 @@ export function adlRoutes(): Hono { return c.json({ error: "Market not found" }, 404); } - const connection = getConnection(); let data: Uint8Array; try { - data = await fetchSlab(connection, new PublicKey(slab)); + data = await withRpcFallback( + (conn) => fetchSlab(conn, new PublicKey(slab)), + getConnection(), + `fetchSlab(${slab})`, + ); } catch (err) { + if (err instanceof RpcTimeoutError) { + logger.warn("RPC timeout fetching slab for ADL", { slab, timeoutMs: err.timeoutMs }); + return c.json({ error: "Upstream RPC timeout", slab }, 504); + } const msg = err instanceof Error ? 
err.message : String(err); if (msg.includes("not found")) { return c.json({ error: "Slab account not found", slab }, 404); diff --git a/src/routes/health.ts b/src/routes/health.ts index 31b4045..6701510 100644 --- a/src/routes/health.ts +++ b/src/routes/health.ts @@ -1,5 +1,7 @@ import { Hono } from "hono"; import { getConnection, getSupabase, createLogger, truncateErrorMessage } from "@percolator/shared"; +import { withRpcFallback } from "../utils/rpc-fallback.js"; +import { HEALTH_RPC_TIMEOUT_MS } from "../utils/rpc-timeout.js"; import { getWebSocketMetrics } from "./ws.js"; import { requireApiKey } from "../middleware/auth.js"; @@ -26,7 +28,12 @@ export function healthRoutes(): Hono { // Check RPC connectivity try { - await getConnection().getSlot(); + await withRpcFallback( + (conn) => conn.getSlot(), + getConnection(), + "healthcheck:getSlot", + HEALTH_RPC_TIMEOUT_MS, + ); checks.rpc = true; } catch (err) { logger.error("RPC check failed", { error: truncateErrorMessage(err instanceof Error ? 
err.message : err, 120) }); diff --git a/src/routes/markets.ts b/src/routes/markets.ts index fa86676..d2fa674 100644 --- a/src/routes/markets.ts +++ b/src/routes/markets.ts @@ -5,6 +5,8 @@ import { cacheMiddleware } from "../middleware/cache.js"; import { withDbCacheFallback } from "../middleware/db-cache-fallback.js"; import { fetchSlab, parseHeader, parseConfig, parseEngine } from "@percolator/sdk"; import { getConnection, getSupabase, getNetwork, createLogger, sanitizeSlabAddress, truncateErrorMessage } from "@percolator/shared"; +import { withRpcFallback } from "../utils/rpc-fallback.js"; +import { RpcTimeoutError } from "../utils/rpc-timeout.js"; const logger = createLogger("api:markets"); @@ -121,9 +123,12 @@ export function marketRoutes(): Hono { const slab = c.req.param("slab"); if (!slab) return c.json({ error: "slab required" }, 400); try { - const connection = getConnection(); const slabPubkey = new PublicKey(slab); - const data = await fetchSlab(connection, slabPubkey); + const data = await withRpcFallback( + (conn) => fetchSlab(conn, slabPubkey), + getConnection(), + `fetchSlab(${slab})`, + ); const header = parseHeader(data); const cfg = parseConfig(data); const engine = parseEngine(data); @@ -150,6 +155,10 @@ export function marketRoutes(): Hono { }, }); } catch (err) { + if (err instanceof RpcTimeoutError) { + logger.warn("RPC timeout fetching market", { slab, timeoutMs: err.timeoutMs }); + return c.json({ error: "Upstream RPC timeout" }, 504); + } const detail = err instanceof Error ? err.message : "Unknown error"; const isNotFound = detail.includes("not found") || detail.includes("Account does not exist"); if (isNotFound) { diff --git a/src/utils/rpc-fallback.ts b/src/utils/rpc-fallback.ts new file mode 100644 index 0000000..49f5b7f --- /dev/null +++ b/src/utils/rpc-fallback.ts @@ -0,0 +1,47 @@ +/** + * RPC failover for read-only on-chain calls. 
/*
 * withRpcFallback tries the primary connection first; on ANY error, it retries
 * once against the fallback connection (FALLBACK_RPC_URL). Each attempt is
 * independently wrapped in withRpcTimeout so a hung primary doesn't consume
 * the fallback's timeout budget.
 *
 * If FALLBACK_RPC_URL is not explicitly set, the original primary error is
 * re-thrown unchanged. This prevents silent failover to the devnet default
 * that @percolator/shared uses when the env var is missing.
 */

import type { Connection } from "@solana/web3.js";
import { getFallbackConnection, createLogger } from "@percolator/shared";
import { withRpcTimeout } from "./rpc-timeout.js";

const logger = createLogger("api:rpc-fallback");

/**
 * True only when the operator has explicitly configured a fallback RPC.
 * Evaluated once at module load. A blank or whitespace-only value counts as
 * "not configured" so an accidental `FALLBACK_RPC_URL= ` cannot enable failover.
 */
const hasFallbackRpc = Boolean(process.env.FALLBACK_RPC_URL?.trim());

/**
 * Runs a read-only RPC operation against the primary connection and, if that
 * attempt fails for any reason, retries it once against the fallback connection.
 *
 * @typeParam T - Value produced by the RPC operation.
 * @param fn - Operation to run; it is invoked with the connection to use.
 * @param primary - Connection tried first.
 * @param operation - Label used in log lines and timeout error messages; the
 *   fallback attempt is labelled `${operation}[fallback]`.
 * @param timeoutMs - Per-attempt timeout; when omitted, withRpcTimeout's
 *   default applies.
 * @returns The result of the first attempt that succeeds.
 * @throws The primary attempt's error when no fallback is configured, otherwise
 *   whatever the fallback attempt throws.
 */
export async function withRpcFallback<T>(
  fn: (conn: Connection) => Promise<T>,
  primary: Connection,
  operation: string,
  timeoutMs?: number,
): Promise<T> {
  try {
    return await withRpcTimeout(fn(primary), operation, timeoutMs);
  } catch (primaryErr: unknown) {
    if (!hasFallbackRpc) {
      throw primaryErr; // no explicit fallback configured — re-throw original
    }

    logger.warn("Primary RPC failed, trying fallback", {
      operation,
      error: primaryErr instanceof Error ? primaryErr.message : String(primaryErr),
    });

    // The fallback gets its own full timeout budget, independent of the primary's.
    return await withRpcTimeout(
      fn(getFallbackConnection()),
      `${operation}[fallback]`,
      timeoutMs,
    );
  }
}

// ── src/utils/rpc-timeout.ts (new file) ──────────────────────────────────────
/**
 * Timeout wrapper for RPC calls that don't accept AbortSignal.
 *
 * fetchSlab() and getConnection().getSlot() from the SDK/shared libs take a
 * Connection object, not an AbortSignal, so AbortSignal.timeout() cannot be
 * threaded through. Promise.race is the only viable approach.
 */
/*
 * Note on cancellation: the underlying RPC call is NOT cancelled when the
 * timeout fires — Node will GC the dangling promise once it settles. This is
 * acceptable because fetchSlab/getSlot are read-only.
 */

const DEFAULT_RPC_TIMEOUT_MS = 10_000;
const DEFAULT_HEALTH_RPC_TIMEOUT_MS = 5_000;

/** Largest delay setTimeout honours; Node coerces anything larger to 1 ms. */
const MAX_TIMER_DELAY_MS = 2_147_483_647;

/**
 * Returns `value` when it is a usable timer delay, otherwise `fallback`.
 * NaN, ±Infinity, zero and negative numbers are rejected because setTimeout
 * would coerce them to an (almost) immediate timer, making every call time out.
 * Oversized values are clamped to the largest delay setTimeout supports.
 */
function sanitizeTimeoutMs(value: number, fallback: number): number {
  if (!Number.isFinite(value) || value <= 0) {
    return fallback;
  }
  return Math.min(value, MAX_TIMER_DELAY_MS);
}

/** Default per-call RPC timeout; overridable through the RPC_TIMEOUT_MS env var. */
export const RPC_TIMEOUT_MS: number = sanitizeTimeoutMs(
  Number(process.env.RPC_TIMEOUT_MS),
  DEFAULT_RPC_TIMEOUT_MS,
);

/** RPC timeout exported for health checks; overridable through HEALTH_RPC_TIMEOUT_MS. */
export const HEALTH_RPC_TIMEOUT_MS: number = sanitizeTimeoutMs(
  Number(process.env.HEALTH_RPC_TIMEOUT_MS),
  DEFAULT_HEALTH_RPC_TIMEOUT_MS,
);

/** Error raised when an RPC call does not settle within its timeout. */
export class RpcTimeoutError extends Error {
  /** Timeout, in milliseconds, that was exceeded. */
  public readonly timeoutMs: number;

  /**
   * @param operation - Label of the call that timed out (included in the message).
   * @param timeoutMs - Timeout, in milliseconds, that was exceeded.
   */
  constructor(operation: string, timeoutMs: number) {
    super(`RPC timeout: ${operation} did not complete within ${timeoutMs}ms`);
    // Keep `instanceof RpcTimeoutError` working even if the build targets ES5.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "RpcTimeoutError";
    this.timeoutMs = timeoutMs;
  }
}

/**
 * Races `promise` against a timer and rejects with RpcTimeoutError if the timer
 * wins. The timer is always cleared once the race settles.
 *
 * @typeParam T - Value the wrapped promise resolves to.
 * @param promise - In-flight RPC call; it is not cancelled on timeout.
 * @param operation - Label used in the timeout error message.
 * @param timeoutMs - Timeout in milliseconds. Defaults to RPC_TIMEOUT_MS; values
 *   that are not finite and positive also fall back to RPC_TIMEOUT_MS.
 * @returns The value `promise` resolves to.
 * @throws RpcTimeoutError when the timeout elapses first; otherwise whatever
 *   `promise` rejects with.
 */
export async function withRpcTimeout<T>(
  promise: Promise<T>,
  operation: string,
  timeoutMs: number = RPC_TIMEOUT_MS,
): Promise<T> {
  const effectiveMs = sanitizeTimeoutMs(timeoutMs, RPC_TIMEOUT_MS);
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(
      () => reject(new RpcTimeoutError(operation, effectiveMs)),
      effectiveMs,
    );
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    // Stop the timer so it cannot keep the event loop alive after settlement.
    clearTimeout(timer);
  }
}