From 80fb1dfc1a12df462e9a375761bba001c3457fc3 Mon Sep 17 00:00:00 2001 From: Raymond Jacobson Date: Fri, 22 May 2026 16:43:29 -0700 Subject: [PATCH] fix(eth-indexer): make /eth/health O(1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous GetHealth ran a UNION/COUNT across users + associated_wallets to populate tracked_wallets, plus a COUNT(*) on eth_wallet_balances for cached_wallets. On prod that's ~3.15M rows through a seq scan + dedup sort and consistently times out (or hangs the handler — there was no statement timeout). Cheap locally, lethal in prod. Drop both counts from the response. They were nice-to-have stats, not liveness signals — a health endpoint that takes 30s to tell you the indexer is alive is worse than no endpoint. If you need population stats, query eth_wallet_balances directly. What's left is all O(1): - connected, rpc_configured, last_block_seen, last_event_at: in-memory - checkpoint_block: single-row PK lookup on eth_indexer_checkpoints Also add a 2s context timeout to the handler. Even if a future query is added that turns slow, the request fails fast instead of hanging the ingress. --- eth/indexer/eth_indexer.go | 25 +++++++------------------ eth/indexer/server.go | 11 ++++++++++- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/eth/indexer/eth_indexer.go b/eth/indexer/eth_indexer.go index 9f4e56ff..e33d8b26 100644 --- a/eth/indexer/eth_indexer.go +++ b/eth/indexer/eth_indexer.go @@ -501,30 +501,21 @@ type ethHealth struct { LastBlockSeen uint64 `json:"last_block_seen"` CheckpointBlock uint64 `json:"checkpoint_block"` LastEventAt *time.Time `json:"last_event_at"` - TrackedWallets int64 `json:"tracked_wallets"` - CachedWallets int64 `json:"cached_wallets"` } +// GetHealth returns indexer liveness in O(1) — all values are either in +// memory or come from a single-row PK lookup. Wallet-population counts +// previously lived on this response but were expensive on prod +// (UNION/COUNT across users + associated_wallets ≈ 3M rows, no index, can +// take 30s+) and don't actually answer "is the indexer alive?", which is +// what a health endpoint is for. If you want population stats, query +// eth_wallet_balances directly. func (e *EthIndexer) GetHealth(ctx context.Context, maxEventLagSecs int64) (*ethHealth, error) { checkpoint, err := e.loadCheckpoint(ctx) if err != nil { return nil, fmt.Errorf("loading checkpoint: %w", err) } - var tracked, cached int64 - err = e.pool.QueryRow(ctx, ` - SELECT - (SELECT COUNT(*) FROM ( - SELECT LOWER(wallet) FROM users WHERE wallet IS NOT NULL AND wallet <> '' - UNION - SELECT LOWER(wallet) FROM associated_wallets WHERE chain = 'eth' AND is_delete = FALSE - ) t) AS tracked, - (SELECT COUNT(*) FROM eth_wallet_balances) AS cached - `).Scan(&tracked, &cached) - if err != nil { - return nil, fmt.Errorf("counting wallets: %w", err) - } - errs := make([]string, 0) if !e.connected.Load() && e.wsURL != "" { errs = append(errs, "websocket subscription not connected") @@ -545,8 +536,6 @@ func (e *EthIndexer) GetHealth(ctx context.Context, maxEventLagSecs int64) (*eth LastBlockSeen: e.lastBlockSeen.Load(), CheckpointBlock: checkpoint, LastEventAt: e.lastEventAt.Load(), - TrackedWallets: tracked, - CachedWallets: cached, }, nil } diff --git a/eth/indexer/server.go b/eth/indexer/server.go index 82e0a55c..d5c68095 100644 --- a/eth/indexer/server.go +++ b/eth/indexer/server.go @@ -5,12 +5,19 @@ import ( "encoding/json" "net" "net/http" + "time" "github.com/gofiber/fiber/v2" "github.com/mcuadros/go-defaults" "go.uber.org/zap" ) +// healthHandlerTimeout caps how long /eth/health is willing to wait on +// downstream work (DB, etc.) before returning an error. The endpoint is +// supposed to be O(1); a 2s ceiling is generous and stops a future slow +// query from hanging the handler indefinitely. +const healthHandlerTimeout = 2 * time.Second + type Server struct { *fiber.App logger *zap.Logger @@ -39,7 +46,9 @@ func NewServer(indexer *EthIndexer) *Server { } defaults.SetDefaults(&q) - health, err := indexer.GetHealth(c.Context(), q.MaxEventLagSecs) + ctx, cancel := context.WithTimeout(c.Context(), healthHandlerTimeout) + defer cancel() + health, err := indexer.GetHealth(ctx, q.MaxEventLagSecs) if err != nil { return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{ "error": err.Error(),