From e138c7a90c527e5ed7aadba75cc802ae16af168d Mon Sep 17 00:00:00 2001 From: matthew-pilot Date: Sat, 30 May 2026 07:39:36 +0000 Subject: [PATCH] =?UTF-8?q?fix(replication):=20tighten=20standby=20read=20?= =?UTF-8?q?deadline=2045s=E2=86=9230s,=20heartbeat=2015s=E2=86=9210s=20(PI?= =?UTF-8?q?LOT-281)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The primary heartbeats every 15s with a 5s write deadline, but the standby read deadline was 45s — creating a window of up to ~45s in which a standby cannot detect a silent primary failure (worst case: the primary crashes immediately after a heartbeat, and the standby waits out the full 45s read deadline). Tightening the read deadline to 30s and the heartbeat interval to 10s reduces the stale-detect window to at most ~30s, matching the documented intent in StartHeartbeat's comment. The 5s per-message write deadline is unchanged — that guards individual message hangs, not primary liveness. Closes PILOT-281 --- replication.go | 2 +- replication/replication.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/replication.go b/replication.go index a44f618..8821b77 100644 --- a/replication.go +++ b/replication.go @@ -525,7 +525,7 @@ func (s *Server) standbySession(primaryAddr string) error { default: } - conn.SetReadDeadline(time.Now().Add(45 * time.Second)) //nolint:errcheck + conn.SetReadDeadline(time.Now().Add(30 * time.Second)) //nolint:errcheck msg, err := readMessage(conn) if err != nil { if err == io.EOF { diff --git a/replication/replication.go b/replication/replication.go index 755b190..74271f9 100644 --- a/replication/replication.go +++ b/replication/replication.go @@ -145,7 +145,7 @@ func (m *Manager) PushDelta(entries []walpkg.DeltaEntry, seqNo uint64) { // subscribers so standbys can detect primary failure within ~30s. // It blocks until done is closed. 
func (m *Manager) StartHeartbeat(done <-chan struct{}) { - ticker := time.NewTicker(15 * time.Second) + ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() for { select {