diff --git a/gossip/discovery/discovery_impl.go b/gossip/discovery/discovery_impl.go index e4bb09bc8a0..23bc2925eeb 100644 --- a/gossip/discovery/discovery_impl.go +++ b/gossip/discovery/discovery_impl.go @@ -505,6 +505,8 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage d.lock.RLock() _, known := d.id2Member[string(pkiID)] + lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)] + lastDeadTS, isDead := d.deadLastTS[string(pkiID)] d.lock.RUnlock() if !known { @@ -512,11 +514,6 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage return } - d.lock.RLock() - _, isAlive := d.aliveLastTS[string(pkiID)] - lastDeadTS, isDead := d.deadLastTS[string(pkiID)] - d.lock.RUnlock() - if !isAlive && !isDead { d.logger.Panicf("Member %s is known but not found neither in alive nor in dead lastTS maps, isAlive=%v, isDead=%v", m.GetAliveMsg().Membership.Endpoint, isAlive, isDead) return @@ -537,10 +534,6 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage return } - d.lock.RLock() - lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)] - d.lock.RUnlock() - if isAlive { if before(lastAliveTS, ts) { d.learnExistingMembers([]*protoext.SignedGossipMessage{m}) @@ -845,6 +838,10 @@ func (d *gossipDiscoveryImpl) learnExistingMembers(aliveArr []*protoext.SignedGo // update member's data member := d.id2Member[string(am.Membership.PkiId)] + if member == nil { + d.logger.Debugf("Member with PkiId %x was purged during alive message processing, skipping update", am.Membership.PkiId) + continue + } member.Endpoint = am.Membership.Endpoint member.Metadata = am.Membership.Metadata member.InternalEndpoint = internalEndpoint diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go index e54dd0a43bb..033215fb780 100644 --- a/gossip/discovery/discovery_test.go +++ b/gossip/discovery/discovery_test.go @@ -1928,3 +1928,54 @@ func portOfEndpoint(endpoint string) int { port, _ := 
strconv.ParseInt(strings.Split(endpoint, ":")[1], 10, 64) return int(port) } + +func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) { + inst := createDiscoveryInstanceWithNoGossip(33900, "d0", nil) + defer inst.Stop() + + d := inst.discoveryImpl() + + memberPKIid := common.PKIidType("purged-member-pkiid") + memberEndpoint := "localhost:9999" + + // Simulate the TOCTOU race fixed in PR #5397: + // handleAliveMessage sees isAlive=true under one lock, then a concurrent + // purge() removes the member from all maps before learnExistingMembers + // acquires the write lock. The result is a member present in aliveLastTS + // at decision time but absent from id2Member when learnExistingMembers runs. + d.lock.Lock() + d.aliveLastTS[string(memberPKIid)] = &timestamp{ + incTime: time.Now(), + seqNum: 1, + lastSeen: time.Now(), + } + // Intentionally NOT adding to id2Member to reproduce the nil dereference: + // before the fix, learnExistingMembers accessed member.Endpoint without + // a nil guard, causing a panic. + d.lock.Unlock() + + aliveMsg := &proto.GossipMessage{ + Tag: proto.GossipMessage_EMPTY, + Content: &proto.GossipMessage_AliveMsg{ + AliveMsg: &proto.AliveMessage{ + Membership: &proto.Member{ + Endpoint: memberEndpoint, + PkiId: memberPKIid, + }, + Timestamp: &proto.PeerTime{ + IncNum: uint64(time.Now().UnixNano()), + SeqNum: 2, + }, + }, + }, + } + signedMsg, err := protoext.NoopSign(aliveMsg) + require.NoError(t, err) + + // Before the fix: panics with nil pointer dereference on member.Endpoint + // because id2Member[memberPKIid] is nil (member was concurrently purged). + // After the fix: nil member detected, update skipped, no panic. + require.NotPanics(t, func() { + d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg}) + }) +}