Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions gossip/discovery/discovery_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -505,18 +505,15 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage

d.lock.RLock()
_, known := d.id2Member[string(pkiID)]
lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)]
lastDeadTS, isDead := d.deadLastTS[string(pkiID)]
d.lock.RUnlock()

if !known {
d.learnNewMembers([]*protoext.SignedGossipMessage{m}, []*protoext.SignedGossipMessage{})
return
}

d.lock.RLock()
_, isAlive := d.aliveLastTS[string(pkiID)]
lastDeadTS, isDead := d.deadLastTS[string(pkiID)]
d.lock.RUnlock()

if !isAlive && !isDead {
d.logger.Panicf("Member %s is known but not found neither in alive nor in dead lastTS maps, isAlive=%v, isDead=%v", m.GetAliveMsg().Membership.Endpoint, isAlive, isDead)
return
Expand All @@ -537,10 +534,6 @@ func (d *gossipDiscoveryImpl) handleAliveMessage(m *protoext.SignedGossipMessage
return
}

d.lock.RLock()
lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)]
d.lock.RUnlock()

if isAlive {
if before(lastAliveTS, ts) {
d.learnExistingMembers([]*protoext.SignedGossipMessage{m})
Expand Down Expand Up @@ -845,6 +838,10 @@ func (d *gossipDiscoveryImpl) learnExistingMembers(aliveArr []*protoext.SignedGo

// update member's data
member := d.id2Member[string(am.Membership.PkiId)]
if member == nil {
d.logger.Debugf("Member with PkiId %x was purged during alive message processing, skipping update", am.Membership.PkiId)
continue
}
member.Endpoint = am.Membership.Endpoint
member.Metadata = am.Membership.Metadata
member.InternalEndpoint = internalEndpoint
Expand Down
51 changes: 51 additions & 0 deletions gossip/discovery/discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,54 @@ func portOfEndpoint(endpoint string) int {
port, _ := strconv.ParseInt(strings.Split(endpoint, ":")[1], 10, 64)
return int(port)
}

func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) {
inst := createDiscoveryInstanceWithNoGossip(33900, "d0", nil)
defer inst.Stop()

d := inst.discoveryImpl()

memberPKIid := common.PKIidType("purged-member-pkiid")
memberEndpoint := "localhost:9999"

// Simulate the TOCTOU race fixed in PR #5397:
// handleAliveMessage sees isAlive=true under one lock, then a concurrent
// purge() removes the member from all maps before learnExistingMembers
// acquires the write lock. The result is a member present in aliveLastTS
// at decision time but absent from id2Member when learnExistingMembers runs.
d.lock.Lock()
d.aliveLastTS[string(memberPKIid)] = &timestamp{
incTime: time.Now(),
seqNum: 1,
lastSeen: time.Now(),
}
// Intentionally NOT adding to id2Member to reproduce the nil dereference:
// before the fix, learnExistingMembers accessed member.Endpoint without
// a nil guard, causing a panic.
d.lock.Unlock()

aliveMsg := &proto.GossipMessage{
Tag: proto.GossipMessage_EMPTY,
Content: &proto.GossipMessage_AliveMsg{
AliveMsg: &proto.AliveMessage{
Membership: &proto.Member{
Endpoint: memberEndpoint,
PkiId: memberPKIid,
},
Timestamp: &proto.PeerTime{
IncNum: uint64(time.Now().UnixNano()),
SeqNum: 2,
},
},
},
}
signedMsg, err := protoext.NoopSign(aliveMsg)
require.NoError(t, err)

// Before the fix: panics with nil pointer dereference on member.Endpoint
// because id2Member[memberPKIid] is nil (member was concurrently purged).
// After the fix: nil member detected, update skipped, no panic.
require.NotPanics(t, func() {
d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg})
})
}