Skip to content

Commit 5572b87

Browse files
author
shijiashuai
committed
refactor: fix writePump goroutine leak and data race, add room/client limits for DoS prevention, add WebSocket auto-reconnection
1 parent 03d9f37 commit 5572b87

3 files changed

Lines changed: 88 additions & 15 deletions

File tree

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Stability & Security Refactoring
2+
3+
Date: 2026-03-09
4+
5+
## Critical Bug Fixes
6+
7+
### writePump goroutine leak and data race (hub.go)
8+
- `HandleWS` used a `defer` to close resources, but the ordering caused a race:
9+
- `writePump` goroutine called `c.conn.Close()` on write error
10+
- The read goroutine's defer also called `c.Close()`
11+
- Two goroutines closing the same connection concurrently = data race
12+
- `writePump` could also leak if `close(client.send)` was never reached
13+
- **Fix**: Explicit sequential cleanup after read loop exits:
14+
1. `removeClient` — prevents new messages from being routed to send channel
15+
2. `close(client.send)` — terminates writePump's range loop
16+
3. `c.Close()` — only the read goroutine owns conn lifecycle now
17+
- `writePump` no longer calls `c.conn.Close()` on write error; instead drains remaining messages and returns
18+
19+
### No room/client limits — DoS vector (hub.go)
20+
- Any client could create unlimited rooms and join unlimited times, exhausting server memory
21+
- **Fix**: Added `MaxRooms = 1000` and `MaxClientsPerRoom = 50` limits in `addClient()`
22+
- Rejections are logged for monitoring
23+
24+
## Improvements
25+
26+
### WebSocket auto-reconnection (app.js)
27+
- On unexpected disconnect, the client just set state to 'idle' — user had to manually rejoin
28+
- **Fix**: On unexpected disconnect (not manual leave), automatically attempt reconnection after 2 seconds if a roomId exists
29+
- Shows "连接断开,正在重连…" status message during reconnection
30+
31+
### Files Modified
32+
- `internal/signal/hub.go` — goroutine lifecycle fix, room limits, writePump drain
33+
- `web/app.js` — WebSocket auto-reconnection

internal/signal/hub.go

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ import (
1010
"github.com/gorilla/websocket"
1111
)
1212

13+
// Limits to prevent resource exhaustion
14+
const (
15+
MaxRooms = 1000
16+
MaxClientsPerRoom = 50
17+
)
18+
1319
type Hub struct {
1420
mu sync.RWMutex
1521
rooms map[string]map[string]*Client
@@ -86,11 +92,8 @@ func (h *Hub) HandleWS(w http.ResponseWriter, r *http.Request) {
8692
log.Printf("signal: ws connected from %s", r.RemoteAddr)
8793
client := &Client{conn: c, send: make(chan Message, 32)}
8894
go client.writePump()
89-
defer func() {
90-
h.removeClient(client)
91-
close(client.send)
92-
c.Close()
93-
}()
95+
96+
// readPump: blocks until read error (disconnect / protocol error)
9497
for {
9598
var msg Message
9699
if err := c.ReadJSON(&msg); err != nil {
@@ -115,6 +118,14 @@ func (h *Hub) HandleWS(w http.ResponseWriter, r *http.Request) {
115118
log.Printf("signal: unknown msg type=%s room=%s from=%s", msg.Type, msg.Room, msg.From)
116119
}
117120
}
121+
122+
// Cleanup: order matters to avoid goroutine leak and data race.
123+
// 1. Remove from hub first (prevents new messages being sent to client.send)
124+
h.removeClient(client)
125+
// 2. Close send channel (terminates writePump's range loop)
126+
close(client.send)
127+
// 3. Close WebSocket (writePump may still be draining; conn.Close is safe to call concurrently)
128+
c.Close()
118129
}
119130

120131
func (h *Hub) addClient(c *Client) {
@@ -125,11 +136,21 @@ func (h *Hub) addClient(c *Client) {
125136
}
126137
m, ok := h.rooms[c.room]
127138
if !ok {
139+
// Enforce max rooms limit
140+
if len(h.rooms) >= MaxRooms {
141+
log.Printf("signal: room limit reached (%d), rejecting join room=%s id=%s", MaxRooms, c.room, c.id)
142+
return
143+
}
128144
m = make(map[string]*Client)
129145
h.rooms[c.room] = m
130146
}
147+
// Enforce per-room client limit
148+
if len(m) >= MaxClientsPerRoom {
149+
log.Printf("signal: room %s full (%d clients), rejecting id=%s", c.room, MaxClientsPerRoom, c.id)
150+
return
151+
}
131152
m[c.id] = c
132-
log.Printf("signal: join room=%s id=%s", c.room, c.id)
153+
log.Printf("signal: join room=%s id=%s (room size: %d, total rooms: %d)", c.room, c.id, len(m), len(h.rooms))
133154
broadcastMembers(c.room, m)
134155
}
135156

@@ -197,8 +218,11 @@ func (c *Client) writePump() {
197218
for msg := range c.send {
198219
if err := c.conn.WriteJSON(msg); err != nil {
199220
log.Printf("signal: write message error room=%s id=%s: %v", c.room, c.id, err)
200-
c.conn.Close()
201-
break
221+
// Don't close conn here — the read goroutine owns conn lifecycle.
222+
// Drain remaining messages from send channel so close(send) doesn't block.
223+
for range c.send {
224+
}
225+
return
202226
}
203227
}
204228
}

web/app.js

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -190,13 +190,29 @@ function connectWS() {
190190
ws.onclose = () => {
191191
stopPing()
192192
ws = null
193-
closePeerConnection()
194-
roomId = null
195-
if (remoteInput) remoteInput.value = ''
196-
if (manualClose) { manualClose = false; setError('') }
197-
else { setError('信令服务器连接已关闭') }
198-
setState('idle')
199-
renderMembers([])
193+
if (manualClose) {
194+
manualClose = false
195+
setError('')
196+
closePeerConnection()
197+
roomId = null
198+
if (remoteInput) remoteInput.value = ''
199+
setState('idle')
200+
renderMembers([])
201+
} else {
202+
// Unexpected disconnect — attempt reconnection
203+
setError('连接断开,正在重连…')
204+
if (roomId) {
205+
setTimeout(() => {
206+
if (roomId && state !== 'idle') {
207+
connectWS()
208+
}
209+
}, 2000)
210+
} else {
211+
closePeerConnection()
212+
setState('idle')
213+
renderMembers([])
214+
}
215+
}
200216
}
201217
}
202218

0 commit comments

Comments
 (0)