Skip to content

Commit 196b51f

Browse files
committed
dialog: fix refcount race between BYE and timer expiry (GH-3835)
Two race paths between BYE processing and dlg_ontimeout() cause use-after-free when bye_on_timeout is enabled (create_dialog("B")): 1. next_state_dlg() releases the hash lock before the caller removes the dialog timer, allowing a concurrent worker to race on cleanup. 2. dlg_ontimeout() reads dlg->state without the hash lock, which on ARM64 (relaxed memory ordering) can return a stale CONFIRMED value after a BYE worker has already set DELETED. Fix: move remove_dlg_timer() inside next_state_dlg()'s lock scope so timer removal is atomic with the state transition, and read dlg->state under the hash lock in dlg_ontimeout() so the timer handler sees concurrent state changes. Remove the now-redundant caller-side remove_dlg_timer() blocks from dlg_onroute(), dual_bye_event(), dlg_replicated_delete(), and drop_dlg(). Closes #3835
1 parent 6157557 commit 196b51f

File tree

4 files changed

+45
-91
lines changed

4 files changed

+45
-91
lines changed

modules/dialog/dlg_handlers.c

Lines changed: 25 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,7 +1889,7 @@ void dlg_onroute(struct sip_msg* req, str *route_params, void *param)
18891889
int event;
18901890
unsigned int update_val;
18911891
unsigned int dir,dst_leg,src_leg;
1892-
int ret = 0,ok = 1;
1892+
int ok = 1;
18931893
struct dlg_entry *d_entry;
18941894
str *msg_cseq;
18951895
char *final_cseq;
@@ -2130,31 +2130,6 @@ void dlg_onroute(struct sip_msg* req, str *route_params, void *param)
21302130

21312131
after_unlock5:
21322132

2133-
/* remove from timer */
2134-
ret = remove_dlg_timer(&dlg->tl);
2135-
if (ret < 0) {
2136-
LM_CRIT("unable to unlink the timer on dlg %p [%u:%u] "
2137-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
2138-
dlg, dlg->h_entry, dlg->h_id,
2139-
dlg->callid.len, dlg->callid.s,
2140-
dlg->legs[DLG_CALLER_LEG].tag.len,
2141-
dlg->legs[DLG_CALLER_LEG].tag.s,
2142-
dlg->legs[callee_idx(dlg)].tag.len,
2143-
ZSW(dlg->legs[callee_idx(dlg)].tag.s));
2144-
} else if (ret > 0) {
2145-
LM_DBG("dlg expired (not in timer list) on dlg %p [%u:%u] "
2146-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
2147-
dlg, dlg->h_entry, dlg->h_id,
2148-
dlg->callid.len, dlg->callid.s,
2149-
dlg->legs[DLG_CALLER_LEG].tag.len,
2150-
dlg->legs[DLG_CALLER_LEG].tag.s,
2151-
dlg->legs[callee_idx(dlg)].tag.len,
2152-
ZSW(dlg->legs[callee_idx(dlg)].tag.s));
2153-
} else {
2154-
/* dialog successfully removed from timer -> unref */
2155-
unref++;
2156-
}
2157-
21582133
/* dialog terminated (BYE) */
21592134
run_dlg_callbacks(DLGCB_TERMINATED, dlg, req, dir, dst_leg, NULL, 0, is_active);
21602135

@@ -2452,14 +2427,34 @@ void dlg_ontimeout(struct dlg_tl *tl)
24522427
context_p old_ctx;
24532428
context_p *new_ctx;
24542429
struct dlg_cell *dlg;
2430+
struct dlg_entry *d_entry;
24552431
int new_state;
24562432
int old_state;
24572433
int unref;
24582434
int do_expire_actions = 1;
2435+
int dlg_state;
24592436

24602437
dlg = get_dlg_tl_payload(tl);
2438+
/* d_table is global, initialized in mod_init(); guaranteed valid if
2439+
* the timer callback fires. dlg->h_entry is set during dialog
2440+
* creation (init_new_dialog) and never changes; get_dlg_tl_payload
2441+
* is a container_of macro that derives the dlg pointer from the
2442+
* timer link — cannot return NULL since the timer fired on this
2443+
* link. Same pattern as next_state_dlg (line ~1193) which does
2444+
* the identical d_entry lookup without null/bounds checks. */
2445+
d_entry = &(d_table->entries[dlg->h_entry]);
2446+
2447+
/* Read the dialog state under lock to ensure visibility of
2448+
* concurrent state changes (GH-3835). On architectures with
2449+
* relaxed memory ordering (e.g. ARM64), an unlocked read of
2450+
* dlg->state can return a stale value, causing the timer to
2451+
* proceed as if the dialog is still CONFIRMED when a BYE
2452+
* worker has already transitioned it to DELETED. */
2453+
dlg_lock(d_table, d_entry);
2454+
dlg_state = dlg->state;
2455+
dlg_unlock(d_table, d_entry);
24612456

2462-
LM_DBG("byeontimeout ? flags = %d , state = %d\n",dlg->flags,dlg->state);
2457+
LM_DBG("byeontimeout ? flags = %d , state = %d\n",dlg->flags,dlg_state);
24632458

24642459
if (dialog_repl_cluster) {
24652460
/* if dialog replication is used, send BYEs only if the current node
@@ -2471,7 +2466,7 @@ void dlg_ontimeout(struct dlg_tl *tl)
24712466
* the dialog. We this self prolonging only once! */
24722467
if (!do_expire_actions
24732468
&& ref_script_route_check_and_update(dlg->rt_on_timeout)
2474-
&& dlg->state<DLG_STATE_DELETED
2469+
&& dlg_state<DLG_STATE_DELETED
24752470
&& !(dlg->flags&DLG_FLAG_SELF_EXTENDED_TIMEOUT)) {
24762471
LM_DBG("self prolonging with 10 mins to see what the active"
24772472
"decides after the on-timeout route\n");
@@ -2494,7 +2489,7 @@ void dlg_ontimeout(struct dlg_tl *tl)
24942489

24952490
if (do_expire_actions
24962491
&& ref_script_route_check_and_update(dlg->rt_on_timeout)
2497-
&& dlg->state<DLG_STATE_DELETED) {
2492+
&& dlg_state<DLG_STATE_DELETED) {
24982493
struct dlg_tl bk_tl = *tl;
24992494
/* allow the dialog to be re-inserted in the timer list */
25002495
tl->next = tl->prev = NULL;
@@ -2521,7 +2516,7 @@ void dlg_ontimeout(struct dlg_tl *tl)
25212516
}
25222517

25232518
if ((dlg->flags&DLG_FLAG_BYEONTIMEOUT) &&
2524-
(dlg->state==DLG_STATE_CONFIRMED_NA || dlg->state==DLG_STATE_CONFIRMED)) {
2519+
(dlg_state==DLG_STATE_CONFIRMED_NA || dlg_state==DLG_STATE_CONFIRMED)) {
25252520

25262521
if (do_expire_actions) {
25272522
if (dlg->flags & DLG_FLAG_RACE_CONDITION_OCCURRED)

modules/dialog/dlg_hash.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,23 @@ void next_state_dlg(struct dlg_cell *dlg, int event, int dir, int *old_state,
13301330
}
13311331
*new_state = dlg->state;
13321332

1333+
/* Remove dialog timer under lock to prevent race with concurrent
1334+
* unref (GH-3835): if we transitioned to DELETED, remove the timer
1335+
* atomically with the state change so no other worker can see
1336+
* state=DELETED and race to unref before timer cleanup */
1337+
if (*old_state != DLG_STATE_DELETED && *new_state == DLG_STATE_DELETED) {
1338+
int ret = remove_dlg_timer(&dlg->tl);
1339+
if (ret == 0)
1340+
(*unref)++;
1341+
else if (ret < 0)
1342+
LM_CRIT("unable to unlink the timer on dlg %p [%u:%u] "
1343+
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
1344+
dlg, dlg->h_entry, dlg->h_id,
1345+
dlg->callid.len, dlg->callid.s,
1346+
dlg_leg_print_info(dlg, DLG_CALLER_LEG, tag),
1347+
dlg_leg_print_info(dlg, callee_idx(dlg), tag));
1348+
}
1349+
13331350
dlg_unlock( d_table, d_entry);
13341351

13351352
if (*old_state != *new_state)

modules/dialog/dlg_replication.c

Lines changed: 2 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -599,7 +599,7 @@ int dlg_replicated_delete(bin_packet_t *packet)
599599
str call_id, from_tag, to_tag;
600600
unsigned int dir, _ = -1;
601601
struct dlg_cell *dlg;
602-
int old_state, new_state, unref, ret;
602+
int old_state, new_state, unref;
603603
unsigned int h_id;
604604
int h_entry;
605605
short pkg_ver = get_bin_pkg_version(packet);
@@ -639,30 +639,6 @@ int dlg_replicated_delete(bin_packet_t *packet)
639639
return -1;
640640
}
641641

642-
ret = remove_dlg_timer(&dlg->tl);
643-
if (ret < 0) {
644-
LM_CRIT("unable to unlink the timer on dlg %p [%u:%u] "
645-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
646-
dlg, dlg->h_entry, dlg->h_id,
647-
dlg->callid.len, dlg->callid.s,
648-
dlg->legs[DLG_CALLER_LEG].tag.len,
649-
dlg->legs[DLG_CALLER_LEG].tag.s,
650-
dlg->legs[callee_idx(dlg)].tag.len,
651-
ZSW(dlg->legs[callee_idx(dlg)].tag.s));
652-
} else if (ret > 0) {
653-
LM_DBG("dlg expired (not in timer list) on dlg %p [%u:%u] "
654-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
655-
dlg, dlg->h_entry, dlg->h_id,
656-
dlg->callid.len, dlg->callid.s,
657-
dlg->legs[DLG_CALLER_LEG].tag.len,
658-
dlg->legs[DLG_CALLER_LEG].tag.s,
659-
dlg->legs[callee_idx(dlg)].tag.len,
660-
ZSW(dlg->legs[callee_idx(dlg)].tag.s));
661-
} else {
662-
/* dialog successfully removed from timer -> unref */
663-
unref++;
664-
}
665-
666642
unref_dlg(dlg, 1 + unref);
667643
if_update_stat(dlg_enable_stats, active_dlgs, -1);
668644

@@ -1200,7 +1176,7 @@ static int receive_sync_request(int node_id)
12001176
struct dlg_cell *drop_dlg(struct dlg_cell *dlg, int i)
12011177
{
12021178
struct dlg_cell *next_dlg;
1203-
int ret, unref, old_state, new_state;
1179+
int unref, old_state, new_state;
12041180

12051181
/* make sure dialog is not freed while we don't hold the lock */
12061182
ref_dlg_unsafe(dlg, 1);
@@ -1226,19 +1202,6 @@ struct dlg_cell *drop_dlg(struct dlg_cell *dlg, int i)
12261202

12271203
dlg_lock(d_table, &d_table->entries[i]);
12281204

1229-
/* remove from timer, even though it may be done already */
1230-
ret = remove_dlg_timer(&dlg->tl);
1231-
if (ret < 0) {
1232-
LM_ERR("unable to unlink the timer on dlg %p [%u:%u] "
1233-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
1234-
dlg, dlg->h_entry, dlg->h_id,
1235-
dlg->callid.len, dlg->callid.s,
1236-
dlg_leg_print_info(dlg, DLG_CALLER_LEG, tag),
1237-
dlg_leg_print_info(dlg, callee_idx(dlg), tag));
1238-
} else if (ret == 0)
1239-
/* successfully removed from timer list */
1240-
unref++;
1241-
12421205
if (dlg_db_mode != DB_MODE_NONE) {
12431206
if (dlg_db_mode == DB_MODE_DELAYED &&
12441207
!(dlg->flags&DLG_FLAG_DB_DELETED))

modules/dialog/dlg_req_within.c

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ dlg_t * build_dialog_info(struct dlg_cell * cell, int dst_leg, int src_leg,
212212
static void dual_bye_event(struct dlg_cell* dlg, struct sip_msg *req,
213213
int is_active)
214214
{
215-
int event, old_state, new_state, unref, ret;
215+
int event, old_state, new_state, unref;
216216
struct sip_msg *fake_msg=NULL;
217217
context_p old_ctx;
218218
context_p *new_ctx;
@@ -233,27 +233,6 @@ static void dual_bye_event(struct dlg_cell* dlg, struct sip_msg *req,
233233
destroy_linkers(dlg);
234234
remove_dlg_prof_table(dlg,is_active);
235235

236-
/* remove from timer */
237-
ret = remove_dlg_timer(&dlg->tl);
238-
if (ret < 0) {
239-
LM_CRIT("unable to unlink the timer on dlg %p [%u:%u] "
240-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
241-
dlg, dlg->h_entry, dlg->h_id,
242-
dlg->callid.len, dlg->callid.s,
243-
dlg_leg_print_info( dlg, DLG_CALLER_LEG, tag),
244-
dlg_leg_print_info( dlg, callee_idx(dlg), tag));
245-
} else if (ret > 0) {
246-
LM_DBG("dlg already expired (not in timer list) %p [%u:%u] "
247-
"with clid '%.*s' and tags '%.*s' '%.*s'\n",
248-
dlg, dlg->h_entry, dlg->h_id,
249-
dlg->callid.len, dlg->callid.s,
250-
dlg_leg_print_info( dlg, DLG_CALLER_LEG, tag),
251-
dlg_leg_print_info( dlg, callee_idx(dlg), tag));
252-
} else {
253-
/* successfully removed from timer list */
254-
unref++;
255-
}
256-
257236
if (req==NULL) {
258237
/* set new msg & processing context */
259238
if (push_new_processing_context( dlg, &old_ctx, &new_ctx, &fake_msg)==0) {

0 commit comments

Comments
 (0)