From 1c2d0bdac895abc961228406e7ab095ff49f5331 Mon Sep 17 00:00:00 2001 From: Norm Brandinger Date: Tue, 7 Apr 2026 12:00:35 -0400 Subject: [PATCH] dialog: atomic state transition to close BYE/timer races (GH-3835) The previous fix (196b51fa) narrowed the race window between BYE processing and dlg_ontimeout() but could not fully close it: a BYE arriving between the state check and dlg_end_dlg() creates a TM transaction ref (via dlg_set_tm_dialog_ctx) that outlives the dialog. When the timeout-BYE responses destroy the dialog first, the real BYE's transaction callback dereferences freed memory. The same race pattern exists in dlg_options_routine() and dlg_reinvite_routine(), where expired ping dialogs call dlg_end_dlg() without any state transition. Fix: replace racy state checks with atomic next_state_dlg(DLG_EVENT_REQBYE) calls. The hash entry lock guarantees only one code path -- timer, ping handler, or BYE handler -- wins the CONFIRMED -> DELETED transition. Since dual_bye_event() now only sees DELETED -> DELETED for the timeout-BYE responses, the cleanup that it would normally perform on the first CONFIRMED -> DELETED transition (rt_on_hangup, profile linkers, DLGCB_TERMINATED callback, DB removal) is moved into the winning timeout/ping path. Three paths fixed: - dlg_ontimeout() bye_on_timeout (dlg_handlers.c) - dlg_options_routine() expired pings (dlg_timer.c) - dlg_reinvite_routine() expired pings (dlg_timer.c) Ref: opensips/opensips#3835 --- modules/dialog/dlg_handlers.c | 77 ++++++++++++++++++++++++++++------- modules/dialog/dlg_timer.c | 73 +++++++++++++++++++++++++++------ 2 files changed, 124 insertions(+), 26 deletions(-) diff --git a/modules/dialog/dlg_handlers.c b/modules/dialog/dlg_handlers.c index 51c59545595..e1a3755ea6c 100644 --- a/modules/dialog/dlg_handlers.c +++ b/modules/dialog/dlg_handlers.c @@ -2670,22 +2670,71 @@ void dlg_ontimeout(struct dlg_tl *tl) if ((dlg->flags&DLG_FLAG_BYEONTIMEOUT) && (dlg->state==DLG_STATE_CONFIRMED_NA || dlg->state==DLG_STATE_CONFIRMED)) { - if (do_expire_actions) { - if (dlg->flags & DLG_FLAG_RACE_CONDITION_OCCURRED) - init_dlg_term_reason(dlg,"SIP Race Condition", - sizeof("SIP Race Condition")-1); - else - init_dlg_term_reason(dlg,"Lifetime Timeout", - sizeof("Lifetime Timeout")-1); + /* Atomically claim the CONFIRMED -> DELETED transition (GH-3835). + * Without this, a BYE arriving between the state check above and + * dlg_end_dlg() below creates a TM transaction ref that outlives + * the dialog, causing use-after-free ("bogus ref -1"). + * next_state_dlg holds the hash lock during the transition, so + * only one code path (timer or BYE handler) can win. */ + next_state_dlg(dlg, DLG_EVENT_REQBYE, DLG_DIR_DOWNSTREAM, + &old_state, &new_state, &unref, + dlg->legs_no[DLG_LEG_200OK], do_expire_actions); + + if (new_state == DLG_STATE_DELETED && + old_state != DLG_STATE_DELETED) { + /* We won the transition -- handle cleanup and send BYEs. + * dual_bye_event() will see DELETED->DELETED for both BYE + * responses, so cleanup must happen here. */ + + if (do_expire_actions) { + if (dlg->flags & DLG_FLAG_RACE_CONDITION_OCCURRED) + init_dlg_term_reason(dlg,"SIP Race Condition", + sizeof("SIP Race Condition")-1); + else + init_dlg_term_reason(dlg,"Lifetime Timeout", + sizeof("Lifetime Timeout")-1); + } + + if (ref_script_route_check_and_update(dlg->rt_on_hangup)) + run_dlg_script_route(dlg, dlg->rt_on_hangup->idx); + + destroy_linkers(dlg); + remove_dlg_prof_table(dlg, do_expire_actions); + + /* fire DLGCB_TERMINATED -- same callback dual_bye_event + * would normally fire on the first BYE response */ + if (push_new_processing_context(dlg, &old_ctx, + &new_ctx, &fake_msg) == 0) { + if (do_expire_actions) + run_dlg_callbacks(DLGCB_TERMINATED, dlg, + fake_msg, DLG_DIR_NONE, -1, + NULL, 0, do_expire_actions); + if (current_processing_ctx == NULL) + *new_ctx = NULL; + else + context_destroy(CONTEXT_GLOBAL, *new_ctx); + set_global_context(old_ctx); + release_dummy_sip_msg(fake_msg); + } + + if (should_remove_dlg_db()) + remove_dialog_from_db(dlg); + + /* send BYEs in both directions */ + dlg_end_dlg(dlg, NULL, do_expire_actions); + + /* release timer ref (1) + hash ref from + * next_state_dlg (unref) */ + unref_dlg(dlg, unref + 1); + + if_update_stat(dlg_enable_stats, expired_dlgs, 1); + if_update_stat(dlg_enable_stats, active_dlgs, -1); + return; } - /* we just send the BYEs in both directions */ - dlg_end_dlg(dlg, NULL, do_expire_actions); - /* dialog is no longer refed by timer; from now on it is refed - by the send_bye functions */ - unref_dlg(dlg, 1); - /* is not 100% sure, but do it */ - if_update_stat(dlg_enable_stats, expired_dlgs, 1); + /* Lost the race -- a BYE handler already transitioned this + * dialog to DELETED and owns cleanup. Release timer ref. */ + unref_dlg(dlg, 1); return; } diff --git a/modules/dialog/dlg_timer.c b/modules/dialog/dlg_timer.c index 11dfcb1013d..4c0efaad5f1 100644 --- a/modules/dialog/dlg_timer.c +++ b/modules/dialog/dlg_timer.c @@ -26,6 +26,10 @@ #include "dlg_hash.h" #include "dlg_req_within.h" #include "dlg_replication.h" +#include "dlg_handlers.h" +#include "dlg_profile.h" +#include "dlg_cb.h" +#include "dlg_db_handler.h" struct dlg_timer *d_timer = 0; dlg_timer_handler timer_hdl = 0; @@ -888,6 +892,57 @@ void unref_dlg_cb(void *dlg) unref_dlg_destroy_safe((struct dlg_cell*)dlg,1); } +/* Atomically terminate a dialog from a ping/timer context (GH-3835). + * Claims the CONFIRMED->DELETED transition via next_state_dlg so that + * a concurrent BYE cannot race. Returns 1 if the dialog was terminated, + * 0 if another code path already owns it. */ +static int dlg_ping_terminate(struct dlg_cell *dlg) +{ + int old_state, new_state, unref; + struct sip_msg *fake_msg = NULL; + context_p old_ctx; + context_p *new_ctx; + + next_state_dlg(dlg, DLG_EVENT_REQBYE, DLG_DIR_DOWNSTREAM, + &old_state, &new_state, &unref, + dlg->legs_no[DLG_LEG_200OK], 1); + + if (new_state != DLG_STATE_DELETED || old_state == DLG_STATE_DELETED) { + /* Lost the race -- a BYE handler already owns this dialog */ + unref_dlg(dlg, 1); + return 0; + } + + /* Won the transition -- run cleanup before sending BYEs */ + if (ref_script_route_check_and_update(dlg->rt_on_hangup)) + run_dlg_script_route(dlg, dlg->rt_on_hangup->idx); + + destroy_linkers(dlg); + remove_dlg_prof_table(dlg, 1); + + if (push_new_processing_context(dlg, &old_ctx, &new_ctx, &fake_msg) == 0) { + run_dlg_callbacks(DLGCB_TERMINATED, dlg, fake_msg, + DLG_DIR_NONE, -1, NULL, 0, 1); + if (current_processing_ctx == NULL) + *new_ctx = NULL; + else + context_destroy(CONTEXT_GLOBAL, *new_ctx); + set_global_context(old_ctx); + release_dummy_sip_msg(fake_msg); + } + + if (should_remove_dlg_db()) + remove_dialog_from_db(dlg); + + dlg_end_dlg(dlg, NULL, 1); + + /* ping list ref (1) + hash ref from next_state_dlg (unref) */ + unref_dlg(dlg, unref + 1); + + if_update_stat(dlg_enable_stats, active_dlgs, -1); + return 1; +} + void dlg_options_routine(unsigned int ticks , void * attr) { struct dlg_ping_list *expired,*to_be_deleted,*it,*curr,*next; @@ -915,12 +970,9 @@ void dlg_options_routine(unsigned int ticks , void * attr) dlg->legs[callee_idx(dlg)].reply_received); init_dlg_term_reason(dlg, MI_SSTR("Ping Timeout")); } - /* FIXME - maybe better not to send BYE both ways as we know for - * sure one end in down . */ - dlg_end_dlg(dlg,0,1); - - /* no longer reffed in list */ - unref_dlg(dlg,1); + /* Atomically claim CONFIRMED->DELETED before sending BYEs + * to prevent race with concurrent BYE (GH-3835) */ + dlg_ping_terminate(dlg); } it = to_be_deleted; @@ -1022,12 +1074,9 @@ void dlg_reinvite_routine(unsigned int ticks , void * attr) dlg->legs[callee_idx(dlg)].reinvite_confirmed); init_dlg_term_reason(dlg, MI_SSTR("ReINVITE Ping Timeout")); } - /* FIXME - maybe better not to send BYE both ways as we know for - * sure one end in down . */ - dlg_end_dlg(dlg,0,1); - - /* no longer reffed in list */ - unref_dlg(dlg,1); + /* Atomically claim CONFIRMED->DELETED before sending BYEs + * to prevent race with concurrent BYE (GH-3835) */ + dlg_ping_terminate(dlg); } it = to_be_deleted;