From 4257fe210ee5bbb38043833e4df3cc6e81991e4c Mon Sep 17 00:00:00 2001 From: Norm Brandinger Date: Wed, 11 Feb 2026 13:57:32 -0500 Subject: [PATCH] sockets_mgm: fix race in receive_fd causing infinite loop on reload During sockets_reload, all processes receive an IPC RPC to run rpc_socket_reload_proc(). Non-dynamic (worker) processes close their copy of each dynamic socket and then call receive_fd() on the shared sock_mgm_unix[0] socketpair to get a fresh fd from the mgm process. Because sock_mgm_unix[0] is shared across all workers and SOCK_STREAM delivers bytes in order (not per-message), concurrent receive_fd() calls race: worker A can consume the fd response intended for worker B. When this happens, worker B receives worker A's fd response, which references a socket already in worker B's listener list. The sock_listadd() macro then corrupts the linked list into a circular loop (si->next == si), causing push_sock2list() to spin at 100% CPU indefinitely. Add a sock_mgm_reload_lock that serializes the entire send-IPC-to-mgm + receive-fd sequence for worker processes. Dynamic (mgm) processes are excluded from this lock because they create sockets directly via sock_mgm_add_listener() and never call receive_fd(); including them would deadlock since the worker holding the lock blocks on receive_fd() waiting for the mgm to process rpc_sockets_send(). Fixes: OpenSIPS/opensips#3789 --- modules/sockets_mgm/sockets_mgm.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/modules/sockets_mgm/sockets_mgm.c b/modules/sockets_mgm/sockets_mgm.c index 5f6636cde01..49ef23601c2 100644 --- a/modules/sockets_mgm/sockets_mgm.c +++ b/modules/sockets_mgm/sockets_mgm.c @@ -60,6 +60,7 @@ static db_func_t sock_mgm_db_func; static unsigned long *sock_mgm_version; static unsigned int sock_mgm_max_sockets = SOCKETS_MGM_DEFAULT_MAX_SOCKS; static gen_lock_t *sock_mgm_lock; +static gen_lock_t *sock_mgm_reload_lock; static int *sock_mgm_proc_no; static int sock_mgm_unix[2]; extern int is_tcp_main; @@ -219,6 +220,11 @@ static int mod_init(void) LM_ERR("initializing sock_mgm_version lock\n"); return -1; } + sock_mgm_reload_lock = lock_alloc(); + if (!sock_mgm_reload_lock || !lock_init(sock_mgm_reload_lock)) { + LM_ERR("initializing sock_mgm_reload lock\n"); + return -1; + } if (socketpair(AF_UNIX, SOCK_STREAM, 0, sock_mgm_unix) < 0) { LM_ERR("socketpair failed %d/%s\n", @@ -1145,6 +1151,21 @@ static void rpc_socket_reload_proc(int sender_id, void *_ver) int sockets_update_count = 0, fd; LM_NOTICE("Reloading process for version %lu\n", version); + + /* Serialize the entire send-IPC-to-mgm and receive-fd sequence across + * all non-dynamic (worker) processes. Without this, multiple workers + * calling receive_fd() concurrently on the shared sock_mgm_unix + * socketpair can steal each other's fd responses, causing the same + * socket to be added to a worker's listener list twice and corrupting + * it into a circular linked list (infinite loop in sock_listadd). + * + * Dynamic (mgm) processes don't use receive_fd - they create sockets + * directly - so they must NOT acquire this lock, otherwise they would + * deadlock with the worker that holds the lock while blocked on + * receive_fd waiting for the mgm to process rpc_sockets_send. */ + if (!sock_mgm_dynamic_proc) + lock_get(sock_mgm_reload_lock); + lock_get(sock_mgm_lock); if (*sock_mgm_version > version) { LM_WARN("new version %lu available (current=%lu)\n", *sock_mgm_version, version); @@ -1171,6 +1192,9 @@ static void rpc_socket_reload_proc(int sender_id, void *_ver) sock_mgm_update_fd(sock, fd); } } + + if (!sock_mgm_dynamic_proc) + lock_release(sock_mgm_reload_lock); } static int sockets_pool_init(void)