From 4257fe210ee5bbb38043833e4df3cc6e81991e4c Mon Sep 17 00:00:00 2001
From: Norm Brandinger <n.brandinger@gmail.com>
Date: Wed, 11 Feb 2026 13:57:32 -0500
Subject: [PATCH] sockets_mgm: fix race in receive_fd causing infinite loop on
 reload

During sockets_reload, all processes receive an IPC RPC to run
rpc_socket_reload_proc(). Non-dynamic (worker) processes close their
copy of each dynamic socket and then call receive_fd() on the shared
sock_mgm_unix[0] socketpair to get a fresh fd from the mgm process.

Because sock_mgm_unix[0] is shared across all workers, and a SOCK_STREAM
socket is an ordered byte stream with no message boundaries and no way
to address a response to a particular reader, concurrent receive_fd()
calls race: worker A can consume the fd response intended for worker B.
When this happens, worker B receives worker A's fd response, which
references a socket already in worker B's listener list. The
sock_listadd() macro then corrupts the linked list into a circular
loop (si->next == si), causing push_sock2list() to spin at 100% CPU
indefinitely.

Add a sock_mgm_reload_lock that serializes the entire
send-IPC-to-mgm + receive-fd sequence for worker processes. Dynamic
(mgm) processes do not take this lock: they create sockets directly
via sock_mgm_add_listener() and never call receive_fd(). Including
them would deadlock, because the worker holding the lock blocks in
receive_fd() until the mgm process has run rpc_sockets_send(), which
the mgm process could not do while it is itself waiting for the lock.

Fixes: OpenSIPS/opensips#3789
---
 modules/sockets_mgm/sockets_mgm.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/modules/sockets_mgm/sockets_mgm.c b/modules/sockets_mgm/sockets_mgm.c
index 5f6636cde01..49ef23601c2 100644
--- a/modules/sockets_mgm/sockets_mgm.c
+++ b/modules/sockets_mgm/sockets_mgm.c
@@ -60,6 +60,7 @@ static db_func_t sock_mgm_db_func;
 static unsigned long *sock_mgm_version;
 static unsigned int sock_mgm_max_sockets = SOCKETS_MGM_DEFAULT_MAX_SOCKS;
 static gen_lock_t *sock_mgm_lock;
+static gen_lock_t *sock_mgm_reload_lock;
 static int *sock_mgm_proc_no;
 static int sock_mgm_unix[2];
 extern int is_tcp_main;
@@ -219,6 +220,11 @@ static int mod_init(void)
 		LM_ERR("initializing sock_mgm_version lock\n");
 		return -1;
 	}
+	sock_mgm_reload_lock = lock_alloc();
+	if (!sock_mgm_reload_lock || !lock_init(sock_mgm_reload_lock)) {
+		LM_ERR("initializing sock_mgm_reload lock\n");
+		return -1;
+	}
 
 	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sock_mgm_unix) < 0) {
 		LM_ERR("socketpair failed %d/%s\n",
@@ -1145,6 +1151,21 @@ static void rpc_socket_reload_proc(int sender_id, void *_ver)
 	int sockets_update_count = 0, fd;
 
 	LM_NOTICE("Reloading process for version %lu\n", version);
+
+	/* Serialize the entire send-IPC-to-mgm and receive-fd sequence across
+	 * all non-dynamic (worker) processes. Without this, multiple workers
+	 * calling receive_fd() concurrently on the shared sock_mgm_unix
+	 * socketpair can steal each other's fd responses, causing the same
+	 * socket to be added to a worker's listener list twice and corrupting
+	 * it into a circular linked list (infinite loop in sock_listadd).
+	 *
+	 * Dynamic (mgm) processes don't use receive_fd - they create sockets
+	 * directly - so they must NOT acquire this lock, otherwise they would
+	 * deadlock with the worker that holds the lock while blocked on
+	 * receive_fd waiting for the mgm to process rpc_sockets_send. */
+	if (!sock_mgm_dynamic_proc)
+		lock_get(sock_mgm_reload_lock);
+
 	lock_get(sock_mgm_lock);
 	if (*sock_mgm_version > version) {
 		LM_WARN("new version %lu available (current=%lu)\n", *sock_mgm_version, version);
@@ -1171,6 +1192,9 @@ static void rpc_socket_reload_proc(int sender_id, void *_ver)
 			sock_mgm_update_fd(sock, fd);
 		}
 	}
+
+	if (!sock_mgm_dynamic_proc)
+		lock_release(sock_mgm_reload_lock);
 }
 
 static int sockets_pool_init(void)