From 6d72b2e1ebf883799d0da4da519a16169508f868 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Sun, 8 Mar 2026 10:20:13 +0100
Subject: [PATCH] Improve executor pool: fix GIL contention and set default to
 4

- Add MIN_EXECUTORS (2) and increase MAX_EXECUTORS to 32
- Set default executors to 4 (benchmarked sweet spot)
- Fix GIL contention: acquire GIL only when processing actual work,
  not while idle waiting for requests. This prevents idle executor
  threads from competing with dirty schedulers running Python work
  via the context-based API.

Benchmark results (14 dirty CPU schedulers):
- 4 executors: best sync (400k/sec) and concurrent performance
- Context API: stable regardless of executor count (GIL fix verified)
---
 c_src/py_exec.c           | 28 +++++++++++++++-------------
 c_src/py_nif.c            |  2 +-
 c_src/py_nif.h            |  8 +++++++-
 src/erlang_python_sup.erl |  6 +++++-
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/c_src/py_exec.c b/c_src/py_exec.c
index aea03af..2fb2820 100644
--- a/c_src/py_exec.c
+++ b/c_src/py_exec.c
@@ -859,21 +859,20 @@ static void executor_stop(void) {
 /**
  * Main function for a multi-executor thread.
  * Each executor has its own queue and processes requests independently.
+ *
+ * GIL handling: Acquire GIL only when processing work, not while idle.
+ * This prevents idle executors from competing with dirty schedulers
+ * running actual Python work via the context-based API.
  */
 static void *multi_executor_thread_main(void *arg) {
     executor_t *exec = (executor_t *)arg;
 
-    /* Acquire GIL for this thread */
-    PyGILState_STATE gstate = PyGILState_Ensure();
-
     exec->running = true;
 
     while (!exec->shutdown) {
         py_request_t *req = NULL;
 
-        /* Release GIL while waiting for work */
-        Py_BEGIN_ALLOW_THREADS
-
+        /* Wait for work - NO GIL held while idle */
         pthread_mutex_lock(&exec->mutex);
         while (exec->queue_head == NULL && !exec->shutdown) {
             pthread_cond_wait(&exec->cond, &exec->mutex);
@@ -890,8 +889,6 @@ static void *multi_executor_thread_main(void *arg) {
         }
         pthread_mutex_unlock(&exec->mutex);
 
-        Py_END_ALLOW_THREADS
-
         if (req != NULL) {
             if (req->type == PY_REQ_SHUTDOWN) {
                 pthread_mutex_lock(&req->mutex);
@@ -900,10 +897,16 @@ static void *multi_executor_thread_main(void *arg) {
                 pthread_mutex_unlock(&req->mutex);
                 break;
             } else {
-                /* Process the request with GIL held */
+                /* Acquire GIL only for actual work */
+                PyGILState_STATE gstate = PyGILState_Ensure();
+
+                /* Process the request */
                 process_request(req);
 
-                /* Signal completion */
+                /* Release GIL immediately after processing */
+                PyGILState_Release(gstate);
+
+                /* Signal completion (outside GIL) */
                 pthread_mutex_lock(&req->mutex);
                 req->completed = true;
                 pthread_cond_signal(&req->cond);
@@ -913,7 +916,6 @@ static void *multi_executor_thread_main(void *arg) {
     }
 
     exec->running = false;
-    PyGILState_Release(gstate);
 
     return NULL;
 }
@@ -953,8 +955,8 @@ static int multi_executor_start(int num_executors) {
         return 0;
     }
 
-    if (num_executors <= 0) {
-        num_executors = 4;
+    if (num_executors < MIN_EXECUTORS) {
+        num_executors = MIN_EXECUTORS;
     }
     if (num_executors > MAX_EXECUTORS) {
         num_executors = MAX_EXECUTORS;
diff --git a/c_src/py_nif.c b/c_src/py_nif.c
index e5c7ae8..9ae1422 100644
--- a/c_src/py_nif.c
+++ b/c_src/py_nif.c
@@ -710,7 +710,7 @@ static ERL_NIF_TERM nif_py_init(ErlNifEnv *env, int argc, const ERL_NIF_TERM arg
         default:
             /* Start multiple executors for GIL contention mode */
             {
-                int num_exec = 4;  /* Default */
+                int num_exec = MIN_EXECUTORS;  /* Fallback if not provided */
                 /* Check for config */
                 if (argc > 0 && enif_is_map(env, argv[0])) {
                     ERL_NIF_TERM key = enif_make_atom(env, "num_executors");
diff --git a/c_src/py_nif.h b/c_src/py_nif.h
index 8002889..01adeee 100644
--- a/c_src/py_nif.h
+++ b/c_src/py_nif.h
@@ -1063,11 +1063,17 @@ typedef struct {
  * @{
  */
 
+/**
+ * @def MIN_EXECUTORS
+ * @brief Minimum number of executor threads in the pool
+ */
+#define MIN_EXECUTORS 2
+
 /**
  * @def MAX_EXECUTORS
  * @brief Maximum number of executor threads in the pool
  */
-#define MAX_EXECUTORS 16
+#define MAX_EXECUTORS 32
 
 /**
  * @struct executor_t
diff --git a/src/erlang_python_sup.erl b/src/erlang_python_sup.erl
index 5847309..ae33ddd 100644
--- a/src/erlang_python_sup.erl
+++ b/src/erlang_python_sup.erl
@@ -37,8 +37,12 @@ init([]) ->
     ContextMode = application:get_env(erlang_python, context_mode, auto),
     NumAsyncWorkers = application:get_env(erlang_python, num_async_workers, 2),
 
+    %% Default executors: 4 (benchmarked sweet spot for most workloads)
+    %% Can be overridden via {erlang_python, [{num_executors, N}]}
+    NumExecutors = application:get_env(erlang_python, num_executors, 4),
+
     %% Initialize Python runtime first
-    ok = py_nif:init(),
+    ok = py_nif:init(#{num_executors => NumExecutors}),
 
     %% Initialize the semaphore ETS table for rate limiting
     ok = py_semaphore:init(),