Skip to content

Commit dd577e0

Browse files
committed
Add No-GIL Safe Mode with atomic state machine
- Add atomic runtime state machine (UNINIT->INITING->RUNNING->SHUTTING_DOWN->STOPPED)
- Convert volatile flags to _Atomic for thread safety
- Add NIF guards to reject work when not RUNNING
- Fix destructor memory corruption for OWN_GIL subinterpreters
- Add enif_keep_resource/release for ctx in suspended states
- Add debug counters NIF for runtime diagnostics
- Add CI sanitizer builds (ASan, TSan, UBSan)
1 parent 3cb5854 commit dd577e0

File tree

7 files changed

+495
-86
lines changed

7 files changed

+495
-86
lines changed

.github/workflows/ci.yml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,82 @@ jobs:
186186
'
187187
continue-on-error: true # Free-threading is experimental
188188

189+
# Sanitizer builds for detecting memory issues and race conditions
190+
test-sanitizers:
191+
name: ${{ matrix.sanitizer }} / Python ${{ matrix.python }}
192+
runs-on: ubuntu-24.04
193+
194+
strategy:
195+
fail-fast: false
196+
matrix:
197+
include:
198+
# ASan + UBSan with Python 3.12
199+
- sanitizer: "ASan+UBSan"
200+
python: "3.12"
201+
cmake_flags: "-DENABLE_ASAN=ON -DENABLE_UBSAN=ON"
202+
env_vars: "ASAN_OPTIONS=detect_leaks=1:abort_on_error=1"
203+
# ASan + UBSan with Python 3.13
204+
- sanitizer: "ASan+UBSan"
205+
python: "3.13"
206+
cmake_flags: "-DENABLE_ASAN=ON -DENABLE_UBSAN=ON"
207+
env_vars: "ASAN_OPTIONS=detect_leaks=1:abort_on_error=1"
208+
# TSan with Python 3.12 (separate because incompatible with ASan)
209+
- sanitizer: "TSan"
210+
python: "3.12"
211+
cmake_flags: "-DENABLE_TSAN=ON"
212+
env_vars: "TSAN_OPTIONS=second_deadlock_stack=1"
213+
214+
steps:
215+
- name: Checkout
216+
uses: actions/checkout@v4
217+
218+
- name: Set up Python
219+
uses: actions/setup-python@v5
220+
with:
221+
python-version: ${{ matrix.python }}
222+
223+
- name: Set up Erlang
224+
uses: erlef/setup-beam@v1
225+
with:
226+
otp-version: "27.0"
227+
rebar3-version: "3.24"
228+
229+
- name: Install dependencies
230+
run: |
231+
sudo apt-get update
232+
sudo apt-get install -y cmake
233+
234+
- name: Set Python library path
235+
run: |
236+
PYTHON_LIB=$(python3 -c "import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
237+
echo "LD_LIBRARY_PATH=${PYTHON_LIB}:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
238+
239+
- name: Clean and compile with sanitizers
240+
run: |
241+
rm -rf _build/cmake
242+
mkdir -p _build/cmake
243+
cd _build/cmake
244+
cmake ../../c_src ${{ matrix.cmake_flags }}
245+
cmake --build . -- -j $(nproc)
246+
cd ../..
247+
rebar3 compile
248+
249+
- name: Run tests with sanitizers
250+
env:
251+
ASAN_OPTIONS: ${{ contains(matrix.env_vars, 'ASAN_OPTIONS') && 'detect_leaks=1:abort_on_error=1' || '' }}
252+
TSAN_OPTIONS: ${{ contains(matrix.env_vars, 'TSAN_OPTIONS') && 'second_deadlock_stack=1' || '' }}
253+
run: |
254+
rebar3 ct --readable=compact
255+
256+
- name: Check debug counters
257+
run: |
258+
erl -pa _build/default/lib/erlang_python/ebin -noshell -eval '
259+
application:ensure_all_started(erlang_python),
260+
Counters = py_nif:get_debug_counters(),
261+
io:format("Debug counters: ~p~n", [Counters]),
262+
halt().
263+
'
264+
189265
lint:
190266
name: Lint
191267
runs-on: ubuntu-24.04

c_src/CMakeLists.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,33 @@ if(ASGI_PROFILING)
5757
add_definitions(-DASGI_PROFILING)
5858
endif()
5959

60+
# Sanitizer options for debugging race conditions and memory issues
61+
option(ENABLE_ASAN "Enable AddressSanitizer" OFF)
62+
option(ENABLE_TSAN "Enable ThreadSanitizer" OFF)
63+
option(ENABLE_UBSAN "Enable UndefinedBehaviorSanitizer" OFF)
64+
65+
if(ENABLE_ASAN)
66+
message(STATUS "AddressSanitizer enabled")
67+
add_compile_options(-fsanitize=address -fno-omit-frame-pointer -g -O1)
68+
add_link_options(-fsanitize=address)
69+
# ASan is incompatible with TSan
70+
if(ENABLE_TSAN)
71+
message(FATAL_ERROR "ASan and TSan cannot be used together")
72+
endif()
73+
endif()
74+
75+
if(ENABLE_TSAN)
76+
message(STATUS "ThreadSanitizer enabled")
77+
add_compile_options(-fsanitize=thread -fno-omit-frame-pointer -g -O1)
78+
add_link_options(-fsanitize=thread)
79+
endif()
80+
81+
if(ENABLE_UBSAN)
82+
message(STATUS "UndefinedBehaviorSanitizer enabled")
83+
add_compile_options(-fsanitize=undefined -fno-omit-frame-pointer -g -O1)
84+
add_link_options(-fsanitize=undefined)
85+
endif()
86+
6087
if(PERF_BUILD)
6188
message(STATUS "Performance build enabled - using aggressive optimizations")
6289
# Override compiler flags for maximum performance

c_src/py_callback.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ static suspended_context_state_t *create_suspended_context_state_for_call(
633633
memset(state, 0, sizeof(suspended_context_state_t));
634634

635635
state->ctx = ctx;
636+
enif_keep_resource(ctx); /* Keep ctx alive while suspended state exists */
636637
state->callback_id = tl_pending_callback_id;
637638
state->request_type = PY_REQ_CALL;
638639

@@ -718,6 +719,7 @@ static suspended_context_state_t *create_suspended_context_state_for_eval(
718719
memset(state, 0, sizeof(suspended_context_state_t));
719720

720721
state->ctx = ctx;
722+
enif_keep_resource(ctx); /* Keep ctx alive while suspended state exists */
721723
state->callback_id = tl_pending_callback_id;
722724
state->request_type = PY_REQ_EVAL;
723725

c_src/py_exec.c

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ static void *executor_thread_main(void *arg) {
655655
/* Acquire GIL for this thread */
656656
PyGILState_STATE gstate = PyGILState_Ensure();
657657

658-
g_executor_running = true;
658+
atomic_store(&g_executor_running, true);
659659

660660
/*
661661
* Main processing loop.
@@ -670,7 +670,7 @@ static void *executor_thread_main(void *arg) {
670670
Py_BEGIN_ALLOW_THREADS
671671

672672
pthread_mutex_lock(&g_executor_mutex);
673-
while (g_executor_queue_head == NULL && !g_executor_shutdown) {
673+
while (g_executor_queue_head == NULL && !atomic_load(&g_executor_shutdown)) {
674674
pthread_cond_wait(&g_executor_cond, &g_executor_mutex);
675675
}
676676

@@ -682,7 +682,7 @@ static void *executor_thread_main(void *arg) {
682682
g_executor_queue_tail = NULL;
683683
}
684684
req->next = NULL;
685-
} else if (g_executor_shutdown) {
685+
} else if (atomic_load(&g_executor_shutdown)) {
686686
/* Queue is empty and shutdown requested - exit */
687687
should_exit = true;
688688
}
@@ -702,6 +702,9 @@ static void *executor_thread_main(void *arg) {
702702
/* Process the request with GIL held */
703703
process_request(req);
704704

705+
/* Track completed requests */
706+
atomic_fetch_add(&g_counters.complete_count, 1);
707+
705708
/* Signal completion */
706709
pthread_mutex_lock(&req->mutex);
707710
req->completed = true;
@@ -711,7 +714,7 @@ static void *executor_thread_main(void *arg) {
711714
}
712715
}
713716

714-
g_executor_running = false;
717+
atomic_store(&g_executor_running, false);
715718
PyGILState_Release(gstate);
716719

717720
return NULL;
@@ -720,8 +723,19 @@ static void *executor_thread_main(void *arg) {
720723
/**
721724
* Enqueue a request to the appropriate executor based on execution mode.
722725
* Routes to multi-executor pool, single executor, or executes directly.
726+
*
727+
* @return 0 on success, -1 if shutting down (request rejected)
723728
*/
724-
static void executor_enqueue(py_request_t *req) {
729+
static int executor_enqueue(py_request_t *req) {
730+
/* Reject work if runtime is shutting down (except shutdown requests) */
731+
if (runtime_is_shutting_down() && req->type != PY_REQ_SHUTDOWN) {
732+
atomic_fetch_add(&g_counters.rejected_count, 1);
733+
return -1;
734+
}
735+
736+
/* Track enqueued requests */
737+
atomic_fetch_add(&g_counters.enqueue_count, 1);
738+
725739
switch (g_execution_mode) {
726740
#ifdef HAVE_FREE_THREADED
727741
case PY_MODE_FREE_THREADED:
@@ -736,15 +750,15 @@ static void executor_enqueue(py_request_t *req) {
736750
pthread_cond_signal(&req->cond);
737751
pthread_mutex_unlock(&req->mutex);
738752
}
739-
return;
753+
return 0;
740754
#endif
741755

742756
case PY_MODE_MULTI_EXECUTOR:
743-
if (g_multi_executor_initialized) {
757+
if (atomic_load(&g_multi_executor_initialized)) {
744758
/* Route to multi-executor pool */
745759
int exec_id = select_executor();
746760
multi_executor_enqueue(exec_id, req);
747-
return;
761+
return 0;
748762
}
749763
/* Fall through to single executor if multi not initialized */
750764
break;
@@ -767,6 +781,7 @@ static void executor_enqueue(py_request_t *req) {
767781
}
768782
pthread_cond_signal(&g_executor_cond);
769783
pthread_mutex_unlock(&g_executor_mutex);
784+
return 0;
770785
}
771786

772787
/**
@@ -785,7 +800,7 @@ static void executor_wait(py_request_t *req) {
785800
* Called during Python initialization.
786801
*/
787802
static int executor_start(void) {
788-
g_executor_shutdown = false;
803+
atomic_store(&g_executor_shutdown, false);
789804
g_executor_queue_head = NULL;
790805
g_executor_queue_tail = NULL;
791806

@@ -795,19 +810,19 @@ static int executor_start(void) {
795810

796811
/* Wait for executor to be ready */
797812
int max_wait = 100; /* 1 second max */
798-
while (!g_executor_running && max_wait-- > 0) {
813+
while (!atomic_load(&g_executor_running) && max_wait-- > 0) {
799814
usleep(10000); /* 10ms */
800815
}
801816

802-
return g_executor_running ? 0 : -1;
817+
return atomic_load(&g_executor_running) ? 0 : -1;
803818
}
804819

805820
/**
806821
* Stop the executor thread.
807822
* Called during Python finalization.
808823
*/
809824
static void executor_stop(void) {
810-
if (!g_executor_running) {
825+
if (!atomic_load(&g_executor_running)) {
811826
return;
812827
}
813828

@@ -816,7 +831,7 @@ static void executor_stop(void) {
816831
request_init(&shutdown_req);
817832
shutdown_req.type = PY_REQ_SHUTDOWN;
818833

819-
g_executor_shutdown = true;
834+
atomic_store(&g_executor_shutdown, true);
820835
executor_enqueue(&shutdown_req);
821836
executor_wait(&shutdown_req);
822837
request_cleanup(&shutdown_req);
@@ -926,7 +941,7 @@ static void multi_executor_enqueue(int exec_id, py_request_t *req) {
926941
* Start the multi-executor pool.
927942
*/
928943
static int multi_executor_start(int num_executors) {
929-
if (g_multi_executor_initialized) {
944+
if (atomic_load(&g_multi_executor_initialized)) {
930945
return 0;
931946
}
932947

@@ -978,15 +993,15 @@ static int multi_executor_start(int num_executors) {
978993
}
979994
}
980995

981-
g_multi_executor_initialized = all_ready;
996+
atomic_store(&g_multi_executor_initialized, all_ready);
982997
return all_ready ? 0 : -1;
983998
}
984999

9851000
/**
9861001
* Stop the multi-executor pool.
9871002
*/
9881003
static void multi_executor_stop(void) {
989-
if (!g_multi_executor_initialized) {
1004+
if (!atomic_load(&g_multi_executor_initialized)) {
9901005
return;
9911006
}
9921007

@@ -1023,7 +1038,7 @@ static void multi_executor_stop(void) {
10231038
}
10241039
}
10251040

1026-
g_multi_executor_initialized = false;
1041+
atomic_store(&g_multi_executor_initialized, false);
10271042
}
10281043

10291044
/*

0 commit comments

Comments
 (0)