From 1eae64f6a3a8d6c100f333a8a8b3cc6acf56fb55 Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sat, 6 Jun 2026 17:27:19 -0600 Subject: [PATCH 1/3] test: k_sem corruption investigation snapshot (#21) Investigation artifacts, preserved in history before cleanup: - Stack-local sem + K_FOREVER repro harness (same-prio / prio-22 / timer-expiry givers, frame scribbling between cycles) - EXPERIMENT A: faithful reconstructions of the April 2026 crash shapes (embedded-sem timer struct, matched call depth, one-shot and periodic-repark variants). Run on ESP32-S3 with CONFIG_K_TIMER_DISPATCH_ISR=n (task-context ESP_TIMER_TASK giver, the original trigger context): 200/0 -- does not reproduce on the same IDF 5.4 silicon that crashed in April. - EXPERIMENT B (linux-only): zombie injection. Recreates the pre-#18 dangling-task-node state with raw FreeRTOS calls. Frame-reuse runs wedged the kernel at uxListRemove (list.c:209) writing through the poisoned pxNext -- lldb forensics showed a stack-local k_sem's vListInitialise pattern overwritten across the zombie TCB. With the region protected (compiler-proofed 8K pad; clang shrank a volatile pad to 2 bytes until given an escaping pointer + memset), the node stays intact and harmless: corruption requires reuse. Conclusion: the April corruption was the pre-#18 k_thread zombie defect (dangling xStateListItem in dead frames + reuse), fixed by PR #18. k_sem / StaticSemaphore_t exonerated. Closes the evidence for issue #21. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/main/test_k_sem.c | 275 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 274 insertions(+), 1 deletion(-) diff --git a/test/main/test_k_sem.c b/test/main/test_k_sem.c index 892c7e7..3cf8e53 100644 --- a/test/main/test_k_sem.c +++ b/test/main/test_k_sem.c @@ -3,9 +3,11 @@ * Copyright 2026 Intercreate */ +#include +#include + #include "unity.h" #include "zephyr/kernel.h" -#include static void test_sem_init_and_count(void) { @@ -111,8 +113,279 @@ static void test_sem_auto_init_pre_main(void) TEST_ASSERT_EQUAL(2, k_sem_count_get(&auto_sem)); } +/* ---------------------------------------------------------------- + * Stack-local sem + K_FOREVER + cross-priority give + * + * Repro harness for the latent corruption surfaced 2026-04-29 on + * ESP32-S3 (stack-local sem + K_FOREVER take + higher-priority giver + * corrupted the waiter's frame across the yield; static sems and + * same-priority givers were immune). Run EARLY so the rest of the + * suite acts as the delayed-corruption detector -- the original bug + * panicked unrelated later tests, not the trigger itself. + * ---------------------------------------------------------------- */ + +K_THREAD_STACK_DEFINE(sem_giver_stack, 4096); +static struct k_thread sem_giver_thread; +static struct k_sem *volatile giver_target; + +static void sem_giver_entry(void *p1, void *p2, void *p3) +{ + (void)p1; + (void)p2; + (void)p3; + k_msleep(30); + k_sem_give(giver_target); +} + +/* Each cycle uses a fresh stack frame: take on a STACK-LOCAL sem, + * given from a thread at @p giver_prio, then let the frame die and + * scribble over it via the next call. */ +static void stack_sem_forever_cycle(int giver_prio) +{ + struct k_sem sem; /* stack-local: the trigger */ + + TEST_ASSERT_EQUAL(0, k_sem_init(&sem, 0, 1)); + giver_target = &sem; + + k_thread_create(&sem_giver_thread, sem_giver_stack, K_THREAD_STACK_SIZEOF(sem_giver_stack), + sem_giver_entry, NULL, NULL, NULL, giver_prio, 0, K_NO_WAIT); + + TEST_ASSERT_EQUAL(0, k_sem_take(&sem, K_FOREVER)); + TEST_ASSERT_EQUAL(0, k_thread_join(&sem_giver_thread, K_SECONDS(2))); +} + +/* Burn the just-freed frame region so a dangling kernel reference into + * the dead sem shows up as corruption instead of silent luck. */ +static void scribble_stack(void) +{ + volatile uint8_t burn[sizeof(struct k_sem) * 4]; + + for (size_t i = 0; i < sizeof(burn); i++) { + burn[i] = 0xA5; + } +} + +static void test_sem_stack_forever_same_prio(void) +{ + for (int i = 0; i < 5; i++) { + stack_sem_forever_cycle(5); /* same prio as test task pool */ + scribble_stack(); + } +} + +static void test_sem_stack_forever_high_prio(void) +{ + for (int i = 0; i < 5; i++) { + stack_sem_forever_cycle(22); /* ESP_TIMER_TASK-class giver */ + scribble_stack(); + } +} + +/* Timer-expiry giver -- the original PR 4 shape: expiry runs on + * ESP_TIMER_TASK (HW) / the k_timer dispatcher (linux), both higher + * priority than the test task. */ +static void sem_give_expiry(struct k_timer *timer) +{ + k_sem_give((struct k_sem *)k_timer_user_data_get(timer)); +} + +static void test_sem_stack_forever_timer_giver(void) +{ + for (int i = 0; i < 5; i++) { + struct k_sem sem; /* stack-local */ + struct k_timer timer; + + TEST_ASSERT_EQUAL(0, k_sem_init(&sem, 0, 1)); + k_timer_init(&timer, sem_give_expiry, NULL); + k_timer_user_data_set(&timer, &sem); + k_timer_start(&timer, K_MSEC(30), K_NO_WAIT); + + TEST_ASSERT_EQUAL(0, k_sem_take(&sem, K_FOREVER)); + k_timer_stop(&timer); + scribble_stack(); + } +} + +/* ---------------------------------------------------------------- + * EXPERIMENT A: faithful reconstruction of the April 2026 crash + * shapes (devlog/2026-04-29_k_sem_stack_local_kforever_corruption.md). + * Requires CONFIG_K_TIMER_DISPATCH_ISR=n so expiry gives from + * ESP_TIMER_TASK in task context (prio ~22) -- the original giver. + * ---------------------------------------------------------------- */ + +/* PR-4's attempted k_timer shape: sem embedded next to the timer in + * ONE stack-local struct. */ +struct april_sync_timer { + struct k_timer timer; + struct k_sem sem; +}; + +static void april_sync_expiry(struct k_timer *timer) +{ + struct april_sync_timer *st = CONTAINER_OF(timer, struct april_sync_timer, timer); + + k_sem_give(&st->sem); +} + +/* Extra frame to match the original call depth + * (test -> k_timer_status_sync -> k_sem_take -> xQueueSemaphoreTake); + * window-spill behavior on Xtensa depends on it. */ +static __attribute__((noinline)) uint32_t april_fake_status_sync(struct april_sync_timer *st) +{ + TEST_ASSERT_EQUAL(0, k_sem_take(&st->sem, K_FOREVER)); + return 1; +} + +/* April crash #1: one-shot 100 ms, block until first expiry. */ +static void test_sem_april_oneshot_status_sync(void) +{ + struct april_sync_timer st; /* STACK -- the trigger */ + + k_timer_init(&st.timer, april_sync_expiry, NULL); + TEST_ASSERT_EQUAL(0, k_sem_init(&st.sem, 0, 10)); + + k_timer_start(&st.timer, K_MSEC(100), K_NO_WAIT); + TEST_ASSERT_EQUAL(1, april_fake_status_sync(&st)); + + k_timer_stop(&st.timer); + scribble_stack(); +} + +/* April crash #2 (already_fired shape): periodic 20 ms; first take is + * the fast path (count accumulated); second take BLOCKS with the next + * high-prio give landing milliseconds after the park -- the tightest + * timing window. */ +static void test_sem_april_periodic_status_sync(void) +{ + for (int i = 0; i < 5; i++) { + struct april_sync_timer st; /* STACK */ + + k_timer_init(&st.timer, april_sync_expiry, NULL); + TEST_ASSERT_EQUAL(0, k_sem_init(&st.sem, 0, 10)); + + k_timer_start(&st.timer, K_MSEC(20), K_MSEC(20)); + k_msleep(100); /* several expiries accumulate */ + + TEST_ASSERT_EQUAL(1, april_fake_status_sync(&st)); /* fast path */ + TEST_ASSERT_EQUAL(1, april_fake_status_sync(&st)); /* blocks ~20ms */ + + k_timer_stop(&st.timer); + scribble_stack(); + } +} + +#if CONFIG_IDF_TARGET_LINUX +/* ---------------------------------------------------------------- + * EXPERIMENT B: zombie-injection mechanism proof. + * + * Hypothesis (#21): the April 2026 corruption was the pre-#18 + * k_thread defect -- a parked task whose TCB (and thus its + * xStateListItem) lived in a DEAD stack frame. Any later insert into + * the same kernel list (e.g. a K_FOREVER sem take parking the caller + * on xSuspendedTaskList) makes the kernel WRITE through the dangling + * node into memory it doesn't own. + * + * This test recreates that zombie deliberately with raw FreeRTOS + * calls, snapshots the dead-frame TCB bytes (never writing them + * ourselves -- frame depths are controlled so no live local overlaps + * the region), parks a stack-sem waiter, and memcmp's. A delta is + * PROOF of the mechanism. A zero delta is INCONCLUSIVE (node + * adjacency in the shared suspended list isn't guaranteed), not a + * refutation. + * ---------------------------------------------------------------- */ + +static TaskHandle_t zombie_handle; +static uint8_t *volatile zombie_region; + +static void zombie_entry(void *arg) +{ + (void)arg; + vTaskSuspend(NULL); /* park forever: the pre-#18 zombie shape */ +} + +static StackType_t zombie_stack[2048 / sizeof(StackType_t)]; + +static __attribute__((noinline)) void make_zombie(void) +{ + StaticTask_t tcb; /* FUNCTION-LOCAL: becomes the dangling node */ + + zombie_handle = xTaskCreateStaticPinnedToCore(zombie_entry, "zombie", + sizeof(zombie_stack) / sizeof(StackType_t), + NULL, 5, zombie_stack, &tcb, 0); + TEST_ASSERT_NOT_NULL(zombie_handle); + zombie_region = (uint8_t *)&tcb; + k_msleep(20); /* let it reach the park */ + /* Return: tcb now dangles in this dead frame. Deliberate. */ +} + +/* Push the zombie's frame deeper than any later live frame so nothing + * in this test overwrites the region before we inspect it. The pad + * must exceed the deepest subsequent call chain -- on the POSIX port + * the waiter's blocking path (k_sem_take -> xQueueSemaphoreTake -> + * vPortYield -> event_wait -> pthread internals) runs on the same + * pthread stack, so be generous. A 512 B pad demonstrably was NOT + * enough: the waiter chain trampled the zombie TCB and vTaskDelete + * wedged in uxListRemove (which is itself the April mechanism, via + * frame reuse -- but the surgical kernel-write proof needs the region + * untouched by us). */ +static void *volatile pad_escape; /* defeats array shrinking/elision */ + +static __attribute__((noinline)) void call_deep(void (*fn)(void)) +{ + uint8_t pad[8192]; + + pad_escape = pad; /* address escapes: array must exist */ + memset(pad, 0x5A, sizeof(pad)); + __asm__ volatile("" ::"r"(pad) : "memory"); + fn(); + __asm__ volatile("" ::"r"(pad) : "memory"); /* alive across the call */ +} + +static void test_sem_zombie_injection_mechanism(void) +{ + static uint8_t snapshot[sizeof(StaticTask_t)]; + + call_deep(make_zombie); + memcpy(snapshot, zombie_region, sizeof(snapshot)); + + /* Park a stack-sem waiter: the suspended-list insert links our + * TCB adjacent to the dangling zombie node, writing into it. */ + stack_sem_forever_cycle(22); + + int delta = memcmp(snapshot, zombie_region, sizeof(snapshot)); + + /* Clean the list BEFORE asserting: delete-other reclaims + * synchronously on silicon and unlinks the dangling node. On the + * POSIX port teardown is idle-deferred and dereferences the TCB + * in this (still live) frame -- give idle its reap window before + * returning, mirroring k_thread_abort. */ + vTaskDelete(zombie_handle); + k_msleep(2 * portTICK_PERIOD_MS); + + if (delta == 0) { + TEST_MESSAGE("INCONCLUSIVE: no kernel write landed in the " + "zombie region this run (list adjacency not " + "guaranteed)"); + } else { + TEST_MESSAGE("MECHANISM PROVEN: kernel wrote through the " + "dangling suspended-list node into dead-frame " + "memory"); + } + TEST_PASS(); /* outcome is reported, not asserted -- see above */ +} +#endif /* CONFIG_IDF_TARGET_LINUX -- the 8K pad would overflow the S3 \ + * main task stack; the mechanism is proven on linux. */ + void test_k_sem_group(void) { + RUN_TEST(test_sem_stack_forever_same_prio); + RUN_TEST(test_sem_stack_forever_high_prio); + RUN_TEST(test_sem_stack_forever_timer_giver); + RUN_TEST(test_sem_april_oneshot_status_sync); + RUN_TEST(test_sem_april_periodic_status_sync); +#if CONFIG_IDF_TARGET_LINUX + RUN_TEST(test_sem_zombie_injection_mechanism); +#endif RUN_TEST(test_sem_init_and_count); RUN_TEST(test_sem_give_and_take); RUN_TEST(test_sem_take_timeout); From a8d14873acc94cc06a6514d8d228d8d5fc9911fb Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sat, 6 Jun 2026 17:30:26 -0600 Subject: [PATCH 2/3] test: keep #21 regression shapes; document the object-lifetime principle Strips the investigation-only zombie-injection experiment (preserved at 1eae64f -- it deliberately wedges the kernel when the hypothesis holds and its stack padding is compiler-fragile) and rewords the remaining tests as permanent regressions: three stack-local-sem K_FOREVER giver shapes plus the two faithful April 2026 corruption shapes, valid under either k_timer dispatch mode. CONFIG_K_TIMER_DISPATCH_ISR restored to y. Adds the #22 deliverable to the zkernel README: "Design principle: object lifetime must be Zephyr-shaped" -- every API ending an object's kernel involvement severs all kernel references before returning, with per-object status and the stack-allocation caveat for objects signaled from other contexts. Closes #21: the April corruption was the pre-#18 k_thread zombie defect (dangling task list nodes in dead frames poisoned by reuse), fixed by PR #18. k_sem exonerated. Evidence: faithful April shapes clean on the same IDF 5.4 silicon with the original task-context ESP_TIMER_TASK giver; zombie injection on linux reproduced the kernel fault on demand (uxListRemove writing through a poisoned dangling node) with lldb memory forensics; the protected-region control showed dangling nodes are latent until frame reuse. Closes #22 Co-Authored-By: Claude Opus 4.8 (1M context) --- components/zkernel/README.md | 31 +++++++++ test/main/test_k_sem.c | 130 ++++------------------------------- 2 files changed, 46 insertions(+), 115 deletions(-) diff --git a/components/zkernel/README.md b/components/zkernel/README.md index 0e446bf..79a0c78 100644 --- a/components/zkernel/README.md +++ b/components/zkernel/README.md @@ -2,6 +2,37 @@ Zephyr-compatible kernel primitives over FreeRTOS. All objects are statically allocated (no malloc). All blocking APIs handle `K_NO_WAIT`, `K_FOREVER`, and finite timeouts. ISR-safe variants are used automatically where applicable. +## Design principle: object lifetime must be Zephyr-shaped + +Upstream Zephyr kernel objects are caller-owned plain structs whose lifecycle +ends are *synchronous*: when `k_thread_abort`/`k_thread_join`/`k_timer_stop`/ +`k_work_cancel_sync` return, the kernel holds **zero references** into the +caller's memory — which is what makes stack allocation idiomatic in Zephyr +code. FreeRTOS assumes the opposite: long-lived objects, handle-based +ownership, asynchronous teardown (idle-task reaping), and kernel list nodes +threaded *through* control blocks with no "the kernel is done with this +memory" signal. + +Boreas embeds FreeRTOS control blocks (`StaticTask_t`, `StaticSemaphore_t`, +…) inside caller-owned Zephyr-shaped structs, importing Zephyr's memory model +— so every Boreas API that ends an object's kernel involvement MUST sever all +kernel references before returning (or document loudly where a target port +makes that impossible). Violations are not theoretical: a parked task whose +control block outlived its stack frame caused months of intermittent +scheduler corruption (issues #18, #21 — kernel list operations writing +through dangling nodes into reused frames). New APIs must be designed against +this principle, and any divergence belongs in an `@note` on the declaration. + +Current status: `k_thread` severs synchronously on silicon and documents the +linux best-effort window; `k_work`/`k_work_sync` unlink synchronously via +their own state machine; the k_timer linux backend dequeues synchronously on +stop; `k_sem`/`k_mutex`/`k_msgq`/`k_event` waiters are unlinked by FreeRTOS +before the blocking call returns (a *blocked* caller's frame is necessarily +live, and give/set/put paths finish with the control block before the waiter +resumes — but do not let a stack-allocated object's frame die while another +context may still be inside a give/send on it; prefer static storage for +objects signaled from other contexts). + ## Headers | Header | Contents | diff --git a/test/main/test_k_sem.c b/test/main/test_k_sem.c index 3cf8e53..a4a2468 100644 --- a/test/main/test_k_sem.c +++ b/test/main/test_k_sem.c @@ -116,12 +116,11 @@ static void test_sem_auto_init_pre_main(void) /* ---------------------------------------------------------------- * Stack-local sem + K_FOREVER + cross-priority give * - * Repro harness for the latent corruption surfaced 2026-04-29 on - * ESP32-S3 (stack-local sem + K_FOREVER take + higher-priority giver - * corrupted the waiter's frame across the yield; static sems and - * same-priority givers were immune). Run EARLY so the rest of the - * suite acts as the delayed-corruption detector -- the original bug - * panicked unrelated later tests, not the trigger itself. + * Regression harness for the corruption surfaced 2026-04-29 on + * ESP32-S3 (root-caused to the pre-#18 k_thread zombie defect, issue + * #21 -- see the April-shapes block below). Run EARLY so the rest of + * the suite acts as the delayed-corruption detector: the original + * bug panicked unrelated later tests, not the trigger itself. * ---------------------------------------------------------------- */ K_THREAD_STACK_DEFINE(sem_giver_stack, 4096); @@ -207,10 +206,16 @@ static void test_sem_stack_forever_timer_giver(void) } /* ---------------------------------------------------------------- - * EXPERIMENT A: faithful reconstruction of the April 2026 crash - * shapes (devlog/2026-04-29_k_sem_stack_local_kforever_corruption.md). - * Requires CONFIG_K_TIMER_DISPATCH_ISR=n so expiry gives from - * ESP_TIMER_TASK in task context (prio ~22) -- the original giver. + * Regression: the April 2026 corruption shapes. + * + * Stack-local timer+sem structs blocking on K_FOREVER takes, given + * from the timer expiry context. Root cause was NOT k_sem: parked + * tasks from the pre-#18 k_thread lifecycle left dangling + * xStateListItem nodes in dead stack frames; frame reuse poisoned + * them and later kernel list operations corrupted memory (proven by + * injection on the linux target, issue #21). Fixed by #18. These + * shapes reproduce the original trigger and must stay green under + * either k_timer dispatch mode. * ---------------------------------------------------------------- */ /* PR-4's attempted k_timer shape: sem embedded next to the timer in @@ -274,108 +279,6 @@ static void test_sem_april_periodic_status_sync(void) } } -#if CONFIG_IDF_TARGET_LINUX -/* ---------------------------------------------------------------- - * EXPERIMENT B: zombie-injection mechanism proof. - * - * Hypothesis (#21): the April 2026 corruption was the pre-#18 - * k_thread defect -- a parked task whose TCB (and thus its - * xStateListItem) lived in a DEAD stack frame. Any later insert into - * the same kernel list (e.g. a K_FOREVER sem take parking the caller - * on xSuspendedTaskList) makes the kernel WRITE through the dangling - * node into memory it doesn't own. - * - * This test recreates that zombie deliberately with raw FreeRTOS - * calls, snapshots the dead-frame TCB bytes (never writing them - * ourselves -- frame depths are controlled so no live local overlaps - * the region), parks a stack-sem waiter, and memcmp's. A delta is - * PROOF of the mechanism. A zero delta is INCONCLUSIVE (node - * adjacency in the shared suspended list isn't guaranteed), not a - * refutation. - * ---------------------------------------------------------------- */ - -static TaskHandle_t zombie_handle; -static uint8_t *volatile zombie_region; - -static void zombie_entry(void *arg) -{ - (void)arg; - vTaskSuspend(NULL); /* park forever: the pre-#18 zombie shape */ -} - -static StackType_t zombie_stack[2048 / sizeof(StackType_t)]; - -static __attribute__((noinline)) void make_zombie(void) -{ - StaticTask_t tcb; /* FUNCTION-LOCAL: becomes the dangling node */ - - zombie_handle = xTaskCreateStaticPinnedToCore(zombie_entry, "zombie", - sizeof(zombie_stack) / sizeof(StackType_t), - NULL, 5, zombie_stack, &tcb, 0); - TEST_ASSERT_NOT_NULL(zombie_handle); - zombie_region = (uint8_t *)&tcb; - k_msleep(20); /* let it reach the park */ - /* Return: tcb now dangles in this dead frame. Deliberate. */ -} - -/* Push the zombie's frame deeper than any later live frame so nothing - * in this test overwrites the region before we inspect it. The pad - * must exceed the deepest subsequent call chain -- on the POSIX port - * the waiter's blocking path (k_sem_take -> xQueueSemaphoreTake -> - * vPortYield -> event_wait -> pthread internals) runs on the same - * pthread stack, so be generous. A 512 B pad demonstrably was NOT - * enough: the waiter chain trampled the zombie TCB and vTaskDelete - * wedged in uxListRemove (which is itself the April mechanism, via - * frame reuse -- but the surgical kernel-write proof needs the region - * untouched by us). */ -static void *volatile pad_escape; /* defeats array shrinking/elision */ - -static __attribute__((noinline)) void call_deep(void (*fn)(void)) -{ - uint8_t pad[8192]; - - pad_escape = pad; /* address escapes: array must exist */ - memset(pad, 0x5A, sizeof(pad)); - __asm__ volatile("" ::"r"(pad) : "memory"); - fn(); - __asm__ volatile("" ::"r"(pad) : "memory"); /* alive across the call */ -} - -static void test_sem_zombie_injection_mechanism(void) -{ - static uint8_t snapshot[sizeof(StaticTask_t)]; - - call_deep(make_zombie); - memcpy(snapshot, zombie_region, sizeof(snapshot)); - - /* Park a stack-sem waiter: the suspended-list insert links our - * TCB adjacent to the dangling zombie node, writing into it. */ - stack_sem_forever_cycle(22); - - int delta = memcmp(snapshot, zombie_region, sizeof(snapshot)); - - /* Clean the list BEFORE asserting: delete-other reclaims - * synchronously on silicon and unlinks the dangling node. On the - * POSIX port teardown is idle-deferred and dereferences the TCB - * in this (still live) frame -- give idle its reap window before - * returning, mirroring k_thread_abort. */ - vTaskDelete(zombie_handle); - k_msleep(2 * portTICK_PERIOD_MS); - - if (delta == 0) { - TEST_MESSAGE("INCONCLUSIVE: no kernel write landed in the " - "zombie region this run (list adjacency not " - "guaranteed)"); - } else { - TEST_MESSAGE("MECHANISM PROVEN: kernel wrote through the " - "dangling suspended-list node into dead-frame " - "memory"); - } - TEST_PASS(); /* outcome is reported, not asserted -- see above */ -} -#endif /* CONFIG_IDF_TARGET_LINUX -- the 8K pad would overflow the S3 \ - * main task stack; the mechanism is proven on linux. */ - void test_k_sem_group(void) { RUN_TEST(test_sem_stack_forever_same_prio); @@ -383,9 +286,6 @@ void test_k_sem_group(void) RUN_TEST(test_sem_stack_forever_timer_giver); RUN_TEST(test_sem_april_oneshot_status_sync); RUN_TEST(test_sem_april_periodic_status_sync); -#if CONFIG_IDF_TARGET_LINUX - RUN_TEST(test_sem_zombie_injection_mechanism); -#endif RUN_TEST(test_sem_init_and_count); RUN_TEST(test_sem_give_and_take); RUN_TEST(test_sem_take_timeout); From 64d7aac1da0c9a84c487cc9d70cd486f861c99c2 Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sat, 6 Jun 2026 18:57:49 -0600 Subject: [PATCH 3/3] fix: address PR #39 review feedback - Drop the include orphaned by the zombie-test removal. - Pass the target sem to the giver thread via p1 instead of a shared global. - README precision: the control-block updates that wake a waiter complete before the waiter runs, but the giving context may be preempted by the woken waiter before its call returns and can still touch the control block afterwards (especially under SMP) -- which is exactly why the stack-allocation caveat exists. Co-Authored-By: Claude Opus 4.8 (1M context) --- components/zkernel/README.md | 10 ++++++---- test/main/test_k_sem.c | 8 ++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/components/zkernel/README.md b/components/zkernel/README.md index 79a0c78..4a0e3ee 100644 --- a/components/zkernel/README.md +++ b/components/zkernel/README.md @@ -28,10 +28,12 @@ linux best-effort window; `k_work`/`k_work_sync` unlink synchronously via their own state machine; the k_timer linux backend dequeues synchronously on stop; `k_sem`/`k_mutex`/`k_msgq`/`k_event` waiters are unlinked by FreeRTOS before the blocking call returns (a *blocked* caller's frame is necessarily -live, and give/set/put paths finish with the control block before the waiter -resumes — but do not let a stack-allocated object's frame die while another -context may still be inside a give/send on it; prefer static storage for -objects signaled from other contexts). +live, and the control-block updates that wake a waiter complete before the +waiter runs — but the giving/sending context may be preempted by the woken +waiter before its own call returns, and can still touch the control block +afterwards, especially under SMP. Do not let a stack-allocated object's +frame die while another context may still be inside a give/send on it; +prefer static storage for objects signaled from other contexts). ## Headers diff --git a/test/main/test_k_sem.c b/test/main/test_k_sem.c index a4a2468..9adccca 100644 --- a/test/main/test_k_sem.c +++ b/test/main/test_k_sem.c @@ -4,7 +4,6 @@ */ #include -#include #include "unity.h" #include "zephyr/kernel.h" @@ -125,15 +124,13 @@ static void test_sem_auto_init_pre_main(void) K_THREAD_STACK_DEFINE(sem_giver_stack, 4096); static struct k_thread sem_giver_thread; -static struct k_sem *volatile giver_target; static void sem_giver_entry(void *p1, void *p2, void *p3) { - (void)p1; (void)p2; (void)p3; k_msleep(30); - k_sem_give(giver_target); + k_sem_give((struct k_sem *)p1); } /* Each cycle uses a fresh stack frame: take on a STACK-LOCAL sem, @@ -144,10 +141,9 @@ static void stack_sem_forever_cycle(int giver_prio) struct k_sem sem; /* stack-local: the trigger */ TEST_ASSERT_EQUAL(0, k_sem_init(&sem, 0, 1)); - giver_target = &sem; k_thread_create(&sem_giver_thread, sem_giver_stack, K_THREAD_STACK_SIZEOF(sem_giver_stack), - sem_giver_entry, NULL, NULL, NULL, giver_prio, 0, K_NO_WAIT); + sem_giver_entry, &sem, NULL, NULL, giver_prio, 0, K_NO_WAIT); TEST_ASSERT_EQUAL(0, k_sem_take(&sem, K_FOREVER)); TEST_ASSERT_EQUAL(0, k_thread_join(&sem_giver_thread, K_SECONDS(2)));