From 4422bc5ea57fd588236e9cde3af6b886347cf011 Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sun, 7 Jun 2026 12:11:12 -0600 Subject: [PATCH 1/3] fix: k_timer_status_sync blocks on an embedded sem, not a poll loop Rewrites k_timer_status_sync (both backends) from the k_msleep(1) busy-poll to a true blocking wait on a binary k_sem embedded in struct k_timer -- the original PR-4 design, now safe: the April crash shapes that killed it were root-caused to the pre-#18 k_thread zombies (#21) and are green regression tests, and the sem itself is notification-backed (#41) with synchronous severance. Upstream-verified semantics (kernel/timer.c): - single-waiter model (upstream's wait_q holds "the (single) thread waiting on this timer"; expiry/stop wake exactly one) -> a binary sem latch matches exactly - woken by expiry OR stop; status re-read after wake; status reset to 0 on every path; returns 0 immediately when stopped - the wake fires AFTER the expiry callback / stop_fn (upstream order) The sem is a wake LATCH, not a counter -- the expiry count stays in timer->status. A give latched while nobody waited would satisfy a later take early, so status_sync re-checks and re-blocks in a loop; pinned by the new test_timer_status_sync_after_status_get_blocks regression. Wakes are now immediate instead of quantized to the FreeRTOS tick; the busy-poll divergence note on the declaration is replaced. k_sem_give is ISR-safe (IRAM), so the expiry-path give works under CONFIG_K_TIMER_DISPATCH_ISR=y, the ESP_TIMER_TASK fallback, and the linux dispatcher. K_TIMER_DEFINE gains the compile-time sem initializer; k_work_init_delayable inherits the sem init via k_timer_init. struct k_timer grows by sizeof(struct k_sem) (~24 B), including inside k_work_delayable. k_timer_remaining_get needed no change: it already returns uint32_t milliseconds like upstream's k_ticks_to_ms_floor32 shape. linux suite: 206/0 x3 (204 + 2). Closes #28 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../zkernel/include/boreas/zephyr/kernel.h | 22 ++++++---- components/zkernel/src/k_timer.c | 40 +++++++++++++----- components/zkernel/src/k_timer_linux.c | 26 +++++++++--- test/main/test_k_timer.c | 42 +++++++++++++++++++ 4 files changed, 105 insertions(+), 25 deletions(-) diff --git a/components/zkernel/include/boreas/zephyr/kernel.h b/components/zkernel/include/boreas/zephyr/kernel.h index d489852..be7c0f6 100644 --- a/components/zkernel/include/boreas/zephyr/kernel.h +++ b/components/zkernel/include/boreas/zephyr/kernel.h @@ -415,6 +415,10 @@ struct k_timer { k_timer_stop_t stop_fn; void *user_data; uint32_t status; /* expiry count since last status read */ + /* status_sync wake latch (binary sem, upstream's single-waiter + * model): given after each expiry and by k_timer_stop. The COUNT + * lives in `status`; the sem only carries the wake. */ + struct k_sem sync_sem; bool running; bool is_periodic; /* last k_timer_start was periodic; one-shot if false */ bool first_interval_pending; /* start-once then switch to periodic */ @@ -429,6 +433,10 @@ struct k_timer { struct k_timer name = { \ .expiry_fn = (_expiry_fn), \ .stop_fn = (_stop_fn), \ + .sync_sem = {.count = 0, \ + .limit = 1, \ + .waiters = SYS_DLIST_STATIC_INIT(&name.sync_sem.waiters), \ + .lock = portMUX_INITIALIZER_UNLOCKED}, \ } void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_stop_t stop_fn); @@ -460,15 +468,13 @@ uint32_t k_timer_status_get(struct k_timer *timer); * Returns immediately if the count is already non-zero, or if the * timer is already stopped. * - * @note Must not be called from an interrupt handler (blocks). + * @note Must not be called from an interrupt handler (blocks) -- + * matches upstream. * - * @note Boreas implementation polls every k_msleep(1). Upstream - * Zephyr blocks the calling thread on a wait queue. Same - * caller-visible semantics; the wake granularity here is - * bounded by the FreeRTOS tick period (portTICK_PERIOD_MS, - * set by CONFIG_FREERTOS_HZ -- 1ms at the Boreas default of - * 1000 Hz, 10ms at the ESP-IDF Kconfig default of 100 Hz). - * A wake can therefore be up to one tick period late. + * @note Blocks on the timer's embedded semaphore, woken by expiry or + * k_timer_stop (upstream's single-waiter wait-queue model: one + * thread waits per timer). Like upstream, a thread blocked here + * must not be aborted (see the @note on k_sem_take). */ uint32_t k_timer_status_sync(struct k_timer *timer); diff --git a/components/zkernel/src/k_timer.c b/components/zkernel/src/k_timer.c index ba85dc3..14a845a 100644 --- a/components/zkernel/src/k_timer.c +++ b/components/zkernel/src/k_timer.c @@ -40,6 +40,11 @@ static void K_ISR_SAFE k_timer_esp_callback(void *arg) if (!__atomic_load_n(&timer->is_periodic, __ATOMIC_ACQUIRE)) { __atomic_store_n(&timer->running, false, __ATOMIC_RELEASE); } + + /* Wake a status_sync waiter LAST (upstream wakes at the end of + * its expiration handler, after the expiry callback). ISR-safe + * under both dispatch modes. */ + k_sem_give(&timer->sync_sem); } void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_stop_t stop_fn) @@ -53,6 +58,9 @@ void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_sto timer->first_interval_pending = false; timer->period_us = 0; timer->handle = NULL; + /* Binary: the sem is the status_sync wake latch, not a counter + * (the expiry count lives in timer->status). */ + (void)k_sem_init(&timer->sync_sem, 0, 1); const esp_timer_create_args_t args = { .callback = k_timer_esp_callback, @@ -133,6 +141,9 @@ void k_timer_stop(struct k_timer *timer) if (timer->stop_fn) { timer->stop_fn(timer); } + /* Wake a status_sync waiter (upstream unpends one waiter on + * stop, after invoking the stop function). */ + k_sem_give(&timer->sync_sem); } } @@ -143,18 +154,25 @@ uint32_t k_timer_status_get(struct k_timer *timer) uint32_t k_timer_status_sync(struct k_timer *timer) { - /* Block until the timer expires or is stopped. Polls every - * k_msleep(1), which rounds up to one FreeRTOS tick - * (portTICK_PERIOD_MS, set by CONFIG_FREERTOS_HZ -- 1ms at the - * Boreas default of 1000 Hz, 10ms at the ESP-IDF Kconfig default - * of 100 Hz). Caller-visible semantics match upstream Zephyr's - * wait-queue version (block until expiry or stop, return count - * atomically); the divergence is the polling implementation. */ - while (__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE) && - __atomic_load_n(&timer->status, __ATOMIC_ACQUIRE) == 0) { - k_msleep(1); + /* Upstream shape: return the accumulated count immediately if + * non-zero, or 0 if the timer is stopped; otherwise block until + * expiry or stop. Blocking rides the embedded sync sem, which + * the expiry path and k_timer_stop give (k_sem_give is ISR-safe, + * so this works under both dispatch modes). The sem is a wake + * LATCH, not a counter: a give latched while no waiter was + * blocked makes a later take return early, so re-check and + * re-block in a loop. */ + for (;;) { + uint32_t result = __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); + + if (result > 0) { + return result; + } + if (!__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE)) { + return 0; + } + (void)k_sem_take(&timer->sync_sem, K_FOREVER); } - return __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); } uint32_t k_timer_remaining_get(struct k_timer *timer) diff --git a/components/zkernel/src/k_timer_linux.c b/components/zkernel/src/k_timer_linux.c index 426ee67..54efc62 100644 --- a/components/zkernel/src/k_timer_linux.c +++ b/components/zkernel/src/k_timer_linux.c @@ -110,6 +110,8 @@ static void z_timer_dispatcher(void *arg) if (!__atomic_load_n(&due->is_periodic, __ATOMIC_ACQUIRE)) { __atomic_store_n(&due->running, false, __ATOMIC_RELEASE); } + /* Wake a status_sync waiter LAST (see k_timer.c). */ + k_sem_give(&due->sync_sem); continue; /* more timers may already be due */ } @@ -164,6 +166,9 @@ void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_sto timer->first_interval_pending = false; /* unused on linux; deadline math is direct */ timer->period_us = 0; timer->deadline_us = 0; + /* Binary: the sem is the status_sync wake latch, not a counter + * (the expiry count lives in timer->status). */ + (void)k_sem_init(&timer->sync_sem, 0, 1); timer->node.next = NULL; timer->node.prev = NULL; /* No esp_timer exists on linux; `handle` serves only as the shared @@ -214,6 +219,9 @@ void k_timer_stop(struct k_timer *timer) if (timer->stop_fn) { timer->stop_fn(timer); } + /* Wake a status_sync waiter (upstream unpends one waiter on + * stop, after invoking the stop function). */ + k_sem_give(&timer->sync_sem); } uint32_t k_timer_status_get(struct k_timer *timer) @@ -223,13 +231,19 @@ uint32_t k_timer_status_get(struct k_timer *timer) uint32_t k_timer_status_sync(struct k_timer *timer) { - /* Same poll-based implementation as the hardware backend in - * k_timer.c -- see the divergence note there. */ - while (__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE) && - __atomic_load_n(&timer->status, __ATOMIC_ACQUIRE) == 0) { - k_msleep(1); + /* Same sem-backed implementation as the hardware backend -- see + * the rationale in k_timer.c. */ + for (;;) { + uint32_t result = __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); + + if (result > 0) { + return result; + } + if (!__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE)) { + return 0; + } + (void)k_sem_take(&timer->sync_sem, K_FOREVER); } - return __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); } /* Locked snapshot of deadline_us: the dispatcher re-arms periodic timers diff --git a/test/main/test_k_timer.c b/test/main/test_k_timer.c index 9d2485c..bd215f5 100644 --- a/test/main/test_k_timer.c +++ b/test/main/test_k_timer.c @@ -196,6 +196,46 @@ static void test_timer_status_sync_wakes_on_stop(void) k_thread_join(&waker_thread, K_FOREVER); } +/* #28 regression: the status_sync wake rides a binary sem LATCH, and a + * latched give from an expiry nobody was waiting on must not satisfy a + * later status_sync early -- after status_get drains the count, sync + * must block until the NEXT expiry (the re-check loop in + * k_timer_status_sync is what absorbs the stale latch). */ +static void test_timer_status_sync_after_status_get_blocks(void) +{ + struct k_timer timer; + k_timer_init(&timer, test_timer_cb, NULL); + + /* Expire at least once with NO waiter: the latch is left set. */ + k_timer_start(&timer, K_MSEC(50), K_MSEC(50)); + k_msleep(75); /* mid-period: one expiry has landed */ + TEST_ASSERT_GREATER_OR_EQUAL(1, k_timer_status_get(&timer)); + + uint32_t t0 = (uint32_t)k_uptime_get(); + uint32_t status = k_timer_status_sync(&timer); + uint32_t elapsed = (uint32_t)k_uptime_get() - t0; + + TEST_ASSERT_GREATER_OR_EQUAL(1, status); + /* Blocked for ~the rest of the period -- an immediate return + * (stale latch honored) would land well under 10 ms. */ + TEST_ASSERT_GREATER_OR_EQUAL(10, elapsed); + TEST_ASSERT_LESS_THAN(200, elapsed); + + k_timer_stop(&timer); +} + +static void test_timer_status_sync_stopped_returns_immediately(void) +{ + struct k_timer timer; + k_timer_init(&timer, test_timer_cb, NULL); + + /* Upstream: a stopped/never-started timer with status 0 returns 0 + * without blocking. */ + uint32_t t0 = (uint32_t)k_uptime_get(); + TEST_ASSERT_EQUAL(0, k_timer_status_sync(&timer)); + TEST_ASSERT_LESS_THAN(10, (uint32_t)k_uptime_get() - t0); +} + static void test_timer_remaining_ticks(void) { struct k_timer timer; @@ -455,6 +495,8 @@ void test_k_timer_group(void) RUN_TEST(test_timer_status_sync_blocking); RUN_TEST(test_timer_status_sync_already_fired); RUN_TEST(test_timer_status_sync_wakes_on_stop); + RUN_TEST(test_timer_status_sync_after_status_get_blocks); + RUN_TEST(test_timer_status_sync_stopped_returns_immediately); RUN_TEST(test_timer_remaining_ticks); RUN_TEST(test_timer_expires_ticks); RUN_TEST(test_timer_one_shot_clears_running); From 9e26abf84f885cc63d863407830eb5d69c40f476 Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sun, 7 Jun 2026 12:16:31 -0600 Subject: [PATCH 2/3] fix: status_sync review feedback -- close one-shot window, harden test Review fan-out findings folded: - One-shot exchange-vs-running window closed: between status_sync's status exchange and its running load, a one-shot expiry could complete entirely (status=1, running=false, give latched) and the waiter returned 0 where upstream's spinlocked read returns 1 -- terminally lost for a one-shot in a while(status_sync()) drain loop. The !running path now re-reads status instead of returning 0; the callback orders status++ (RELEASE) before running=false (RELEASE), so an ACQUIRE load of running==false is guaranteed to observe the increment. - test_timer_status_sync_after_status_get_blocks made deterministic: one-shot expiries only and t0 captured BEFORE the restart, so the elapsed lower bound holds under arbitrary host stall (the periodic re-arm racing the entry was CI-flaky both directions). - @note on k_timer_start: restart does not wake a blocked status_sync waiter (upstream parity). - @note on k_timer_init: re-init while a status_sync waiter is blocked is caller error (clobbers the embedded sem's waiter list). - Z_SEM_INITIALIZER single-sources the compile-time k_sem initializer body for K_SEM_DEFINE and K_TIMER_DEFINE. Adversarial review refuted (no change needed): zeroed-sem gives are structurally unreachable before k_timer_init (running-guard + handle-guard); stale latches cost at most one extra loop iteration (binary sem caps at 1, no busy-spin); SMP ordering is sound (RELEASE-before-give, portMUX barriers, aligned 32-bit status). linux suite: 206/0 x3. Refs #28 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../zkernel/include/boreas/zephyr/kernel.h | 33 +++++++++++++------ components/zkernel/src/k_timer.c | 9 ++++- components/zkernel/src/k_timer_linux.c | 9 ++++- test/main/test_k_timer.c | 26 +++++++++------ 4 files changed, 55 insertions(+), 22 deletions(-) diff --git a/components/zkernel/include/boreas/zephyr/kernel.h b/components/zkernel/include/boreas/zephyr/kernel.h index be7c0f6..c15e5a0 100644 --- a/components/zkernel/include/boreas/zephyr/kernel.h +++ b/components/zkernel/include/boreas/zephyr/kernel.h @@ -182,18 +182,24 @@ struct k_sem { portMUX_TYPE lock; }; +/* Compile-time k_sem initializer body. @p self must be the sem being + * initialized (the waiters list is self-referential). Single source + * for K_SEM_DEFINE and embedded sems (K_TIMER_DEFINE). */ +#define Z_SEM_INITIALIZER(self, _initial, _limit) \ + { \ + .count = (_initial), \ + .limit = (_limit), \ + .waiters = SYS_DLIST_STATIC_INIT(&(self).waiters), \ + .lock = portMUX_INITIALIZER_UNLOCKED, \ + } + /** * Statically define a fully-initialized semaphore. True compile-time * initializer (matches upstream Zephyr): usable from constructors and * SYS_INIT callbacks without any runtime init step. */ #define K_SEM_DEFINE(name, _initial, _limit) \ - struct k_sem name = { \ - .count = (_initial), \ - .limit = (_limit), \ - .waiters = SYS_DLIST_STATIC_INIT(&name.waiters), \ - .lock = portMUX_INITIALIZER_UNLOCKED, \ - }; \ + struct k_sem name = Z_SEM_INITIALIZER(name, _initial, _limit); \ BUILD_ASSERT(((_limit) != 0) && ((_initial) <= (_limit)), \ "K_SEM_DEFINE: limit must be nonzero and >= initial") /* upstream parity */ @@ -433,12 +439,15 @@ struct k_timer { struct k_timer name = { \ .expiry_fn = (_expiry_fn), \ .stop_fn = (_stop_fn), \ - .sync_sem = {.count = 0, \ - .limit = 1, \ - .waiters = SYS_DLIST_STATIC_INIT(&name.sync_sem.waiters), \ - .lock = portMUX_INITIALIZER_UNLOCKED}, \ + .sync_sem = Z_SEM_INITIALIZER(name.sync_sem, 0, 1), \ } +/** + * @note Re-initializing a timer while a thread is blocked in + * k_timer_status_sync on it is a caller error (it clobbers the + * embedded semaphore's waiter list) -- the same reuse rule as + * re-initializing a k_sem with a blocked taker. + */ void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_stop_t stop_fn); /** * @brief Start or restart a timer. @@ -453,6 +462,10 @@ void k_timer_init(struct k_timer *timer, k_timer_expiry_t expiry_fn, k_timer_sto * synchronize with an in-flight ISR callback on SMP targets. * On linux, k_timer_stop likewise does not synchronize with a * callback the dispatcher task has already dequeued. + * + * @note Restarting does NOT wake a thread blocked in + * k_timer_status_sync -- it keeps waiting for the restarted + * timer's first expiry (or a stop), matching upstream. */ void k_timer_start(struct k_timer *timer, k_timeout_t duration, k_timeout_t period); void k_timer_stop(struct k_timer *timer); diff --git a/components/zkernel/src/k_timer.c b/components/zkernel/src/k_timer.c index 14a845a..1f858d8 100644 --- a/components/zkernel/src/k_timer.c +++ b/components/zkernel/src/k_timer.c @@ -169,7 +169,14 @@ uint32_t k_timer_status_sync(struct k_timer *timer) return result; } if (!__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE)) { - return 0; + /* Re-read, don't return 0: an expiry (or a stop after + * an expiry) may have completed between the exchange + * above and the running load. The callback bumps + * status (RELEASE) before clearing running (RELEASE), + * so a running==false ACQUIRE load is guaranteed to + * observe that increment -- upstream returns the + * count here, never drops it. */ + return __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); } (void)k_sem_take(&timer->sync_sem, K_FOREVER); } diff --git a/components/zkernel/src/k_timer_linux.c b/components/zkernel/src/k_timer_linux.c index 54efc62..e7153f4 100644 --- a/components/zkernel/src/k_timer_linux.c +++ b/components/zkernel/src/k_timer_linux.c @@ -240,7 +240,14 @@ uint32_t k_timer_status_sync(struct k_timer *timer) return result; } if (!__atomic_load_n(&timer->running, __ATOMIC_ACQUIRE)) { - return 0; + /* Re-read, don't return 0: an expiry (or a stop after + * an expiry) may have completed between the exchange + * above and the running load. The callback bumps + * status (RELEASE) before clearing running (RELEASE), + * so a running==false ACQUIRE load is guaranteed to + * observe that increment -- upstream returns the + * count here, never drops it. */ + return __atomic_exchange_n(&timer->status, 0, __ATOMIC_ACQ_REL); } (void)k_sem_take(&timer->sync_sem, K_FOREVER); } diff --git a/test/main/test_k_timer.c b/test/main/test_k_timer.c index bd215f5..21af2f5 100644 --- a/test/main/test_k_timer.c +++ b/test/main/test_k_timer.c @@ -200,26 +200,32 @@ static void test_timer_status_sync_wakes_on_stop(void) * latched give from an expiry nobody was waiting on must not satisfy a * later status_sync early -- after status_get drains the count, sync * must block until the NEXT expiry (the re-check loop in - * k_timer_status_sync is what absorbs the stale latch). */ + * k_timer_status_sync is what absorbs the stale latch). + * + * Deterministic shape (review): one-shot expiries only, and t0 is + * captured BEFORE the restart -- the new expiry cannot fire earlier + * than start+60, so elapsed >= ~60 holds no matter how the host + * stalls the test thread (a periodic re-arm racing the entry was + * flaky under CI load). A buggy implementation that honors the stale + * latch returns 0 with elapsed ~0. */ static void test_timer_status_sync_after_status_get_blocks(void) { struct k_timer timer; k_timer_init(&timer, test_timer_cb, NULL); - /* Expire at least once with NO waiter: the latch is left set. */ - k_timer_start(&timer, K_MSEC(50), K_MSEC(50)); - k_msleep(75); /* mid-period: one expiry has landed */ - TEST_ASSERT_GREATER_OR_EQUAL(1, k_timer_status_get(&timer)); + /* One-shot expires with NO waiter: the latch is left set. */ + k_timer_start(&timer, K_MSEC(20), K_NO_WAIT); + k_msleep(40); + TEST_ASSERT_EQUAL(1, k_timer_status_get(&timer)); + /* Restart (start does not drain the latch), then sync. */ uint32_t t0 = (uint32_t)k_uptime_get(); + k_timer_start(&timer, K_MSEC(60), K_NO_WAIT); uint32_t status = k_timer_status_sync(&timer); uint32_t elapsed = (uint32_t)k_uptime_get() - t0; - TEST_ASSERT_GREATER_OR_EQUAL(1, status); - /* Blocked for ~the rest of the period -- an immediate return - * (stale latch honored) would land well under 10 ms. */ - TEST_ASSERT_GREATER_OR_EQUAL(10, elapsed); - TEST_ASSERT_LESS_THAN(200, elapsed); + TEST_ASSERT_EQUAL(1, status); + TEST_ASSERT_GREATER_OR_EQUAL(50, elapsed); /* blocked until the NEW expiry */ k_timer_stop(&timer); } From aafc7b1b326d7e92ade919244e4198ca55941034 Mon Sep 17 00:00:00 2001 From: swoiszwillo Date: Sun, 7 Jun 2026 12:58:34 -0600 Subject: [PATCH 3/3] docs: mark k_timer_status_sync no-abort note as a divergence (Copilot) Upstream unpends aborted threads from the timer wait queue; Boreas cannot, so this restriction is a divergence, not parity -- matching the wording already on k_sem_take. Co-Authored-By: Claude Opus 4.8 (1M context) --- components/zkernel/include/boreas/zephyr/kernel.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/zkernel/include/boreas/zephyr/kernel.h b/components/zkernel/include/boreas/zephyr/kernel.h index c15e5a0..60b23a6 100644 --- a/components/zkernel/include/boreas/zephyr/kernel.h +++ b/components/zkernel/include/boreas/zephyr/kernel.h @@ -486,8 +486,11 @@ uint32_t k_timer_status_get(struct k_timer *timer); * * @note Blocks on the timer's embedded semaphore, woken by expiry or * k_timer_stop (upstream's single-waiter wait-queue model: one - * thread waits per timer). Like upstream, a thread blocked here - * must not be aborted (see the @note on k_sem_take). + * thread waits per timer). + * @note Divergence: a thread blocked here must NOT be aborted. + * Upstream unpends an aborted thread from the timer wait queue; + * Boreas cannot, so the dead thread would leave a dangling + * waiter node (same limitation as k_sem_take -- see its @note). */ uint32_t k_timer_status_sync(struct k_timer *timer);