From 946f946819924cc29ad83e5253cb5c20ce3f4ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Chauvat=20=28uncaney=29?= Date: Wed, 6 May 2026 14:32:39 +0200 Subject: [PATCH] rec: add watchdog to detect tlog_rec_transfer spin loop after child exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the wrapped child has exited but buffered packets remain in the position state machine, the main loop in tlog_rec_transfer can spin indefinitely in tlog_pkt_pos_cmp / tlog_sink_write without making any syscalls. The kernel sees pure compute and cannot intervene. Reproduced in production on Ubuntu 24.04 (tlog 14): a tlog-recorded shell running `find / | head -5` left orphan tlog-rec-session processes burning 99.6% CPU forever (verified via strace -c showing 0 syscalls over 5 seconds, perf top showing tlog_pkt_pos_cmp on top). This patch adds a 30-second watchdog inside the loop: - watchdog_last_progress is updated after each successful sink_write or source_read. - If tlog_rec_child_exited && time(NULL) - watchdog_last_progress > 30, the loop breaks with ETIMEDOUT and an error message. No behavior change for healthy sessions — the watchdog only fires when the bug condition is hit. Tested via build + smoke runs on a real session. The deeper bug (why tlog_pkt_pos_cmp never reaches a terminal state when there are buffered packets after EOF on input) is separate and would deserve its own investigation. This patch is a defense-in-depth measure that prevents the visible symptom (CPU burn forever). Signed-off-by: Frédéric Chauvat Signed-off-by: Frédéric Chauvat (uncaney) --- lib/tlog/rec.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/tlog/rec.c b/lib/tlog/rec.c index 3645669..ffa5821 100644 --- a/lib/tlog/rec.c +++ b/lib/tlog/rec.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #ifdef TLOG_JOURNAL_ENABLED @@ -834,6 +835,12 @@ tlog_rec_transfer(struct tlog_errs **perrs, struct tlog_pkt pkt = TLOG_PKT_VOID; struct tlog_pkt_pos tty_pos = TLOG_PKT_POS_VOID; struct tlog_pkt_pos log_pos = TLOG_PKT_POS_VOID; + /* + * Watchdog: detect tlog_pkt_pos_cmp / tlog_sink_write spin loop + * that occurs after the wrapped child exits with buffered data + * that the position state machine never finishes flushing. + */ + time_t watchdog_last_progress = time(NULL); tlog_rec_exit_signum = 0; tlog_rec_alarm_set = false; @@ -894,6 +901,17 @@ tlog_rec_transfer(struct tlog_errs **perrs, * Transfer I/O and window changes */ while (tlog_rec_exit_signum == 0) { + /* + * Watchdog: child has exited but we've made no progress in 30s. + * That's a bug: tlog_pkt_pos_cmp / tlog_sink_write is in a spin + * loop that never reaches a terminal state. Force exit to avoid + * burning a CPU forever (cf armoirjaune incident 2026-05-06). + */ + if (tlog_rec_child_exited && (time(NULL) - watchdog_last_progress > 30)) { + tlog_errs_pushs(perrs, "Watchdog: child exited and no flush progress in 30s, forcing exit"); + return_grc = TLOG_GRC_FROM(errno, ETIMEDOUT); + break; + } /* Expected exit conditions */ if (tlog_rec_child_exited) { if (grc == TLOG_GRC_FROM(errno, EIO) || (tlog_pkt_is_eof(&pkt))) { @@ -932,6 +950,7 @@ tlog_rec_transfer(struct tlog_errs **perrs, } break; } + watchdog_last_progress = time(NULL); continue; } @@ -942,6 +961,7 @@ tlog_rec_transfer(struct tlog_errs **perrs, grc = tlog_sink_write(log_sink, &pkt, &log_pos, NULL); if (grc == TLOG_RC_OK) { log_pending = true; + watchdog_last_progress = time(NULL); } else if (grc != TLOG_GRC_FROM(errno, EINTR)) { return_grc = grc; TLOG_ERRS_RAISECS(grc, "Failed logging terminal data"); @@ -967,6 +987,7 @@ tlog_rec_transfer(struct tlog_errs **perrs, return_grc = grc; } } else if (tlog_pkt_is_eof(&pkt)) { + watchdog_last_progress = time(NULL); tlog_sink_io_close(tty_sink, pkt.data.io.output); /* Continue if only input was closed */ if (pkt.data.io.output) {