From 77db9e3de0217f7e4ea67f0bf6ae023bcae7a1be Mon Sep 17 00:00:00 2001 From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:54:20 -0700 Subject: [PATCH 1/2] fix: guard CUDA event elapsed_time in perf_metrics_manager to prevent executor crash Wrap the elapsed_time() calls in compute_batch_gpu_times() with try/except RuntimeError. If a CUDA event was not recorded on the current stream, elapsed_time() raises RuntimeError, which propagates up through the executor event loop and kills the executor thread. The main process and Dynamo runtime continue running (serving HTTP, responding to health probes), but with no executor thread, every inference request hangs forever. With this fix, a CUDA event timing failure records 0.0 for that batch's GPU timing metrics instead of crashing the executor. Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> --- .../_torch/pyexecutor/perf_metrics_manager.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py index 9c2cbede57b0..aacf52eaa2a1 100644 --- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py @@ -167,14 +167,21 @@ def compute_batch_gpu_times(self, requests): perf.gpu_forward_end_event.synchronize() if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query(): perf.gpu_sample_end_event.synchronize() - batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time( - perf.gpu_forward_end_event - ) - batch_gpu_sample_time = ( - perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event) - if perf.gpu_sample_end_event - else 0.0 - ) + try: + batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time( + perf.gpu_forward_end_event + ) + batch_gpu_sample_time = ( + perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event) + if 
perf.gpu_sample_end_event + else 0.0 + ) + except RuntimeError: + # CUDA event timing can fail if events were not recorded + # on the current stream. Skip metrics for this batch rather + # than crashing the executor thread. + batch_gpu_forward_time = 0.0 + batch_gpu_sample_time = 0.0 target["gpu_forward_time"] = batch_gpu_forward_time target["gpu_sample_time"] = batch_gpu_sample_time From 9bd2416717f9497e1286788979b88e47e953f0cb Mon Sep 17 00:00:00 2001 From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:54:20 -0700 Subject: [PATCH 2/2] fix: add logger.warning when CUDA event elapsed_time fails Log the RuntimeError details so the failure is visible in logs rather than silently zeroing the metrics. This helps operators diagnose whether the timing failure indicates a deeper issue with the forward pass or stream synchronization. Uses tensorrt_llm.logger (codebase convention) instead of stdlib logging.getLogger. Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py index aacf52eaa2a1..4d8d8351e2aa 100644 --- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py @@ -11,6 +11,7 @@ import torch +from tensorrt_llm.logger import logger from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds from .llm_request import PerfTimingInfo @@ -176,10 +177,16 @@ def compute_batch_gpu_times(self, requests): if perf.gpu_sample_end_event else 0.0 ) - except RuntimeError: + except RuntimeError as e: # CUDA event timing can fail if events were not recorded # on the current stream. Skip metrics for this batch rather # than crashing the executor thread. 
+ logger.warning( + "Failed to compute GPU event elapsed_time: %s. " + "Setting batch GPU times to 0.0. This may indicate " + "an issue with the forward pass or stream synchronization.", + e, + ) batch_gpu_forward_time = 0.0 batch_gpu_sample_time = 0.0