From 77db9e3de0217f7e4ea67f0bf6ae023bcae7a1be Mon Sep 17 00:00:00 2001
From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Date: Mon, 13 Apr 2026 16:54:20 -0700
Subject: [PATCH 1/2] fix: guard CUDA event elapsed_time in
 perf_metrics_manager to prevent executor crash

Wrap the elapsed_time() calls in compute_batch_gpu_times() with
try/except RuntimeError. If a CUDA event was not recorded on the
current stream, elapsed_time() raises RuntimeError, which propagates
up through the executor event loop and kills the executor thread.

The main process and Dynamo runtime continue running (serving HTTP,
responding to health probes), but with no executor thread, every
inference request hangs forever.

With this fix, a CUDA event timing failure records 0.0 for that
batch's GPU timing metrics instead of crashing the executor.

Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
---
 .../_torch/pyexecutor/perf_metrics_manager.py | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
index 9c2cbede57b0..aacf52eaa2a1 100644
--- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
@@ -167,14 +167,21 @@ def compute_batch_gpu_times(self, requests):
                     perf.gpu_forward_end_event.synchronize()
                 if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
                     perf.gpu_sample_end_event.synchronize()
-                batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
-                    perf.gpu_forward_end_event
-                )
-                batch_gpu_sample_time = (
-                    perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
-                    if perf.gpu_sample_end_event
-                    else 0.0
-                )
+                try:
+                    batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
+                        perf.gpu_forward_end_event
+                    )
+                    batch_gpu_sample_time = (
+                        perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
+                        if perf.gpu_sample_end_event
+                        else 0.0
+                    )
+                except RuntimeError:
+                    # CUDA event timing can fail if events were not recorded
+                    # on the current stream. Skip metrics for this batch rather
+                    # than crashing the executor thread.
+                    batch_gpu_forward_time = 0.0
+                    batch_gpu_sample_time = 0.0
 
             target["gpu_forward_time"] = batch_gpu_forward_time
             target["gpu_sample_time"] = batch_gpu_sample_time

From 9bd2416717f9497e1286788979b88e47e953f0cb Mon Sep 17 00:00:00 2001
From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
Date: Mon, 13 Apr 2026 16:54:20 -0700
Subject: [PATCH 2/2] fix: add logger.warning when CUDA event elapsed_time
 fails

Log the RuntimeError details so the failure is visible in logs
rather than silently zeroing the metrics. This helps operators
diagnose whether the timing failure indicates a deeper issue
with the forward pass or stream synchronization.

Uses tensorrt_llm.logger (codebase convention) instead of
stdlib logging.getLogger.

Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
index aacf52eaa2a1..4d8d8351e2aa 100644
--- a/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
@@ -11,6 +11,7 @@
 
 import torch
 
+from tensorrt_llm.logger import logger
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 from .llm_request import PerfTimingInfo
@@ -176,10 +177,16 @@ def compute_batch_gpu_times(self, requests):
                         if perf.gpu_sample_end_event
                         else 0.0
                     )
-                except RuntimeError:
+                except RuntimeError as e:
                     # CUDA event timing can fail if events were not recorded
                     # on the current stream. Skip metrics for this batch rather
                     # than crashing the executor thread.
+                    logger.warning(
+                        "Failed to compute GPU event elapsed_time: %s. "
+                        "Setting batch GPU times to 0.0. This may indicate "
+                        "an issue with the forward pass or stream synchronization.",
+                        e,
+                    )
                     batch_gpu_forward_time = 0.0
                     batch_gpu_sample_time = 0.0