Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions tensorrt_llm/_torch/pyexecutor/perf_metrics_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import torch

from tensorrt_llm.logger import logger
from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds

from .llm_request import PerfTimingInfo
Expand Down Expand Up @@ -167,14 +168,27 @@ def compute_batch_gpu_times(self, requests):
perf.gpu_forward_end_event.synchronize()
if perf.gpu_sample_end_event and not perf.gpu_sample_end_event.query():
perf.gpu_sample_end_event.synchronize()
batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
perf.gpu_forward_end_event
)
batch_gpu_sample_time = (
perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
if perf.gpu_sample_end_event
else 0.0
)
try:
batch_gpu_forward_time = perf.gpu_forward_start_event.elapsed_time(
perf.gpu_forward_end_event
)
batch_gpu_sample_time = (
perf.gpu_forward_end_event.elapsed_time(perf.gpu_sample_end_event)
if perf.gpu_sample_end_event
else 0.0
)
except RuntimeError as e:
# CUDA event timing can fail if events were not recorded
# on the current stream. Skip metrics for this batch rather
# than crashing the executor thread.
logger.warning(
"Failed to compute GPU event elapsed_time: %s. "
"Setting batch GPU times to 0.0. This may indicate "
"an issue with the forward pass or stream synchronization.",
e,
)
batch_gpu_forward_time = 0.0
batch_gpu_sample_time = 0.0

target["gpu_forward_time"] = batch_gpu_forward_time
target["gpu_sample_time"] = batch_gpu_sample_time
Expand Down
Loading