From 6e65d44db4b88985e51e5b2b43a3bc8d4b03a38c Mon Sep 17 00:00:00 2001 From: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com> Date: Thu, 9 Apr 2026 09:50:06 -0700 Subject: [PATCH 1/2] [None][perf] Use sliding-64 batch sizes for padding-enabled CUDA graphs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enable_padding=True, replace the sparse powers-of-2 schedule (256, 512, 1024, 2048) with uniform +64 increments after the initial [1,2,4,8,...,128] base, giving denser coverage (192, 256, 320, …, max_batch_size) and reducing padding waste at intermediate batch sizes. Signed-off-by: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 9f001b4e5ae..080845f4d37 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -167,18 +167,20 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int, List of batch sizes to create CUDA graphs for """ if enable_padding: + # Start with [1, 2, 4, 8, 16, 24, ..., 128] (multiples of 8) batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)] + # Sliding 64: extend by increments of 64 up to max_batch_size + while batch_sizes[-1] + 64 <= max_batch_size: + batch_sizes.append(batch_sizes[-1] + 64) else: batch_sizes = list(range(1, 32)) + [32, 64, 128] - - # Add powers of 2 up to max_batch_size - batch_sizes += [ - 2**i for i in range(8, math.ceil(math.log(max_batch_size, 2))) - ] - - # Filter and sort batch sizes - batch_sizes = sorted( - [size for size in batch_sizes if size <= max_batch_size]) + # Add powers of 2 up to max_batch_size + batch_sizes += [ + 2**i for i in range(8, math.ceil(math.log(max_batch_size, 2))) + ] + # Filter and sort batch sizes + batch_sizes = sorted( + [size for size in batch_sizes if size <= max_batch_size]) # Add max_batch_size if not already included if max_batch_size != batch_sizes[-1]: From a557b8b758b4014fcf4c7aac96eb2bf9fe350955 Mon Sep 17 00:00:00 2001 From: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com> Date: Fri, 3 Apr 2026 07:18:56 -0700 Subject: [PATCH 2/2] [None][perf] Clamp padding-mode CUDA graph batch sizes to max_batch_size Move filter/sort outside the if/else so sizes exceeding max_batch_size are dropped in the enable_padding=True branch as well. Add guard for empty list before the max_batch_size append. Add regression tests for edge cases: max_batch_size=64, 129, 320. Signed-off-by: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com> --- tensorrt_llm/llmapi/llm_args.py | 9 +++++---- tests/unittest/llmapi/test_llm_args.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 080845f4d37..2eb6da5f614 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -178,12 +178,13 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int, batch_sizes += [ 2**i for i in range(8, math.ceil(math.log(max_batch_size, 2))) ] - # Filter and sort batch sizes - batch_sizes = sorted( - [size for size in batch_sizes if size <= max_batch_size]) + + # Filter and sort batch sizes for both branches + batch_sizes = sorted( + [size for size in batch_sizes if size <= max_batch_size]) # Add max_batch_size if not already included - if max_batch_size != batch_sizes[-1]: + if not batch_sizes or max_batch_size != batch_sizes[-1]: batch_sizes.append(max_batch_size) return batch_sizes diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 999edbcdcde..94d7b1cddaf 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -591,6 +591,16 @@ def test_cuda_graph_batch_sizes_case_2(self): 128, True) assert args.cuda_graph_config.max_batch_size == 128 + @pytest.mark.parametrize("max_batch_size", [64, 129, 320]) + def test_generate_cuda_graph_batch_sizes_padding_edge_cases( + self, max_batch_size): + # All sizes must be <= max_batch_size, sorted, and include max_batch_size + batch_sizes = CudaGraphConfig._generate_cuda_graph_batch_sizes( + max_batch_size, enable_padding=True) + assert all(s <= max_batch_size for s in batch_sizes) + assert batch_sizes == sorted(batch_sizes) + assert max_batch_size in batch_sizes + class TestTrtLlmArgs: