From 6e65d44db4b88985e51e5b2b43a3bc8d4b03a38c Mon Sep 17 00:00:00 2001
From: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com>
Date: Thu, 9 Apr 2026 09:50:06 -0700
Subject: [PATCH 1/2] [None][perf] Use sliding-64 batch sizes for
 padding-enabled CUDA graphs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

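When enable_padding=True, replace the sparse powers-of-2 schedule
(256, 512, 1024, 2048, …) with uniform +64 increments after the initial
[1, 2, 4, 8, 16, 24, …, 128] base (1, 2, 4, then multiples of 8). This
gives denser coverage (192, 256, 320, …, with max_batch_size appended
last when it is not itself a +64 step) and reduces padding waste at
intermediate batch sizes. The enable_padding=False schedule is unchanged.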

Signed-off-by: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm_args.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 9f001b4e5ae..080845f4d37 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -167,18 +167,20 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
             List of batch sizes to create CUDA graphs for
         """
         if enable_padding:
+            # Start with [1, 2, 4, 8, 16, 24, ..., 128] (multiples of 8)
             batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
+            # Sliding 64: extend by increments of 64 up to max_batch_size
+            while batch_sizes[-1] + 64 <= max_batch_size:
+                batch_sizes.append(batch_sizes[-1] + 64)
         else:
             batch_sizes = list(range(1, 32)) + [32, 64, 128]
-
-        # Add powers of 2 up to max_batch_size
-        batch_sizes += [
-            2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
-        ]
-
-        # Filter and sort batch sizes
-        batch_sizes = sorted(
-            [size for size in batch_sizes if size <= max_batch_size])
+            # Add powers of 2 up to max_batch_size
+            batch_sizes += [
+                2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
+            ]
+            # Filter and sort batch sizes
+            batch_sizes = sorted(
+                [size for size in batch_sizes if size <= max_batch_size])
 
         # Add max_batch_size if not already included
         if max_batch_size != batch_sizes[-1]:

From a557b8b758b4014fcf4c7aac96eb2bf9fe350955 Mon Sep 17 00:00:00 2001
From: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com>
Date: Fri, 3 Apr 2026 07:18:56 -0700
Subject: [PATCH 2/2] [None][perf] Clamp padding-mode CUDA graph batch sizes to
 max_batch_size

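Move filter/sort outside the if/else so sizes exceeding max_batch_size
are dropped in the enable_padding=True branch as well. Previously, with
max_batch_size < 128, the padding branch kept base sizes up to 128 and
then appended max_batch_size after them, yielding an unsorted list with
oversized entries. Add a guard for an empty list before the
max_batch_size append (the filter can remove every entry when
max_batch_size < 1). Add a parametrized regression test for the edge
cases max_batch_size=64, 129, 320.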

Signed-off-by: Yijing Li <257409031+yijingl-nvidia@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm_args.py        |  9 +++++----
 tests/unittest/llmapi/test_llm_args.py | 10 ++++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 080845f4d37..2eb6da5f614 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -178,12 +178,13 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
             batch_sizes += [
                 2**i for i in range(8, math.ceil(math.log(max_batch_size, 2)))
             ]
-            # Filter and sort batch sizes
-            batch_sizes = sorted(
-                [size for size in batch_sizes if size <= max_batch_size])
+
+        # Filter and sort batch sizes for both branches
+        batch_sizes = sorted(
+            [size for size in batch_sizes if size <= max_batch_size])
 
         # Add max_batch_size if not already included
-        if max_batch_size != batch_sizes[-1]:
+        if not batch_sizes or max_batch_size != batch_sizes[-1]:
             batch_sizes.append(max_batch_size)
 
         return batch_sizes
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index 999edbcdcde..94d7b1cddaf 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -591,6 +591,16 @@ def test_cuda_graph_batch_sizes_case_2(self):
             128, True)
         assert args.cuda_graph_config.max_batch_size == 128
 
+    @pytest.mark.parametrize("max_batch_size", [64, 129, 320])
+    def test_generate_cuda_graph_batch_sizes_padding_edge_cases(
+            self, max_batch_size):
+        # All sizes must be <= max_batch_size, sorted, and include max_batch_size
+        batch_sizes = CudaGraphConfig._generate_cuda_graph_batch_sizes(
+            max_batch_size, enable_padding=True)
+        assert all(s <= max_batch_size for s in batch_sizes)
+        assert batch_sizes == sorted(batch_sizes)
+        assert max_batch_size in batch_sizes
+
 
 class TestTrtLlmArgs: