[Model Runner V2] Minor cleanup for build_attn_metadata (vllm-project#29576)

WoosukKwon · web-flow · commit ee80aee1cab6 · 2025-11-26T20:10:12.000-08:00
Signed-off-by: Woosuk Kwon &lt;woosuk.kwon@berkeley.edu&gt;
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
@@ -18,7 +18,6 @@
     KVCacheConfig,
     KVCacheSpec,
 )
-from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.utils import bind_kv_cache
 
 
@@ -145,17 +144,16 @@ def build_attn_metadata(
     attn_metadata_builders: list[AttentionMetadataBuilder],
     num_reqs: int,
     num_tokens: int,
-    query_start_loc: CpuGpuBuffer,
+    query_start_loc_gpu: torch.Tensor,
+    query_start_loc_cpu: torch.Tensor,
     seq_lens: torch.Tensor,
     seq_lens_np: np.ndarray,
     num_computed_tokens_cpu: torch.Tensor | None,
     block_tables: Sequence[torch.Tensor],
     slot_mappings: torch.Tensor,
     kv_cache_config: KVCacheConfig,
 ) -> dict[str, Any]:
-    query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
-    query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1]
-    max_query_len = int(query_start_loc.np[: num_reqs + 1].max())
+    max_query_len = int(query_start_loc_cpu.max())
     seq_lens = seq_lens[:num_reqs]
     seq_lens_cpu = torch.from_numpy(seq_lens_np)
     max_seq_len = int(seq_lens_np.max())
diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py
@@ -120,7 +120,8 @@ def capture_graph(
             attn_metadata_builders=attn_metadata_builders,
             num_reqs=batch_size,
             num_tokens=batch_size,
-            query_start_loc=input_buffers.query_start_loc,
+            query_start_loc_gpu=input_buffers.query_start_loc.gpu[: batch_size + 1],
+            query_start_loc_cpu=input_buffers.query_start_loc.cpu[: batch_size + 1],
             seq_lens=input_buffers.seq_lens,
             seq_lens_np=np.full(batch_size, self.max_model_len, dtype=np.int32),
             num_computed_tokens_cpu=None,  # FIXME
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
@@ -226,11 +226,15 @@ def prepare_dummy_attn_metadata(self, input_batch: InputBatch) -> None:
         num_computed_tokens = torch.zeros(
             input_batch.num_reqs, dtype=torch.int32, device=self.device
         )
+        query_start_loc = self.input_buffers.query_start_loc
+        query_start_loc_gpu = query_start_loc.gpu[: input_batch.num_reqs + 1]
+        query_start_loc_cpu = query_start_loc.cpu[: input_batch.num_reqs + 1]
         attn_metadata = build_attn_metadata(
             attn_metadata_builders=self.attn_metadata_builders,
             num_reqs=input_batch.num_reqs,
             num_tokens=input_batch.num_tokens,
-            query_start_loc=self.input_buffers.query_start_loc,
+            query_start_loc_gpu=query_start_loc_gpu,
+            query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=self.input_buffers.seq_lens,
             seq_lens_np=input_batch.seq_lens_np,
             num_computed_tokens_cpu=num_computed_tokens,
@@ -515,6 +519,7 @@ def prepare_inputs(
         self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
         self.input_buffers.query_start_loc.copy_to_gpu()
         query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
+        query_start_loc_cpu = self.input_buffers.query_start_loc.cpu[: num_reqs + 1]
         query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
 
         # Copy prefill tokens from CPU to GPU.
@@ -572,7 +577,8 @@ def prepare_inputs(
             attn_metadata_builders=self.attn_metadata_builders,
             num_reqs=num_reqs,
             num_tokens=num_tokens,
-            query_start_loc=self.input_buffers.query_start_loc,
+            query_start_loc_gpu=query_start_loc_gpu,
+            query_start_loc_cpu=query_start_loc_cpu,
             seq_lens=self.input_buffers.seq_lens,
             seq_lens_np=seq_lens_np,
             num_computed_tokens_cpu=num_computed_tokens,