bugfix: correct total_num_rows calculation in batch_prefill (use the last entry of qo_indptr instead of its length) for accurate tensor indexing. (#504)

XuZhang99 · web-flow · commit 2c2e7ab5e37a · 2025-12-09T14:00:57.000+08:00
diff --git a/xllm/core/kernels/cuda/batch_prefill.cpp b/xllm/core/kernels/cuda/batch_prefill.cpp
@@ -47,7 +47,7 @@ void batch_prefill(torch::Tensor float_workspace_buffer,
   torch::Tensor kv_cu_seq_lens_host = kv_cu_seq_lens.to(torch::kCPU);
   torch::Tensor kv_len_arr_host =
       kv_cu_seq_lens_host.slice(0, 1) - kv_cu_seq_lens_host.slice(0, 0, -1);
-  const int64_t total_num_rows = qo_indptr_host.size(0);
+  const int64_t total_num_rows = qo_indptr_host[-1].item<int64_t>();
   const int64_t batch_size = qo_indptr_host.size(0) - 1;
 
   auto plan_info = FunctionFactory::get_instance().prefill_plan_func(uri).call(