From 156235d9779acc32802f56be774b364ef6a54d14 Mon Sep 17 00:00:00 2001
From: Piotr Balcer <piotr.balcer@intel.com>
Date: Fri, 5 Dec 2025 22:25:29 +0000
Subject: [PATCH] [L0v2] fix unbounded memory growth of queue's submitted
 kernels

L0v2 avoids internally tracking each kernel submission through
an event for lifetime management. Instead, when a kernel is submitted
to the queue, its handle is added to a vector, to be removed at the
next queue synchronization point, urQueueFinish(). This is a much more
efficient way of handling kernel tracking, since it avoids taking and
storing an event. However, if the application never synchronizes the
queue, this vector of submitted kernels will grow unbounded.

This patch forcibly synchronizes the queue once the submitted kernels
vector reaches a threshold.
---
 .../v2/queue_immediate_in_order.cpp           | 31 ++++++++++++++-----
 .../v2/queue_immediate_in_order.hpp           |  6 +++-
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index e3e88f1aa581c..594a173862f40 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
-  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
-
-  auto commandListLocked = commandListManager.lock();
+ur_result_t ur_queue_immediate_in_order_t::synchronize(
+    locked<ur_command_list_manager> &commandListLocked) {
   // TODO: use zeEventHostSynchronize instead?
   TRACK_SCOPE_LATENCY(
       "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
@@ -165,8 +163,27 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
+  // Acquire the command-list lock, then synchronize the queue under it.
+  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
+  auto lockedList = commandListManager.lock();
+  return synchronize(lockedList);
+}
+
+// To avoid tracking an event per kernel submission, the adapter keeps a vector
+// of submitted kernel handles and prunes it at queue synchronization
+// (urQueueFinish()), when all previously enqueued kernels are known to have
+// finished. Applications that never synchronize would grow it without bound,
+// so the queue is forcibly synchronized once the vector reaches this limit.
+// NOTE(review): recordSubmittedKernel() discards synchronize()'s result;
+// confirm that ignoring a failed forced sync is acceptable.
+static constexpr size_t MAX_QUEUE_SUBMITTED_KERNELS = 1024;
+
 void ur_queue_immediate_in_order_t::recordSubmittedKernel(
-    ur_kernel_handle_t hKernel) {
+    locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
+  if (submittedKernels.size() >= MAX_QUEUE_SUBMITTED_KERNELS) {
+    synchronize(commandList);
+  }
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.increment();
 }
@@ -195,7 +212,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
       hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
       numEventsInWaitList, phEventWaitList, phEvent));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
@@ -847,7 +864,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
               &zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
               waitListView.handles));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index fb7ed9a9b43e9..da0b217f1e1b1 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -22,6 +22,7 @@
 
 #include "command_list_manager.hpp"
 #include "lockable.hpp"
+#include "ur_api.h"
 
 namespace v2 {
 
@@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
                                    const ur_event_handle_t *phEventWaitList,
                                    ur_event_handle_t *phEvent);
 
-  void recordSubmittedKernel(ur_kernel_handle_t hKernel);
+  void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
+                             ur_kernel_handle_t hKernel);
+
+  ur_result_t synchronize(locked<ur_command_list_manager> &commandList);
 
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,