[L0v2] fix unbounded memory growth of queue's submitted kernels

pbalcer · pbalcer · commit 57aef87e757b · 2025-12-06T10:16:59.000Z
L0v2 avoids internally tracking each kernel submission through
an event for lifetime management. Instead, when a kernel is submitted
to the queue, its handle is added to a vector, to be removed at the
next queue synchronization point, urQueueFinish(). This is a much more
efficient way of handling kernel tracking, since it avoids taking and
storing an event. However, if the application never synchronizes the
queue, this vector of submitted kernels will grow unbounded.

This patch forcibly synchronizes the queue once the submitted kernels
vector grows past a fixed threshold (MAX_QUEUE_SUBMITTED_KERNELS, 1024).
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
-  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
-
-  auto commandListLocked = commandListManager.lock();
+ur_result_t ur_queue_immediate_in_order_t::synchronize(
+    locked<ur_command_list_manager> &commandListLocked) {
   // TODO: use zeEventHostSynchronize instead?
   TRACK_SCOPE_LATENCY(
       "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
@@ -161,12 +159,33 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
     UR_CALL(hKernel->release());
   }
   submittedKernels.clear();
 
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
+  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
+
+  auto commandListLocked = commandListManager.lock();
+  return synchronize(commandListLocked);
+}
+
+// In order to avoid tracking individual events for each kernel submission on
+// the queue, the adapter simply keeps a vector of all handles of submitted
+// kernels, pruning it at queue synchronization, urQueueFinish(), knowing that
+// all previously enqueued kernels have finished. However, some applications
+// might not explicitly synchronize the queue, in which case the submitted
+// kernels might grow unbounded. To prevent that, we need to cap the vector's
+// size, and forcibly synchronize the queue once it exceeds the limit.
+static constexpr size_t MAX_QUEUE_SUBMITTED_KERNELS = 1024;
+
 void ur_queue_immediate_in_order_t::recordSubmittedKernel(
-    ur_kernel_handle_t hKernel) {
+    locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
+  if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) {
+    // NOTE(review): synchronize() returns a ur_result_t, but this function is
+    // void, so a failure is dropped here; consider propagating it to callers.
+    synchronize(commandList);
+  }
   submittedKernels.push_back(hKernel);
   hKernel->RefCount.increment();
 }
@@ -195,7 +210,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
       hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
       numEventsInWaitList, phEventWaitList, phEvent));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
@@ -847,7 +862,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
               &zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
               waitListView.handles));
 
-  recordSubmittedKernel(hKernel);
+  recordSubmittedKernel(commandListLocked, hKernel);
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -22,6 +22,7 @@
 
 #include "command_list_manager.hpp"
 #include "lockable.hpp"
+#include "ur_api.h"
 
 namespace v2 {
 
@@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
                                    const ur_event_handle_t *phEventWaitList,
                                    ur_event_handle_t *phEvent);
 
-  void recordSubmittedKernel(ur_kernel_handle_t hKernel);
+  void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
+                             ur_kernel_handle_t hKernel);
+
+  ur_result_t synchronize(locked<ur_command_list_manager> &commandList);
 
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,