From 156235d9779acc32802f56be774b364ef6a54d14 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Fri, 5 Dec 2025 22:25:29 +0000 Subject: [PATCH] [L0v2] fix unbounded memory growth of queue's submitted kernels L0v2 avoids internally tracking each kernel submission through an event for lifetime management. Instead, when a kernel is submitted to the queue, its handle is added to a vector, to be removed at the next queue synchronization point, urQueueFinish(). This is a much more efficient way of handling kernel tracking, since it avoids taking and storing an event. However, if the application never synchronizes the queue, this vector of submitted kernels will grow unbounded. This patch forcibly synchronizes the queue once the submitted kernels vector reaches a threshold. --- .../v2/queue_immediate_in_order.cpp | 31 ++++++++++++++----- .../v2/queue_immediate_in_order.hpp | 6 +++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index e3e88f1aa581c..594a173862f40 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::queueFinish() { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); - - auto commandListLocked = commandListManager.lock(); +ur_result_t ur_queue_immediate_in_order_t::synchronize( + locked &commandListLocked) { // TODO: use zeEventHostSynchronize instead? 
TRACK_SCOPE_LATENCY( "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); @@ -165,8 +163,27 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { return UR_RESULT_SUCCESS; } +ur_result_t ur_queue_immediate_in_order_t::queueFinish() { + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); + + auto commandListLocked = commandListManager.lock(); + return synchronize(commandListLocked); +} + +// In order to avoid tracking individual events for each kernel submission on +// the queue, the adapter simply keeps a vector of all handles of submitted +// kernels, pruning it at queue synchronization, urQueueFinish(), knowing that +// all previously enqueued kernels have finished. However, some applications +// might not explicitly synchronize the queue, in which case the submitted +// kernels might grow unbounded. To prevent that, we need to cap the vector's +// size, and forcibly synchronize the queue once it exceeds the limit. +#define MAX_QUEUE_SUBMITTED_KERNELS 1024 + void ur_queue_immediate_in_order_t::recordSubmittedKernel( - ur_kernel_handle_t hKernel) { + locked &commandList, ur_kernel_handle_t hKernel) { + if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) { + synchronize(commandList); + } submittedKernels.push_back(hKernel); hKernel->RefCount.increment(); } @@ -195,7 +212,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent)); - recordSubmittedKernel(hKernel); + recordSubmittedKernel(commandListLocked, hKernel); return UR_RESULT_SUCCESS; } @@ -847,7 +864,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( &zeThreadGroupDimensions, zeSignalEvent, waitListView.num, waitListView.handles)); - recordSubmittedKernel(hKernel); + recordSubmittedKernel(commandListLocked, hKernel); return UR_RESULT_SUCCESS; } diff --git 
a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index fb7ed9a9b43e9..da0b217f1e1b1 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -22,6 +22,7 @@ #include "command_list_manager.hpp" #include "lockable.hpp" +#include "ur_api.h" namespace v2 { @@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - void recordSubmittedKernel(ur_kernel_handle_t hKernel); + void recordSubmittedKernel(locked &commandList, + ur_kernel_handle_t hKernel); + + ur_result_t synchronize(locked &commandList); public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,