Skip to content

Commit 156235d

Browse files
committed
[L0v2] fix unbounded memory growth of queue's submitted kernels
L0v2 avoids internally tracking each kernel submission through an event for lifetime management. Instead, when a kernel is submitted to the queue, its handle is added to a vector, to be removed at the next queue synchronization point, urQueueFinish(). This is a much more efficient way of handling kernel tracking, since it avoids taking and storing an event. However, if the application never synchronizes the queue, this vector of submitted kernels will grow unbounded. This patch forcibly synchronizes the queue once the submitted kernels vector reaches a threshold.
1 parent 7b05a8c commit 156235d

File tree

2 files changed

+29
-8
lines changed

2 files changed

+29
-8
lines changed

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
146146
return UR_RESULT_SUCCESS;
147147
}
148148

149-
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
150-
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
151-
152-
auto commandListLocked = commandListManager.lock();
149+
ur_result_t ur_queue_immediate_in_order_t::synchronize(
150+
locked<ur_command_list_manager> &commandListLocked) {
153151
// TODO: use zeEventHostSynchronize instead?
154152
TRACK_SCOPE_LATENCY(
155153
"ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
@@ -165,8 +163,27 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
165163
return UR_RESULT_SUCCESS;
166164
}
167165

166+
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
167+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
168+
169+
auto commandListLocked = commandListManager.lock();
170+
return synchronize(commandListLocked);
171+
}
172+
173+
// In order to avoid tracking individual events for each kernel submission on
174+
// the queue, the adapter simply keeps a vector of all handles of submitted
175+
// kernels, pruning it at queue synchronization, urQueueFinish(), knowing that
176+
// all previously enqueued kernels have finished. However, some applications
177+
// might not explicitly synchronize the queue, in which case the submitted
178+
// kernels might grow unbounded. To prevent that, we need to cap the vector's
179+
// size, and forcibly synchronize the queue once it exceeds the limit.
180+
#define MAX_QUEUE_SUBMITTED_KERNELS 1024
181+
168182
void ur_queue_immediate_in_order_t::recordSubmittedKernel(
169-
ur_kernel_handle_t hKernel) {
183+
locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
184+
if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) {
185+
synchronize(commandList);
186+
}
170187
submittedKernels.push_back(hKernel);
171188
hKernel->RefCount.increment();
172189
}
@@ -195,7 +212,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
195212
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
196213
numEventsInWaitList, phEventWaitList, phEvent));
197214

198-
recordSubmittedKernel(hKernel);
215+
recordSubmittedKernel(commandListLocked, hKernel);
199216

200217
return UR_RESULT_SUCCESS;
201218
}
@@ -847,7 +864,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
847864
&zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
848865
waitListView.handles));
849866

850-
recordSubmittedKernel(hKernel);
867+
recordSubmittedKernel(commandListLocked, hKernel);
851868

852869
return UR_RESULT_SUCCESS;
853870
}

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "command_list_manager.hpp"
2424
#include "lockable.hpp"
25+
#include "ur_api.h"
2526

2627
namespace v2 {
2728

@@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
6263
const ur_event_handle_t *phEventWaitList,
6364
ur_event_handle_t *phEvent);
6465

65-
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
66+
void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
67+
ur_kernel_handle_t hKernel);
68+
69+
ur_result_t synchronize(locked<ur_command_list_manager> &commandList);
6670

6771
public:
6872
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,

0 commit comments

Comments
 (0)