Skip to content

Commit 57aef87

Browse files
committed
[L0v2] fix unbounded memory growth of queue's submitted kernels
L0v2 avoids internally tracking each kernel submission through an event for lifetime management. Instead, when a kernel is submitted to the queue, its handle is added to a vector, to be removed at the next queue synchronization point, urQueueFinish(). This is a much more efficient way of handling kernel tracking, since it avoids taking and storing an event. However, if the application never synchronizes the queue, this vector of submitted kernels will grow unbounded. This patch forcibly synchronizes the queue once the submitted kernels vector reaches a threshold.
1 parent 7b05a8c commit 57aef87

File tree

2 files changed

+28
-9
lines changed

2 files changed

+28
-9
lines changed

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
146146
return UR_RESULT_SUCCESS;
147147
}
148148

149-
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
150-
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
151-
152-
auto commandListLocked = commandListManager.lock();
149+
ur_result_t ur_queue_immediate_in_order_t::synchronize(
150+
locked<ur_command_list_manager> &commandListLocked) {
153151
// TODO: use zeEventHostSynchronize instead?
154152
TRACK_SCOPE_LATENCY(
155153
"ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
@@ -161,12 +159,29 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
161159
UR_CALL(hKernel->release());
162160
}
163161
submittedKernels.clear();
162+
}
164163

165-
return UR_RESULT_SUCCESS;
164+
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
165+
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");
166+
167+
auto commandListLocked = commandListManager.lock();
168+
return synchronize(commandListLocked);
166169
}
167170

171+
// In order to avoid tracking individual events for each kernel submission on
172+
// the queue, the adapter simply keeps a vector of all handles of submitted
173+
// kernels, pruning it at queue synchronization, urQueueFinish(), knowing that
174+
// all previously enqueued kernels have finished. However, some applications
175+
// might not explicitly synchronize the queue, in which case the submitted
176+
// kernels might grow unbounded. To prevent that, we need to cap the vector's
177+
// size, and forcibly synchronize the queue once it exceeds the limit.
178+
#define MAX_QUEUE_SUBMITTED_KERNELS 1024
179+
168180
void ur_queue_immediate_in_order_t::recordSubmittedKernel(
169-
ur_kernel_handle_t hKernel) {
181+
locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
182+
if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) {
183+
synchronize(commandList);
184+
}
170185
submittedKernels.push_back(hKernel);
171186
hKernel->RefCount.increment();
172187
}
@@ -195,7 +210,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
195210
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
196211
numEventsInWaitList, phEventWaitList, phEvent));
197212

198-
recordSubmittedKernel(hKernel);
213+
recordSubmittedKernel(commandListLocked, hKernel);
199214

200215
return UR_RESULT_SUCCESS;
201216
}
@@ -847,7 +862,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
847862
&zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
848863
waitListView.handles));
849864

850-
recordSubmittedKernel(hKernel);
865+
recordSubmittedKernel(commandListLocked, hKernel);
851866

852867
return UR_RESULT_SUCCESS;
853868
}

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "command_list_manager.hpp"
2424
#include "lockable.hpp"
25+
#include "ur_api.h"
2526

2627
namespace v2 {
2728

@@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
6263
const ur_event_handle_t *phEventWaitList,
6364
ur_event_handle_t *phEvent);
6465

65-
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
66+
void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
67+
ur_kernel_handle_t hKernel);
68+
69+
ur_result_t synchronize(locked<ur_command_list_manager> &commandList);
6670

6771
public:
6872
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,

0 commit comments

Comments
 (0)