Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle(
return UR_RESULT_SUCCESS;
}

ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");

auto commandListLocked = commandListManager.lock();
ur_result_t ur_queue_immediate_in_order_t::synchronize(
locked<ur_command_list_manager> &commandListLocked) {
// TODO: use zeEventHostSynchronize instead?
TRACK_SCOPE_LATENCY(
"ur_queue_immediate_in_order_t::zeCommandListHostSynchronize");
Expand All @@ -165,8 +163,27 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
return UR_RESULT_SUCCESS;
}

// Blocks the host until all previously enqueued work on this queue has
// completed (UR entry point behind urQueueFinish).
ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish");

  // Take the command-list lock for the duration of the wait and delegate to
  // the shared synchronize() helper; per the adapter's tracking scheme,
  // synchronization is also the point where submitted-kernel bookkeeping is
  // pruned (see the comment above recordSubmittedKernel).
  auto commandListLocked = commandListManager.lock();
  return synchronize(commandListLocked);
}

// In order to avoid tracking individual events for each kernel submission on
// the queue, the adapter simply keeps a vector of all handles of submitted
// kernels, prunning it at queue synchronization, urQueueFinish(), knowing that
// all previously enqueued kernels have finished. However, some applications
// might not explicitly synchronize the queue, in which case the submitted
// kernels might grow unbounded. To prevent that, we need to cap the vector's
// size, and forcibly synchronize the queue once it exceeds the limit.
#define MAX_QUEUE_SUBMITTED_KERNELS 1024

void ur_queue_immediate_in_order_t::recordSubmittedKernel(
ur_kernel_handle_t hKernel) {
locked<ur_command_list_manager> &commandList, ur_kernel_handle_t hKernel) {
if (submittedKernels.size() > MAX_QUEUE_SUBMITTED_KERNELS) {
synchronize(commandList);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is very low chance that we will have 1000 unique kernels, most likely we have duplicates here.
RIght now this will always synchronize every 1k submits which is undesired.

Can you compact the vector by finding duplicates and releasing those?
You just need one kernel instance in container to hold the object all additional ones are not neeeded.

Also what I would consider is not adding kernel to the vector if it is already there.

}
submittedKernels.push_back(hKernel);
hKernel->RefCount.increment();
}
Expand Down Expand Up @@ -195,7 +212,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
numEventsInWaitList, phEventWaitList, phEvent));

recordSubmittedKernel(hKernel);
recordSubmittedKernel(commandListLocked, hKernel);

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -847,7 +864,7 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
&zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
waitListView.handles));

recordSubmittedKernel(hKernel);
recordSubmittedKernel(commandListLocked, hKernel);

return UR_RESULT_SUCCESS;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "command_list_manager.hpp"
#include "lockable.hpp"
#include "ur_api.h"

namespace v2 {

Expand Down Expand Up @@ -62,7 +63,10 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

void recordSubmittedKernel(ur_kernel_handle_t hKernel);
void recordSubmittedKernel(locked<ur_command_list_manager> &commandList,
ur_kernel_handle_t hKernel);

ur_result_t synchronize(locked<ur_command_list_manager> &commandList);

public:
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
Expand Down
Loading