Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 120 additions & 25 deletions linux/include/uapi/drm/ivpu_accel.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ extern "C" {
#define DRM_IVPU_CMDQ_CREATE 0x0b
#define DRM_IVPU_CMDQ_DESTROY 0x0c
#define DRM_IVPU_CMDQ_SUBMIT 0x0d
/*
 * DRM_IVPU_BO_CREATE_FROM_USERPTR keeps its existing number 0x0e. Driver
 * ioctl numbers are part of the userspace ABI: renumbering one breaks every
 * binary built against the earlier header, and reusing 0x0e for a new ioctl
 * would make old binaries silently invoke the wrong operation.
 */
#define DRM_IVPU_BO_CREATE_FROM_USERPTR 0x0e
/* User Mode Queue (UMQ) ioctls are appended after the last assigned number. */
#define DRM_IVPU_CMDQ_INFO 0x0f
#define DRM_IVPU_CMDQ_UMQ_ENABLE 0x10
#define DRM_IVPU_CMDQ_UMQ_DISABLE 0x11

#define DRM_IOCTL_IVPU_GET_PARAM \
DRM_IOWR(DRM_COMMAND_BASE + DRM_IVPU_GET_PARAM, struct drm_ivpu_param)
Expand Down Expand Up @@ -74,6 +77,17 @@ extern "C" {
DRM_IOWR(DRM_COMMAND_BASE + DRM_IVPU_BO_CREATE_FROM_USERPTR, \
struct drm_ivpu_bo_create_from_userptr)

/*
 * Query the UMQ parameters of a command queue. Declared with DRM_IOWR
 * (read/write): struct drm_ivpu_cmdq_info is copied in from userspace and
 * copied back out after the call.
 */
#define DRM_IOCTL_IVPU_CMDQ_INFO \
DRM_IOWR(DRM_COMMAND_BASE + DRM_IVPU_CMDQ_INFO, struct drm_ivpu_cmdq_info)

/*
 * Enable User Mode Queue on a command queue. Declared with DRM_IOW, so the
 * argument struct is input-only: nothing is copied back to userspace.
 */
#define DRM_IOCTL_IVPU_CMDQ_UMQ_ENABLE \
DRM_IOW(DRM_COMMAND_BASE + DRM_IVPU_CMDQ_UMQ_ENABLE, \
struct drm_ivpu_cmdq_umq_enable)

/*
 * Disable User Mode Queue on a command queue. Declared with DRM_IOW, so the
 * argument struct is input-only: nothing is copied back to userspace.
 */
#define DRM_IOCTL_IVPU_CMDQ_UMQ_DISABLE \
DRM_IOW(DRM_COMMAND_BASE + DRM_IVPU_CMDQ_UMQ_DISABLE, \
struct drm_ivpu_cmdq_umq_disable)

/**
* DOC: contexts
*
Expand Down Expand Up @@ -139,6 +153,14 @@ extern "C" {
* This allows creating GEM buffers from existing user memory regions.
*/
#define DRM_IVPU_CAP_BO_CREATE_FROM_USERPTR 4
/**
* DRM_IVPU_CAP_UMQ
*
* Driver supports User Mode Queue (UMQ): direct doorbell access from
* userspace via mmap, bypassing the CMDQ_SUBMIT ioctl in the hot path.
* Requires HW scheduling mode, silicon platform, and SMMU enabled.
*/
#define DRM_IVPU_CAP_UMQ 5

/**
* struct drm_ivpu_param - Get/Set VPU parameters
Expand Down Expand Up @@ -515,44 +537,117 @@ struct drm_ivpu_metric_streamer_get_data {
};

/*
 * Command queue flags.
 *
 * Each flag occupies its own bit; every flag is defined exactly once.
 */
/* BIT(0): low-latency turbo scheduling */
#define DRM_IVPU_CMDQ_FLAG_TURBO 0x00000001
/* BIT(1): persistent CmdQ - hold NPU tile between inferences */
#define DRM_IVPU_CMDQ_FLAG_PERSISTENT 0x00000002

/**
* struct drm_ivpu_cmdq_create - Create command queue for job submission
*
* @priority and @flags describe the queue being requested; @cmdq_id is
* filled in with the ID of the queue that was created. The struct consists
* of four 32-bit fields, the last of which is explicit reserved padding.
*/
struct drm_ivpu_cmdq_create {
/** @cmdq_id: Returned ID of created command queue */
__u32 cmdq_id;
/**
* @priority:
*
* Priority to be set for related job command queue, can be one of the following:
* %DRM_IVPU_JOB_PRIORITY_DEFAULT
* %DRM_IVPU_JOB_PRIORITY_IDLE
* %DRM_IVPU_JOB_PRIORITY_NORMAL
* %DRM_IVPU_JOB_PRIORITY_FOCUS
* %DRM_IVPU_JOB_PRIORITY_REALTIME
*/
__u32 priority;
/**
* @flags:
*
* Supported flags:
*
* %DRM_IVPU_CMDQ_FLAG_TURBO
*
* Enable low-latency mode for the command queue. The NPU will maximize performance
* when executing jobs from such queue at the cost of increased power usage.
*
* %DRM_IVPU_CMDQ_FLAG_PERSISTENT
*
* Keep the NPU tile allocated between inferences, eliminating per-inference
* tile acquire/release overhead. Used with UMQ fast path.
*/
__u32 flags;
/** @_pad: Reserved, must be zero. */
__u32 _pad;
};

/**
* struct drm_ivpu_cmdq_destroy - Destroy a command queue
*
* Carries only the identifier of the command queue to be destroyed.
*/
struct drm_ivpu_cmdq_destroy {
/** @cmdq_id: ID of command queue to destroy */
__u32 cmdq_id;
};

/**
* struct drm_ivpu_cmdq_info - Query UMQ parameters for a command queue
*
* Used with DRM_IOCTL_IVPU_CMDQ_INFO. After a successful call the
* job ring buffer is accessible at @cmdq_mmap_offset and the doorbell
* MMIO page is accessible at @db_mmap_offset (after CMDQ_UMQ_ENABLE).
*/
struct drm_ivpu_cmdq_info {
/** @cmdq_id: Command queue ID (input) */
__u32 cmdq_id;
/** @entry_count: Number of job slots in the ring buffer */
__u32 entry_count;
/** @cmdq_mmap_offset: mmap() offset for the job ring buffer */
__u64 cmdq_mmap_offset;
/** @db_mmap_offset: mmap() offset for the doorbell MMIO page */
__u64 db_mmap_offset;
/** @db_id: Doorbell register index */
__u32 db_id;
/** @job_id_base: Upper bits of job_id (context identifier) */
__u32 job_id_base;
/** @primary_preempt_buf_vpu_addr: VPU address of primary preemption buffer */
__u64 primary_preempt_buf_vpu_addr;
/** @primary_preempt_buf_size: Size of primary preemption buffer in bytes (OUT) */
__u32 primary_preempt_buf_size;
/** @_pad0: Explicit padding for alignment of secondary_preempt_buf_vpu_addr */
__u32 _pad0;
/** @secondary_preempt_buf_vpu_addr: VPU address of secondary preemption buffer (OUT) */
__u64 secondary_preempt_buf_vpu_addr;
/** @secondary_preempt_buf_size: Size of secondary preemption buffer in bytes (OUT) */
__u32 secondary_preempt_buf_size;
/**
* @priority:
*
* Priority to be set for related job command queue, can be one of the following:
* %DRM_IVPU_JOB_PRIORITY_DEFAULT
* %DRM_IVPU_JOB_PRIORITY_IDLE
* %DRM_IVPU_JOB_PRIORITY_NORMAL
* %DRM_IVPU_JOB_PRIORITY_FOCUS
* %DRM_IVPU_JOB_PRIORITY_REALTIME
*/
__u32 priority;
/**
* @flags:
*
* Supported flags:
*
* %DRM_IVPU_CMDQ_FLAG_TURBO
*
* Enable low-latency mode for the command queue. The NPU will maximize performance
* when executing jobs from such queue at the cost of increased power usage.
* @reset_counter: Device reset counter (OUT).
* Incremented on every device reset. UMD must check this before
* each UMQ submit and re-setup the UMQ if it has changed.
*/
__u32 reset_counter;
};

/**
* struct drm_ivpu_cmdq_umq_enable - Enable User Mode Queue for a command queue
*
* Argument of DRM_IOCTL_IVPU_CMDQ_UMQ_ENABLE. That ioctl is declared with
* DRM_IOW, so every field here is an input supplied by userspace.
*
* After a successful call the userspace may mmap the job ring buffer and
* doorbell MMIO page and submit jobs directly without any ioctl.
*/
struct drm_ivpu_cmdq_umq_enable {
/** @cmdq_id: Command queue ID */
__u32 cmdq_id;
/** @flags: Reserved, must be zero */
__u32 flags;
/**
* @reset_eventfd: eventfd for device-reset notification (-1 to disable).
* Signed so that the -1 "no eventfd" sentinel is representable.
*/
__s32 reset_eventfd;
/** @pad: Reserved, must be zero */
__u32 pad;
};

/**
* struct drm_ivpu_cmdq_destroy - Destroy a command queue
* struct drm_ivpu_cmdq_umq_disable - Disable User Mode Queue for a command queue
*/
struct drm_ivpu_cmdq_destroy {
/** @cmdq_id: ID of command queue to destroy */
struct drm_ivpu_cmdq_umq_disable {
/** @cmdq_id: Command queue ID */
__u32 cmdq_id;
/** @pad: Reserved, must be zero */
__u32 pad;
};

/**
Expand Down
66 changes: 61 additions & 5 deletions umd/vpu_driver/source/command/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <limits>
#include <ratio>
#include <string.h>
#include <thread>
#include <utility>

namespace VPU {
Expand Down Expand Up @@ -275,13 +276,60 @@ void VPUCommandBuffer::useBusyWait() {
}

bool VPUCommandBuffer::waitForCompletion(int64_t timeout_abs_ns) {
if (useBusyWaitFlag)
busyWait(timeout_abs_ns, VPUDeviceContext::getCpuTscFreqMHz());
if (!umqMode) {
if (useBusyWaitFlag)
busyWait(timeout_abs_ns, VPUDeviceContext::getCpuTscFreqMHz());

bool result = wait(timeout_abs_ns);
bool result = wait(timeout_abs_ns);
if (!result)
return false;
} else {
/* UMQ mode: NPU writes STATE_DEVICE_SIGNAL to fenceValue when done.
* Use umonitor/umwait with short per-iteration timeouts matching the
* Intel GPU ULLS-Light pattern (compute-runtime WaitUtils, 16000 cycles
* ≈ 8 µs @ 2 GHz) and yield between iterations so other OS threads can
* run while the NPU executes. No BO_WAIT ioctl needed. */
CommandHeader *cmdHeader = reinterpret_cast<CommandHeader *>(buffer->getBasePointer());
if (!cmdHeader)
return false;

if (!result)
return false;
if (cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL) {
#ifdef __x86_64__
constexpr uint64_t kCyclesPerIter = 16000u; /* ~8 µs @ 2 GHz */
auto durationNs =
(std::chrono::steady_clock::time_point(std::chrono::nanoseconds(timeout_abs_ns)) -
std::chrono::steady_clock::now())
.count();
if (durationNs > 0) {
unsigned long long tscDeadline =
__rdtsc() + static_cast<uint64_t>(durationNs) *
VPUDeviceContext::getCpuTscFreqMHz() / 1'000ULL;
do {
_umonitor(&cmdHeader->fenceValue);
if (cmdHeader->fenceValue == VPUEventCommand::State::STATE_DEVICE_SIGNAL)
break;
_umwait(0, __rdtsc() + kCyclesPerIter);
_mm_mfence(); /* ensure read is ordered after UMWAIT */
if (cmdHeader->fenceValue == VPUEventCommand::State::STATE_DEVICE_SIGNAL)
break;
std::this_thread::yield(); /* allow other threads to run between iterations */
} while (__rdtsc() < tscDeadline);
}
#else
/* Non-x86: spin with yield until fence is signalled or timeout */
auto deadline = std::chrono::steady_clock::time_point(
std::chrono::nanoseconds(timeout_abs_ns));
while (cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL &&
std::chrono::steady_clock::now() < deadline)
std::this_thread::yield();
#endif
}

if (cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL)
return false;
/* BO_WAIT ioctl is skipped in UMQ mode; set status so jobStatusToResult() sees SUCCESS. */
jobStatus = DRM_IVPU_JOB_STATUS_SUCCESS;
}

useBusyWaitFlag = false;
inferenceScratchBuffer.reset();
Expand Down Expand Up @@ -321,6 +369,7 @@ void VPUCommandBuffer::busyWait(int64_t timeout_abs_ns, uint32_t tscFreqMHz) {

CommandHeader *cmdHeader = reinterpret_cast<CommandHeader *>(buffer->getBasePointer());
if (cmdHeader && cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL) {
#ifdef __x86_64__
// Use UMONITOR/UMWAIT to do efficient busy wait
// UMWAIT waits until the monitored address is written or the timeout expires
// The timeout is specified in CPU clock ticks
Expand All @@ -336,6 +385,13 @@ void VPUCommandBuffer::busyWait(int64_t timeout_abs_ns, uint32_t tscFreqMHz) {
_umwait(0, timeoutTime);
_mm_mfence(); // ensure that the read is not moved before the UMWAIT
} while (cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL);
#else
/* Non-x86: spin with yield until fence is signalled or timeout */
auto deadline = std::chrono::steady_clock::now() + timeoutNs;
while (cmdHeader->fenceValue != VPUEventCommand::State::STATE_DEVICE_SIGNAL &&
std::chrono::steady_clock::now() < deadline)
std::this_thread::yield();
#endif
}

return;
Expand Down
6 changes: 6 additions & 0 deletions umd/vpu_driver/source/command/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ class VPUCommandBuffer {
void addPreemptionBuffer(std::shared_ptr<VPUBufferObject> bo);
uint32_t getPreemptionBufferIndex() const { return preemptionBufferIndex.value_or(0); }
void useBusyWait();
/**
 * Switch User Mode Queue (UMQ) handling on or off for this command buffer.
 *
 * While enabled, waitForCompletion() does not issue the BO_WAIT ioctl and
 * instead decides completion purely from the fence value that the NPU
 * firmware writes into the command header.
 *
 * @param enable true to turn UMQ mode on, false to turn it off.
 */
void setUmqMode(bool enable) {
    umqMode = enable;
}

private:
/**
Expand Down Expand Up @@ -151,6 +156,7 @@ class VPUCommandBuffer {
std::shared_ptr<VPUBufferObject> preemptionBuffer;
std::optional<uint32_t> preemptionBufferIndex = std::nullopt;
bool useBusyWaitFlag = false;
bool umqMode = false;
};

} // namespace VPU
1 change: 1 addition & 0 deletions umd/vpu_driver/source/device/hw_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ struct VPUHwInfo {
bool primeBuffersCapability = false;
bool cmdQueueCreationCapability = false;
bool userPtrCapability = false;
bool umqCapability = false;

GetCopyCommand *getCopyCommand = nullptr;
PrintCopyDescriptor *printCopyDescriptor = nullptr;
Expand Down
Loading