|
13 | 13 | #include "shared/source/helpers/logical_state_helper.h" |
14 | 14 | #include "shared/source/memory_manager/internal_allocation_storage.h" |
15 | 15 | #include "shared/source/memory_manager/prefetch_manager.h" |
| 16 | +#include "shared/source/memory_manager/unified_memory_manager.h" |
16 | 17 |
|
17 | 18 | #include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h" |
18 | 19 | #include "level_zero/core/source/device/bcs_split.h" |
@@ -227,6 +228,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy( |
227 | 228 |
|
228 | 229 | ze_result_t ret; |
229 | 230 |
|
| 231 | + NEO::SvmAllocationData *srcAllocData = nullptr; |
| 232 | + NEO::SvmAllocationData *dstAllocData = nullptr; |
| 233 | + bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &srcAllocData); |
| 234 | + bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &dstAllocData); |
| 235 | + if (preferCopyThroughLockedPtr(dstAllocData, dstAllocFound, srcAllocData, srcAllocFound, size)) { |
| 236 | + return performCpuMemcpy(dstptr, srcptr, size, dstAllocFound, hSignalEvent, numWaitEvents, phWaitEvents); |
| 237 | + } |
| 238 | + |
230 | 239 | if (this->isAppendSplitNeeded(dstptr, srcptr, size)) { |
231 | 240 | ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { |
232 | 241 | return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents); |
@@ -461,4 +470,91 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res |
461 | 470 | return inputRet; |
462 | 471 | } |
463 | 472 |
|
| 473 | +template <GFXCORE_FAMILY gfxCoreFamily> |
| 474 | +bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size) { |
| 475 | + size_t h2DThreshold = 2 * MemoryConstants::megaByte; |
| 476 | + size_t d2HThreshold = 1 * MemoryConstants::kiloByte; |
| 477 | + if (NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get() != -1) { |
| 478 | + h2DThreshold = NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get(); |
| 479 | + } |
| 480 | + if (NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get() != -1) { |
| 481 | + d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get(); |
| 482 | + } |
| 483 | + if (NEO::HwHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled()) { |
| 484 | + return (!srcFound && isAllocUSMDeviceMemory(dstAlloc, dstFound) && size <= h2DThreshold) || |
| 485 | + (!dstFound && isAllocUSMDeviceMemory(srcAlloc, srcFound) && size <= d2HThreshold); |
| 486 | + } |
| 487 | + return false; |
| 488 | +} |
| 489 | + |
| 490 | +template <GFXCORE_FAMILY gfxCoreFamily> |
| 491 | +bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound) { |
| 492 | + return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY); |
| 493 | +} |
| 494 | + |
| 495 | +template <GFXCORE_FAMILY gfxCoreFamily> |
| 496 | +ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { |
| 497 | + |
| 498 | + bool needsBarrier = (numWaitEvents > 0); |
| 499 | + if (needsBarrier) { |
| 500 | + this->appendBarrier(nullptr, numWaitEvents, phWaitEvents); |
| 501 | + } |
| 502 | + |
| 503 | + bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter; |
| 504 | + if (needsFlushTagUpdate) { |
| 505 | + this->csr->flushTagUpdate(); |
| 506 | + } |
| 507 | + |
| 508 | + Event *signalEvent = nullptr; |
| 509 | + if (hSignalEvent) { |
| 510 | + signalEvent = Event::fromHandle(hSignalEvent); |
| 511 | + } |
| 512 | + |
| 513 | + const void *cpuMemcpySrcPtr = nullptr; |
| 514 | + void *cpuMemcpyDstPtr = nullptr; |
| 515 | + if (isDstDeviceMemory) { |
| 516 | + cpuMemcpySrcPtr = srcptr; |
| 517 | + cpuMemcpyDstPtr = obtainLockedPtrFromDevice(dstptr, size); |
| 518 | + } else { |
| 519 | + cpuMemcpySrcPtr = obtainLockedPtrFromDevice(const_cast<void *>(srcptr), size); |
| 520 | + cpuMemcpyDstPtr = dstptr; |
| 521 | + } |
| 522 | + |
| 523 | + if (needsFlushTagUpdate) { |
| 524 | + auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; |
| 525 | + const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount()); |
| 526 | + if (waitStatus == NEO::WaitStatus::GpuHang) { |
| 527 | + return ZE_RESULT_ERROR_DEVICE_LOST; |
| 528 | + } |
| 529 | + this->latestFlushedBarrierCounter = this->barrierCounter; |
| 530 | + } |
| 531 | + |
| 532 | + if (signalEvent) { |
| 533 | + signalEvent->setGpuStartTimestamp(); |
| 534 | + } |
| 535 | + |
| 536 | + memcpy_s(cpuMemcpyDstPtr, size, cpuMemcpySrcPtr, size); |
| 537 | + |
| 538 | + if (signalEvent) { |
| 539 | + signalEvent->setGpuEndTimestamp(); |
| 540 | + signalEvent->hostSignal(); |
| 541 | + } |
| 542 | + return ZE_RESULT_SUCCESS; |
| 543 | +} |
| 544 | + |
| 545 | +template <GFXCORE_FAMILY gfxCoreFamily> |
| 546 | +void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(void *ptr, size_t size) { |
| 547 | + NEO::SvmAllocationData *allocData = nullptr; |
| 548 | + auto allocFound = this->device->getDriverHandle()->findAllocationDataForRange(ptr, size, &allocData); |
| 549 | + UNRECOVERABLE_IF(!allocFound); |
| 550 | + |
| 551 | + auto alloc = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex()); |
| 552 | + if (!alloc->isLocked()) { |
| 553 | + this->device->getDriverHandle()->getMemoryManager()->lockResource(alloc); |
| 554 | + } |
| 555 | + auto gpuAddress = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->getGpuAddress(); |
| 556 | + auto offset = ptrDiff(ptr, gpuAddress); |
| 557 | + return ptrOffset(alloc->getLockedPtr(), offset); |
| 558 | +} |
| 559 | + |
464 | 560 | } // namespace L0 |
0 commit comments