Skip to content

Commit ec04de6

Browse files
[L0][XE_HPC] Perform memcpy on CPU for non-USM pointers
Related-To: NEO-7237. If the copy size is small enough, it is more efficient to perform the copy on the CPU through a locked pointer. This change also introduces an experimental flag to enable this behavior. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
1 parent 6c1504a commit ec04de6

File tree

16 files changed

+575
-11
lines changed

16 files changed

+575
-11
lines changed

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,9 @@ struct CommandList : _ze_command_list_handle_t {
323323
bool multiReturnPointCommandList = false;
324324
bool systolicModeSupport = false;
325325
bool pipelineSelectStateTracking = false;
326+
327+
std::atomic<uint32_t> barrierCounter{0u};
328+
uint32_t latestFlushedBarrierCounter = 0u;
326329
};
327330

328331
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2492,7 +2492,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
24922492
}
24932493

24942494
appendSignalEventPostWalker(signalEvent, workloadPartition);
2495-
2495+
this->barrierCounter++;
24962496
return ZE_RESULT_SUCCESS;
24972497
}
24982498

level_zero/core/source/cmdlist/cmdlist_hw_immediate.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99

1010
#include "level_zero/core/source/cmdlist/cmdlist_hw.h"
1111

12+
namespace NEO {
13+
struct SvmAllocationData;
14+
}
15+
1216
namespace L0 {
1317

1418
struct EventPool;
@@ -123,6 +127,11 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
123127

124128
void createLogicalStateHelper() override {}
125129
NEO::LogicalStateHelper *getLogicalStateHelper() const override;
130+
131+
bool preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size);
132+
bool isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound);
133+
ze_result_t performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
134+
void *obtainLockedPtrFromDevice(void *ptr, size_t size);
126135
};
127136

128137
template <PRODUCT_FAMILY gfxProductFamily>

level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "shared/source/helpers/logical_state_helper.h"
1414
#include "shared/source/memory_manager/internal_allocation_storage.h"
1515
#include "shared/source/memory_manager/prefetch_manager.h"
16+
#include "shared/source/memory_manager/unified_memory_manager.h"
1617

1718
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
1819
#include "level_zero/core/source/device/bcs_split.h"
@@ -227,6 +228,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
227228

228229
ze_result_t ret;
229230

231+
NEO::SvmAllocationData *srcAllocData = nullptr;
232+
NEO::SvmAllocationData *dstAllocData = nullptr;
233+
bool srcAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(const_cast<void *>(srcptr), size, &srcAllocData);
234+
bool dstAllocFound = this->device->getDriverHandle()->findAllocationDataForRange(dstptr, size, &dstAllocData);
235+
if (preferCopyThroughLockedPtr(dstAllocData, dstAllocFound, srcAllocData, srcAllocFound, size)) {
236+
return performCpuMemcpy(dstptr, srcptr, size, dstAllocFound, hSignalEvent, numWaitEvents, phWaitEvents);
237+
}
238+
230239
if (this->isAppendSplitNeeded(dstptr, srcptr, size)) {
231240
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
232241
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents);
@@ -461,4 +470,91 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
461470
return inputRet;
462471
}
463472

473+
template <GFXCORE_FAMILY gfxCoreFamily>
474+
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::preferCopyThroughLockedPtr(NEO::SvmAllocationData *dstAlloc, bool dstFound, NEO::SvmAllocationData *srcAlloc, bool srcFound, size_t size) {
475+
size_t h2DThreshold = 2 * MemoryConstants::megaByte;
476+
size_t d2HThreshold = 1 * MemoryConstants::kiloByte;
477+
if (NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get() != -1) {
478+
h2DThreshold = NEO::DebugManager.flags.ExperimentalH2DCpuCopyThreshold.get();
479+
}
480+
if (NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get() != -1) {
481+
d2HThreshold = NEO::DebugManager.flags.ExperimentalD2HCpuCopyThreshold.get();
482+
}
483+
if (NEO::HwHelper::get(this->device->getHwInfo().platform.eRenderCoreFamily).copyThroughLockedPtrEnabled()) {
484+
return (!srcFound && isAllocUSMDeviceMemory(dstAlloc, dstFound) && size <= h2DThreshold) ||
485+
(!dstFound && isAllocUSMDeviceMemory(srcAlloc, srcFound) && size <= d2HThreshold);
486+
}
487+
return false;
488+
}
489+
490+
template <GFXCORE_FAMILY gfxCoreFamily>
491+
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isAllocUSMDeviceMemory(NEO::SvmAllocationData *alloc, bool allocFound) {
492+
return allocFound && (alloc->memoryType == InternalMemoryType::DEVICE_UNIFIED_MEMORY);
493+
}
494+
495+
template <GFXCORE_FAMILY gfxCoreFamily>
496+
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(void *dstptr, const void *srcptr, size_t size, bool isDstDeviceMemory, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) {
497+
498+
bool needsBarrier = (numWaitEvents > 0);
499+
if (needsBarrier) {
500+
this->appendBarrier(nullptr, numWaitEvents, phWaitEvents);
501+
}
502+
503+
bool needsFlushTagUpdate = this->latestFlushedBarrierCounter < this->barrierCounter;
504+
if (needsFlushTagUpdate) {
505+
this->csr->flushTagUpdate();
506+
}
507+
508+
Event *signalEvent = nullptr;
509+
if (hSignalEvent) {
510+
signalEvent = Event::fromHandle(hSignalEvent);
511+
}
512+
513+
const void *cpuMemcpySrcPtr = nullptr;
514+
void *cpuMemcpyDstPtr = nullptr;
515+
if (isDstDeviceMemory) {
516+
cpuMemcpySrcPtr = srcptr;
517+
cpuMemcpyDstPtr = obtainLockedPtrFromDevice(dstptr, size);
518+
} else {
519+
cpuMemcpySrcPtr = obtainLockedPtrFromDevice(const_cast<void *>(srcptr), size);
520+
cpuMemcpyDstPtr = dstptr;
521+
}
522+
523+
if (needsFlushTagUpdate) {
524+
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
525+
const auto waitStatus = this->csr->waitForCompletionWithTimeout(NEO::WaitParams{false, false, timeoutMicroseconds}, this->csr->peekTaskCount());
526+
if (waitStatus == NEO::WaitStatus::GpuHang) {
527+
return ZE_RESULT_ERROR_DEVICE_LOST;
528+
}
529+
this->latestFlushedBarrierCounter = this->barrierCounter;
530+
}
531+
532+
if (signalEvent) {
533+
signalEvent->setGpuStartTimestamp();
534+
}
535+
536+
memcpy_s(cpuMemcpyDstPtr, size, cpuMemcpySrcPtr, size);
537+
538+
if (signalEvent) {
539+
signalEvent->setGpuEndTimestamp();
540+
signalEvent->hostSignal();
541+
}
542+
return ZE_RESULT_SUCCESS;
543+
}
544+
545+
template <GFXCORE_FAMILY gfxCoreFamily>
546+
void *CommandListCoreFamilyImmediate<gfxCoreFamily>::obtainLockedPtrFromDevice(void *ptr, size_t size) {
547+
NEO::SvmAllocationData *allocData = nullptr;
548+
auto allocFound = this->device->getDriverHandle()->findAllocationDataForRange(ptr, size, &allocData);
549+
UNRECOVERABLE_IF(!allocFound);
550+
551+
auto alloc = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex());
552+
if (!alloc->isLocked()) {
553+
this->device->getDriverHandle()->getMemoryManager()->lockResource(alloc);
554+
}
555+
auto gpuAddress = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex())->getGpuAddress();
556+
auto offset = ptrDiff(ptr, gpuAddress);
557+
return ptrOffset(alloc->getLockedPtr(), offset);
558+
}
559+
464560
} // namespace L0

level_zero/core/source/event/event.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ struct Event : _ze_event_handle_t {
6565
void *getHostAddress() { return hostAddress; }
6666
virtual void setPacketsInUse(uint32_t value) = 0;
6767
uint32_t getCurrKernelDataIndex() const { return kernelCount - 1; }
68+
virtual void setGpuStartTimestamp() = 0;
69+
virtual void setGpuEndTimestamp() = 0;
6870

6971
size_t getContextStartOffset() const {
7072
return contextStartOffset;
@@ -143,6 +145,10 @@ struct Event : _ze_event_handle_t {
143145
size_t singlePacketSize = 0u;
144146
size_t eventPoolOffset = 0u;
145147

148+
size_t cpuStartTimestamp = 0u;
149+
size_t gpuStartTimestamp = 0u;
150+
size_t gpuEndTimestamp = 0u;
151+
146152
uint32_t kernelCount = 1u;
147153

148154
bool isTimestampEvent = false;
@@ -195,6 +201,8 @@ struct EventImp : public Event {
195201
uint32_t getPacketsInUse() override;
196202
uint32_t getPacketsUsedInLastKernel() override;
197203
void setPacketsInUse(uint32_t value) override;
204+
void setGpuStartTimestamp() override;
205+
void setGpuEndTimestamp() override;
198206

199207
std::unique_ptr<KernelEventCompletionData<TagSizeT>[]> kernelEventCompletionData;
200208

level_zero/core/source/event/event_impl.inl

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "shared/source/debug_settings/debug_settings_manager.h"
99
#include "shared/source/memory_manager/internal_allocation_storage.h"
10+
#include "shared/source/os_interface/os_time.h"
1011

1112
#include "level_zero/core/source/event/event.h"
1213
#include "level_zero/core/source/hw_helpers/l0_hw_helper.h"
@@ -167,18 +168,24 @@ template <typename TagSizeT>
167168
ze_result_t EventImp<TagSizeT>::hostEventSetValueTimestamps(TagSizeT eventVal) {
168169

169170
auto baseAddr = castToUint64(hostAddress);
170-
171-
auto eventTsSetFunc = [&eventVal](auto tsAddr) {
171+
auto eventTsSetFunc = [](auto tsAddr, TagSizeT value) {
172172
auto tsptr = reinterpret_cast<void *>(tsAddr);
173-
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&eventVal), sizeof(TagSizeT));
173+
memcpy_s(tsptr, sizeof(TagSizeT), static_cast<void *>(&value), sizeof(TagSizeT));
174174
};
175+
176+
TagSizeT timestampStart = eventVal;
177+
TagSizeT timestampEnd = eventVal;
178+
if (eventVal == Event::STATE_SIGNALED) {
179+
timestampStart = static_cast<TagSizeT>(this->gpuStartTimestamp);
180+
timestampEnd = static_cast<TagSizeT>(this->gpuEndTimestamp);
181+
}
175182
for (uint32_t i = 0; i < kernelCount; i++) {
176183
uint32_t packetsToSet = kernelEventCompletionData[i].getPacketsUsed();
177184
for (uint32_t j = 0; j < packetsToSet; j++) {
178-
eventTsSetFunc(baseAddr + contextStartOffset);
179-
eventTsSetFunc(baseAddr + globalStartOffset);
180-
eventTsSetFunc(baseAddr + contextEndOffset);
181-
eventTsSetFunc(baseAddr + globalEndOffset);
185+
eventTsSetFunc(baseAddr + contextStartOffset, timestampStart);
186+
eventTsSetFunc(baseAddr + globalStartOffset, timestampStart);
187+
eventTsSetFunc(baseAddr + contextEndOffset, timestampEnd);
188+
eventTsSetFunc(baseAddr + globalEndOffset, timestampEnd);
182189
baseAddr += singlePacketSize;
183190
}
184191
}
@@ -316,7 +323,6 @@ ze_result_t EventImp<TagSizeT>::queryKernelTimestamp(ze_kernel_timestamp_result_
316323
eventTsSetFunc(globalEndTS, result.context.kernelEnd);
317324
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
318325
}
319-
320326
return ZE_RESULT_SUCCESS;
321327
}
322328

@@ -379,6 +385,9 @@ void EventImp<TagSizeT>::resetPackets() {
379385
kernelEventCompletionData[i].setPacketsUsed(1);
380386
}
381387
kernelCount = 1;
388+
cpuStartTimestamp = 0;
389+
gpuStartTimestamp = 0;
390+
gpuEndTimestamp = 0;
382391
}
383392

384393
template <typename TagSizeT>
@@ -410,4 +419,21 @@ uint64_t EventImp<TagSizeT>::getPacketAddress(Device *device) {
410419
return address;
411420
}
412421

422+
template <typename TagSizeT>
423+
void EventImp<TagSizeT>::setGpuStartTimestamp() {
424+
if (isEventTimestampFlagSet()) {
425+
this->device->getGlobalTimestamps(&cpuStartTimestamp, &gpuStartTimestamp);
426+
cpuStartTimestamp = cpuStartTimestamp / this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
427+
}
428+
}
429+
430+
template <typename TagSizeT>
431+
void EventImp<TagSizeT>::setGpuEndTimestamp() {
432+
if (isEventTimestampFlagSet()) {
433+
auto resolution = this->device->getNEODevice()->getDeviceInfo().outProfilingTimerResolution;
434+
auto cpuEndTimestamp = this->device->getNEODevice()->getOSTime()->getCpuRawTimestamp() / resolution;
435+
this->gpuEndTimestamp = gpuStartTimestamp + (cpuEndTimestamp - cpuStartTimestamp);
436+
}
437+
}
438+
413439
} // namespace L0

level_zero/core/test/unit_tests/mocks/mock_event.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,11 @@ struct Mock<EventPool> : public EventPool {
6767

6868
class MockEvent : public ::L0::Event {
6969
public:
70+
using ::L0::Event::gpuEndTimestamp;
71+
using ::L0::Event::gpuStartTimestamp;
7072
using ::L0::Event::isCompleted;
7173
using ::L0::Event::l3FlushAppliedOnKernel;
74+
7275
MockEvent() {
7376
mockAllocation.reset(new NEO::MockGraphicsAllocation(0,
7477
NEO::AllocationType::INTERNAL_HOST_MEMORY,
@@ -119,7 +122,8 @@ class MockEvent : public ::L0::Event {
119122
void resetPackets() override {}
120123
void setPacketsInUse(uint32_t value) override {}
121124
uint64_t getPacketAddress(L0::Device *) override { return 0; }
122-
125+
void setGpuStartTimestamp() override {}
126+
void setGpuEndTimestamp() override {}
123127
std::unique_ptr<NEO::GraphicsAllocation> mockAllocation;
124128
};
125129

level_zero/core/test/unit_tests/sources/event/test_event.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2202,6 +2202,9 @@ HWTEST_F(EventTests,
22022202
}
22032203

22042204
struct MockEventCompletion : public EventImp<uint32_t> {
2205+
using EventImp<uint32_t>::gpuStartTimestamp;
2206+
using EventImp<uint32_t>::gpuEndTimestamp;
2207+
22052208
MockEventCompletion(L0::EventPool *eventPool, int index, L0::Device *device) : EventImp(eventPool, index, device) {
22062209
auto neoDevice = device->getNEODevice();
22072210
kernelEventCompletionData = std::make_unique<KernelEventCompletionData<uint32_t>[]>(EventPacketsCount::maxKernelSplit);
@@ -2260,5 +2263,14 @@ TEST_F(EventTests, WhenQueryingStatusAfterResetThenAccessMemory) {
22602263
EXPECT_EQ(event->assignKernelEventCompletionDataCounter, 2u);
22612264
}
22622265

2266+
// Verifies that reset() succeeds and clears the host-recorded GPU start/end
// timestamps previously stored on the event.
TEST_F(EventTests, WhenResetEventThenZeroCpuTimestamps) {
    auto mockEvent = std::make_unique<MockEventCompletion>(eventPool, 1u, device);

    // Seed non-zero values so the reset is observable.
    mockEvent->gpuStartTimestamp = 10u;
    mockEvent->gpuEndTimestamp = 20u;

    const auto resetResult = mockEvent->reset();
    EXPECT_EQ(resetResult, ZE_RESULT_SUCCESS);

    EXPECT_EQ(mockEvent->gpuStartTimestamp, 0u);
    EXPECT_EQ(mockEvent->gpuEndTimestamp, 0u);
}
2274+
22632275
} // namespace ult
22642276
} // namespace L0

0 commit comments

Comments
 (0)