Skip to content

Commit c330579

Browse files
Revert "Allocate RTDispatchGlobals as unboxed array"
This reverts commit eaa4965.

Signed-off-by: Jim Snow <jim.m.snow@intel.com>
Source: f976c7a
1 parent bae8735 commit c330579

File tree

8 files changed

+68
-94
lines changed

8 files changed

+68
-94
lines changed

level_zero/core/source/kernel/kernel_imp.cpp

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -907,7 +907,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
907907
uint32_t bvhLevels = NEO::RayTracingHelper::maxBvhLevels;
908908
auto arg = this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.rtDispatchGlobals;
909909
if (arg.pointerSize == 0) {
910-
// application is allocating its own RTDispatchGlobals manually
910+
// kernel is allocating its own RTDispatchGlobals manually
911911
neoDevice->initializeRayTracing(0);
912912
} else {
913913
neoDevice->initializeRayTracing(bvhLevels);
@@ -916,18 +916,17 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
916916
return ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY;
917917
}
918918

919-
for (auto rtStack : rtDispatchGlobalsInfo->rtStacks) {
920-
this->residencyContainer.push_back(rtStack);
919+
for (auto rtDispatchGlobals : rtDispatchGlobalsInfo->rtDispatchGlobals) {
920+
this->residencyContainer.push_back(rtDispatchGlobals);
921921
}
922922

923-
auto address = rtDispatchGlobalsInfo->rtDispatchGlobalsArray->getGpuAddressToPatch();
923+
auto address = rtDispatchGlobalsInfo->rtDispatchGlobals[0]->getGpuAddressToPatch();
924924
NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
925925
arg,
926926
static_cast<uintptr_t>(address));
927927

928-
this->residencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
928+
this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
929929
}
930-
this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer());
931930
}
932931

933932
return ZE_RESULT_SUCCESS;

level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -929,7 +929,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndNoRTDispatchGlobalsIs
929929
neoDevice->executionEnvironment->memoryManager.swap(otherMemoryManager);
930930
}
931931

932-
HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFailsThenRayTracingIsNotInitialized, IsAtLeastXeHpgCore) {
932+
HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTDispatchGlobalsArrayAllocationFailsThenRayTracingIsNotInitialized, IsAtLeastXeHpgCore) {
933933
KernelDescriptor mockDescriptor = {};
934934
mockDescriptor.kernelAttributes.flags.hasRTCalls = true;
935935
mockDescriptor.kernelMetadata.kernelName = "rt_test";
@@ -963,7 +963,7 @@ HWTEST2_F(KernelImmutableDataTests, whenHasRTCallsIsTrueAndRTStackAllocationFail
963963
neoDevice->rtDispatchGlobalsForceAllocation = false;
964964

965965
std::unique_ptr<NEO::MemoryManager> otherMemoryManager;
966-
// Ensure that allocating RTDispatchGlobals succeeds, but first RTStack allocation fails.
966+
// Ensure that allocating RTDispatchGlobals succeeds, but the array allocation fails.
967967
otherMemoryManager = std::make_unique<NEO::FailMemoryManager>(1, *neoDevice->executionEnvironment);
968968
neoDevice->executionEnvironment->memoryManager.swap(otherMemoryManager);
969969

@@ -1091,7 +1091,7 @@ TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenCrossThreadDataIsPatche
10911091
EXPECT_NE(nullptr, rtDispatchGlobals);
10921092

10931093
auto dispatchGlobalsAddressPatched = *reinterpret_cast<uint64_t *>(ptrOffset(crossThreadData.get(), rtGlobalPointerPatchOffset));
1094-
auto dispatchGlobalsGpuAddressOffset = static_cast<uint64_t>(rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch());
1094+
auto dispatchGlobalsGpuAddressOffset = static_cast<uint64_t>(rtDispatchGlobals->rtDispatchGlobals[0]->getGpuAddressToPatch());
10951095
EXPECT_EQ(dispatchGlobalsGpuAddressOffset, dispatchGlobalsAddressPatched);
10961096

10971097
kernel->crossThreadData.release();

opencl/test/unit_test/helpers/ray_tracing_helper_tests.cpp

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -33,22 +33,17 @@ TEST(RayTracingHelperTests, whenMemoryBackedFifoSizeIsRequestedThenCorrectValueI
3333
}
3434

3535
TEST(RayTracingHelperTests, whenGlobalDispatchSizeIsRequestedThenCorrectValueIsReturned) {
36-
size_t expectedSize = alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize);
37-
size_t size = RayTracingHelper::getDispatchGlobalSize();
38-
EXPECT_EQ(expectedSize, size);
39-
}
40-
41-
TEST(RayTracingHelperTests, whenRTStackSizeIsRequestedThenCorrectValueIsReturned) {
4236
MockClDevice device{new MockDevice};
4337
MockContext context(&device);
4438

4539
uint32_t maxBvhLevel = 2;
4640
uint32_t extraBytesLocal = 20;
4741
uint32_t extraBytesGlobal = 100;
48-
uint32_t tiles = 2;
4942

50-
size_t expectedSize = RayTracingHelper::getStackSizePerRay(maxBvhLevel, extraBytesLocal) * (RayTracingHelper::getNumRtStacks(device.getDevice()) / tiles) + extraBytesGlobal;
51-
size_t size = RayTracingHelper::getRTStackSizePerTile(device.getDevice(), tiles, maxBvhLevel, extraBytesLocal, extraBytesGlobal);
43+
size_t expectedSize = alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize) +
44+
(RayTracingHelper::hitInfoSize + RayTracingHelper::bvhStackSize * maxBvhLevel + extraBytesLocal) * RayTracingHelper::getNumRtStacks(device.getDevice()) +
45+
extraBytesGlobal;
46+
size_t size = RayTracingHelper::getDispatchGlobalSize(device.getDevice(), maxBvhLevel, extraBytesLocal, extraBytesGlobal);
5247
EXPECT_EQ(expectedSize, size);
5348
}
5449

shared/source/device/device.cpp

Lines changed: 42 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -663,13 +663,13 @@ void Device::finalizeRayTracing() {
663663
if (rtDispatchGlobalsInfo == nullptr) {
664664
continue;
665665
}
666-
for (size_t j = 0; j < rtDispatchGlobalsInfo->rtStacks.size(); j++) {
667-
getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtStacks[j]);
668-
rtDispatchGlobalsInfo->rtStacks[j] = nullptr;
666+
for (size_t j = 0; j < rtDispatchGlobalsInfo->rtDispatchGlobals.size(); j++) {
667+
getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobals[j]);
668+
rtDispatchGlobalsInfo->rtDispatchGlobals[j] = nullptr;
669669
}
670670

671-
getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
672-
rtDispatchGlobalsInfo->rtDispatchGlobalsArray = nullptr;
671+
getMemoryManager()->freeGraphicsMemory(rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation);
672+
rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = nullptr;
673673

674674
delete rtDispatchGlobalsInfos[i];
675675
rtDispatchGlobalsInfos[i] = nullptr;
@@ -749,55 +749,37 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) {
749749

750750
uint32_t extraBytesLocal = 0;
751751
uint32_t extraBytesGlobal = 0;
752-
uint32_t dispatchGlobalsStride = MemoryConstants::pageSize64k;
753-
UNRECOVERABLE_IF(RayTracingHelper::getDispatchGlobalSize() > dispatchGlobalsStride);
754-
755-
bool allocFailed = false;
752+
auto size = RayTracingHelper::getDispatchGlobalSize(*this, maxBvhLevels, extraBytesLocal, extraBytesGlobal);
756753

757754
const auto deviceCount = HwHelper::getSubDevicesCount(executionEnvironment->rootDeviceEnvironments[getRootDeviceIndex()]->getHardwareInfo());
758-
auto dispatchGlobalsSize = deviceCount * dispatchGlobalsStride;
759-
auto rtStackSize = RayTracingHelper::getRTStackSizePerTile(*this, deviceCount, maxBvhLevels, extraBytesLocal, extraBytesGlobal);
760755

761-
std::unique_ptr<RTDispatchGlobalsInfo> dispatchGlobalsInfo = std::make_unique<RTDispatchGlobalsInfo>();
756+
auto dispatchGlobalsInfo = new RTDispatchGlobalsInfo(nullptr);
762757
if (dispatchGlobalsInfo == nullptr) {
763758
return;
764759
}
765760

766761
auto &hwInfo = getHardwareInfo();
767762
auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
768763

769-
GraphicsAllocation *dispatchGlobalsArrayAllocation = nullptr;
770-
771-
AllocationProperties arrayAllocProps(getRootDeviceIndex(), true, dispatchGlobalsSize,
772-
AllocationType::BUFFER, true, getDeviceBitfield());
773-
arrayAllocProps.flags.resource48Bit = true;
774-
arrayAllocProps.flags.isUSMDeviceAllocation = true;
775-
dispatchGlobalsArrayAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(arrayAllocProps);
776-
777-
if (dispatchGlobalsArrayAllocation == nullptr) {
778-
return;
779-
}
764+
std::vector<uint64_t> gpuAddressVector;
765+
bool allocFailed = false;
780766

781767
for (unsigned int tile = 0; tile < deviceCount; tile++) {
782-
DeviceBitfield deviceBitfield =
783-
(deviceCount == 1)
784-
? this->getDeviceBitfield()
785-
: subdevices[tile]->getDeviceBitfield();
786-
787-
AllocationProperties allocProps(getRootDeviceIndex(), true, rtStackSize, AllocationType::BUFFER, true, deviceBitfield);
768+
AllocationProperties allocProps(getRootDeviceIndex(), true, size, AllocationType::BUFFER, true, getDeviceBitfield());
788769
allocProps.flags.resource48Bit = true;
789770
allocProps.flags.isUSMDeviceAllocation = true;
790771

791-
auto rtStackAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(allocProps);
772+
auto dispatchGlobalsAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(allocProps);
792773

793-
if (rtStackAllocation == nullptr) {
774+
if (dispatchGlobalsAllocation == nullptr) {
794775
allocFailed = true;
795776
break;
796777
}
797778

779+
auto dispatchGlobalsPtr = dispatchGlobalsAllocation->getGpuAddress();
798780
struct RTDispatchGlobals dispatchGlobals = {0};
799781

800-
dispatchGlobals.rtMemBasePtr = rtStackAllocation->getGpuAddress();
782+
dispatchGlobals.rtMemBasePtr = size + dispatchGlobalsPtr;
801783
dispatchGlobals.callStackHandlerKSP = reinterpret_cast<uint64_t>(nullptr);
802784
dispatchGlobals.stackSizePerRay = 0;
803785
dispatchGlobals.numDSSRTStacks = RayTracingHelper::stackDssMultiplier;
@@ -806,27 +788,45 @@ void Device::allocateRTDispatchGlobals(uint32_t maxBvhLevels) {
806788
uint32_t *dispatchGlobalsAsArray = reinterpret_cast<uint32_t *>(&dispatchGlobals);
807789
dispatchGlobalsAsArray[7] = 1;
808790

809-
MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsArrayAllocation),
791+
MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsAllocation),
810792
*this,
811-
dispatchGlobalsArrayAllocation,
812-
tile * dispatchGlobalsStride,
793+
dispatchGlobalsAllocation,
794+
0,
813795
&dispatchGlobals,
814796
sizeof(RTDispatchGlobals));
815797

816-
dispatchGlobalsInfo->rtStacks.push_back(rtStackAllocation);
798+
dispatchGlobalsInfo->rtDispatchGlobals.push_back(dispatchGlobalsAllocation);
799+
gpuAddressVector.push_back(dispatchGlobalsAllocation->getGpuAddress());
817800
}
818801

819-
if (allocFailed) {
820-
for (auto allocation : dispatchGlobalsInfo->rtStacks) {
802+
GraphicsAllocation *dispatchGlobalsArrayAllocation = nullptr;
803+
size_t arrayAllocSize = sizeof(uint64_t) * deviceCount;
804+
805+
if (!allocFailed) {
806+
AllocationProperties arrayAllocProps(getRootDeviceIndex(), true, arrayAllocSize,
807+
AllocationType::BUFFER, true, getDeviceBitfield());
808+
arrayAllocProps.flags.resource48Bit = true;
809+
arrayAllocProps.flags.isUSMDeviceAllocation = true;
810+
dispatchGlobalsArrayAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(arrayAllocProps);
811+
}
812+
813+
if (dispatchGlobalsArrayAllocation == nullptr) {
814+
for (auto allocation : dispatchGlobalsInfo->rtDispatchGlobals) {
821815
getMemoryManager()->freeGraphicsMemory(allocation);
822816
}
823-
824-
getMemoryManager()->freeGraphicsMemory(dispatchGlobalsArrayAllocation);
817+
delete dispatchGlobalsInfo;
825818
return;
826819
}
827820

828-
dispatchGlobalsInfo->rtDispatchGlobalsArray = dispatchGlobalsArrayAllocation;
829-
rtDispatchGlobalsInfos[maxBvhLevels] = dispatchGlobalsInfo.release();
821+
MemoryTransferHelper::transferMemoryToAllocation(hwInfoConfig.isBlitCopyRequiredForLocalMemory(this->getHardwareInfo(), *dispatchGlobalsArrayAllocation),
822+
*this,
823+
dispatchGlobalsArrayAllocation,
824+
0,
825+
gpuAddressVector.data(),
826+
arrayAllocSize);
827+
828+
dispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = dispatchGlobalsArrayAllocation;
829+
rtDispatchGlobalsInfos[maxBvhLevels] = dispatchGlobalsInfo;
830830
}
831831

832832
} // namespace NEO

shared/source/device/device.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,8 +41,10 @@ struct EngineGroupT {
4141
using EngineGroupsT = std::vector<EngineGroupT>;
4242

4343
struct RTDispatchGlobalsInfo {
44-
GraphicsAllocation *rtDispatchGlobalsArray = nullptr;
45-
std::vector<GraphicsAllocation *> rtStacks; // per tile
44+
RTDispatchGlobalsInfo(GraphicsAllocation *rtDispatchGlobalsArrayAllocation)
45+
: rtDispatchGlobalsArrayAllocation(rtDispatchGlobalsArrayAllocation){};
46+
std::vector<GraphicsAllocation *> rtDispatchGlobals; // per tile
47+
GraphicsAllocation *rtDispatchGlobalsArrayAllocation; // above array as visible from device
4648
};
4749

4850
class Device : public ReferenceTrackedObject<Device> {

shared/source/helpers/ray_tracing_helper.h

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,10 @@ class RayTracingHelper : public NonCopyableOrMovableClass {
2424
static constexpr uint32_t memoryBackedFifoSizePerDss = 8 * KB;
2525
static constexpr uint32_t maxBvhLevels = 8;
2626

27-
static size_t getDispatchGlobalSize() {
28-
return static_cast<size_t>(alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize));
29-
}
30-
31-
static size_t getRTStackSizePerTile(const Device &device, uint32_t tiles, uint32_t maxBvhLevel, uint32_t extraBytesLocal, uint32_t extraBytesGlobal) {
32-
return static_cast<size_t>(getStackSizePerRay(maxBvhLevel, extraBytesLocal) * (getNumRtStacks(device) / tiles) + extraBytesGlobal);
27+
static size_t getDispatchGlobalSize(const Device &device, uint32_t maxBvhLevel, uint32_t extraBytesLocal, uint32_t extraBytesGlobal) {
28+
return static_cast<size_t>(alignUp(sizeof(RTDispatchGlobals), MemoryConstants::cacheLineSize) +
29+
getStackSizePerRay(maxBvhLevel, extraBytesLocal) * getNumRtStacks(device) +
30+
extraBytesGlobal);
3331
}
3432

3533
static size_t getTotalMemoryBackedFifoSize(const Device &device) {

shared/test/common/mocks/mock_device.h

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -168,12 +168,12 @@ class MockDevice : public RootDevice {
168168
for (unsigned int i = 0; i < rtDispatchGlobalsInfos.size(); i++) {
169169
auto rtDispatchGlobalsInfo = rtDispatchGlobalsInfos[i];
170170
if (rtDispatchGlobalsForceAllocation == true && rtDispatchGlobalsInfo != nullptr) {
171-
for (unsigned int j = 0; j < rtDispatchGlobalsInfo->rtStacks.size(); j++) {
172-
delete rtDispatchGlobalsInfo->rtStacks[j];
173-
rtDispatchGlobalsInfo->rtStacks[j] = nullptr;
171+
for (unsigned int j = 0; j < rtDispatchGlobalsInfo->rtDispatchGlobals.size(); j++) {
172+
delete rtDispatchGlobalsInfo->rtDispatchGlobals[j];
173+
rtDispatchGlobalsInfo->rtDispatchGlobals[j] = nullptr;
174174
}
175-
delete rtDispatchGlobalsInfo->rtDispatchGlobalsArray;
176-
rtDispatchGlobalsInfo->rtDispatchGlobalsArray = nullptr;
175+
delete rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation;
176+
rtDispatchGlobalsInfo->rtDispatchGlobalsArrayAllocation = nullptr;
177177
delete rtDispatchGlobalsInfos[i];
178178
rtDispatchGlobalsInfos[i] = nullptr;
179179
}

shared/test/unit_test/device/neo_device_tests.cpp

Lines changed: 1 addition & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -98,26 +98,6 @@ TEST_F(DeviceTest, whenAllocateRTDispatchGlobalsIsCalledThenRTDispatchGlobalsIsA
9898
EXPECT_NE(nullptr, pDevice->getRTDispatchGlobals(3));
9999
}
100100

101-
HWTEST2_F(DeviceTest, whenAllocateRTDispatchGlobalsIsCalledAndRTStackAllocationFailsRTDispatchGlobalsIsNotAllocated, IsPVC) {
102-
DebugManagerStateRestore dbgRestorer;
103-
104-
DebugManager.flags.CreateMultipleSubDevices.set(2);
105-
pDevice->deviceBitfield = 3;
106-
107-
pDevice->subdevices.push_back(new SubDevice(pDevice->executionEnvironment, 0, *pDevice));
108-
pDevice->subdevices.push_back(new SubDevice(pDevice->executionEnvironment, 1, *pDevice));
109-
110-
std::unique_ptr<NEO::MemoryManager> otherMemoryManager;
111-
otherMemoryManager = std::make_unique<NEO::MockMemoryManagerWithCapacity>(*pDevice->executionEnvironment);
112-
static_cast<NEO::MockMemoryManagerWithCapacity &>(*otherMemoryManager).capacity = 25000000;
113-
pDevice->executionEnvironment->memoryManager.swap(otherMemoryManager);
114-
115-
pDevice->initializeRayTracing(5);
116-
EXPECT_EQ(nullptr, pDevice->getRTDispatchGlobals(3));
117-
118-
pDevice->executionEnvironment->memoryManager.swap(otherMemoryManager);
119-
}
120-
121101
TEST_F(DeviceTest, givenDispatchGlobalsAllocationFailsThenRTDispatchGlobalsInfoIsNull) {
122102
std::unique_ptr<NEO::MemoryManager> otherMemoryManager;
123103
otherMemoryManager = std::make_unique<NEO::FailMemoryManager>(1, *pDevice->getExecutionEnvironment());
@@ -644,4 +624,4 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DeviceTests, givenNonDebuggableOsContextWhenDeviceC
644624
auto device = deviceFactory.rootDevices[0];
645625
auto csr = device->allEngines[device->defaultEngineIndex].commandStreamReceiver;
646626
EXPECT_EQ(0u, csr->peekLatestSentTaskCount());
647-
}
627+
}

0 commit comments

Comments
 (0)