Skip to content

Commit ff07a72

Browse files
Revert "performance: Optimize ULLS start on submit path"
This reverts commit ea78831. Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com> Source: 1d02f7f
1 parent e870729 commit ff07a72

File tree

6 files changed

+166
-90
lines changed

6 files changed

+166
-90
lines changed

shared/source/direct_submission/direct_submission_hw.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -82,6 +82,8 @@ class DirectSubmissionHw {
8282

8383
MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);
8484

85+
bool startRingBuffer();
86+
8587
MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);
8688
uint32_t getDispatchErrorCode();
8789

@@ -119,7 +121,6 @@ class DirectSubmissionHw {
119121
virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
120122
virtual void getTagAddressValue(TagData &tagData) = 0;
121123
void unblockGpu();
122-
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
123124
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
124125

125126
void cpuCachelineFlush(void *ptr, size_t size);
@@ -134,9 +135,6 @@ class DirectSubmissionHw {
134135
void dispatchStartSection(uint64_t gpuStartAddress);
135136
size_t getSizeStartSection();
136137

137-
size_t getUllsStateSize();
138-
void dispatchUllsState();
139-
140138
void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
141139
size_t getSizeSwitchRingBufferSection();
142140

shared/source/direct_submission/direct_submission_hw.inl

Lines changed: 56 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -499,6 +499,55 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
499499
return ret;
500500
}
501501

502+
template <typename GfxFamily, typename Dispatcher>
503+
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
504+
if (ringStart) {
505+
return true;
506+
}
507+
508+
size_t startSize = getSizeSemaphoreSection(false);
509+
if (!this->partitionConfigSet) {
510+
startSize += getSizePartitionRegisterConfigurationSection();
511+
}
512+
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
513+
startSize += getSizeSystemMemoryFenceAddress();
514+
}
515+
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
516+
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
517+
}
518+
519+
size_t requiredSize = startSize + getSizeDispatch(false, false, dispatchMonitorFenceRequired(true)) + getSizeEnd(false);
520+
if (ringCommandStream.getAvailableSpace() < requiredSize) {
521+
switchRingBuffers(nullptr);
522+
}
523+
uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
524+
525+
if (!this->partitionConfigSet) {
526+
dispatchPartitionRegisterConfiguration();
527+
this->partitionConfigSet = true;
528+
}
529+
530+
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
531+
dispatchSystemMemoryFenceAddress();
532+
this->systemMemoryFenceAddressSet = true;
533+
}
534+
535+
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
536+
preinitializeRelaxedOrderingSections();
537+
dispatchStaticRelaxedOrderingScheduler();
538+
initRelaxedOrderingRegisters();
539+
540+
this->relaxedOrderingInitialized = true;
541+
}
542+
543+
currentQueueWorkCount++;
544+
dispatchSemaphoreSection(currentQueueWorkCount);
545+
546+
ringStart = submit(gpuStartVa, startSize);
547+
548+
return ringStart;
549+
}
550+
502551
template <typename GfxFamily, typename Dispatcher>
503552
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer(bool blocking) {
504553
if (!ringStart) {
@@ -891,46 +940,15 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchB
891940
return ret;
892941
}
893942

894-
template <typename GfxFamily, typename Dispatcher>
895-
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getUllsStateSize() {
896-
size_t startSize = 0u;
897-
if (!this->partitionConfigSet) {
898-
startSize += getSizePartitionRegisterConfigurationSection();
899-
}
900-
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
901-
startSize += getSizeSystemMemoryFenceAddress();
902-
}
903-
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
904-
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
905-
}
906-
return startSize;
907-
}
908-
909-
template <typename GfxFamily, typename Dispatcher>
910-
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {
911-
if (!this->partitionConfigSet) {
912-
dispatchPartitionRegisterConfiguration();
913-
this->partitionConfigSet = true;
914-
}
915-
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
916-
dispatchSystemMemoryFenceAddress();
917-
this->systemMemoryFenceAddressSet = true;
918-
}
919-
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
920-
preinitializeRelaxedOrderingSections();
921-
dispatchStaticRelaxedOrderingScheduler();
922-
initRelaxedOrderingRegisters();
923-
924-
this->relaxedOrderingInitialized = true;
925-
}
926-
}
927-
928943
template <typename GfxFamily, typename Dispatcher>
929944
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
930945
if (batchBuffer.ringBufferRestartRequest) {
931946
this->stopRingBuffer(false);
932947
}
933948

949+
if (!this->startRingBuffer()) {
950+
return false;
951+
}
934952
lastSubmittedThrottle = batchBuffer.throttle;
935953
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
936954
bool inputRequiredMonitorFence = false;
@@ -941,7 +959,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
941959
}
942960
bool dispatchMonitorFence = this->dispatchMonitorFenceRequired(inputRequiredMonitorFence);
943961

944-
size_t dispatchSize = this->getUllsStateSize() + getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
962+
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
945963

946964
if (this->copyCommandBufferIntoRing(batchBuffer)) {
947965
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
@@ -960,14 +978,8 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
960978
}
961979
}
962980

963-
auto needStart = !this->ringStart;
964-
this->ringStart = true;
965-
auto startVA = ringCommandStream.getCurrentGpuAddressPosition();
966-
967981
this->switchRingBuffersNeeded(requiredMinimalSize, batchBuffer.allocationsForResidency);
968982

969-
this->dispatchUllsState();
970-
971983
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
972984
dispatchRelaxedOrderingQueueStall();
973985
}
@@ -979,10 +991,9 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
979991
void *currentPosition = dispatchWorkloadSection(batchBuffer, dispatchMonitorFence);
980992

981993
cpuCachelineFlush(currentPosition, dispatchSize);
994+
handleResidency();
982995

983-
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize)) {
984-
return false;
985-
}
996+
this->unblockGpu();
986997

987998
cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
988999
currentQueueWorkCount++;
@@ -997,17 +1008,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
9971008
return ringStart;
9981009
}
9991010

1000-
template <typename GfxFamily, typename Dispatcher>
1001-
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size) {
1002-
if (needStart) {
1003-
return this->submit(gpuAddress, size);
1004-
} else {
1005-
handleResidency();
1006-
this->unblockGpu();
1007-
return true;
1008-
}
1009-
}
1010-
10111011
template <typename GfxFamily, typename Dispatcher>
10121012
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
10131013
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

shared/source/direct_submission/windows/wddm_direct_submission.inl

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -55,9 +55,7 @@ WddmDirectSubmission<GfxFamily, Dispatcher>::~WddmDirectSubmission() {
5555

5656
template <typename GfxFamily, typename Dispatcher>
5757
inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
58-
auto needStart = !this->ringStart;
59-
this->ringStart = true;
60-
auto startVA = this->ringCommandStream.getCurrentGpuAddressPosition();
58+
this->startRingBuffer();
6159

6260
size_t requiredMinimalSize = this->getSizeSemaphoreSection(false) +
6361
Dispatcher::getSizeMonitorFence(this->rootDeviceEnvironment) +
@@ -73,7 +71,8 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
7371
Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
7472

7573
this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
76-
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize);
74+
this->handleResidency();
75+
this->unblockGpu();
7776
this->currentQueueWorkCount++;
7877

7978
this->updateTagValueImpl(this->currentRingBuffer);

shared/test/common/mocks/mock_direct_submission_hw.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -40,7 +40,6 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
4040
using BaseClass::dispatchSemaphoreSection;
4141
using BaseClass::dispatchStartSection;
4242
using BaseClass::dispatchSwitchRingBufferSection;
43-
using BaseClass::dispatchUllsState;
4443
using BaseClass::dispatchWorkloadSection;
4544
using BaseClass::getDiagnosticModeSection;
4645
using BaseClass::getSizeDisablePrefetcher;
@@ -80,6 +79,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
8079
using BaseClass::semaphorePtr;
8180
using BaseClass::semaphores;
8281
using BaseClass::setReturnAddress;
82+
using BaseClass::startRingBuffer;
8383
using BaseClass::stopRingBuffer;
8484
using BaseClass::switchRingBuffersAllocations;
8585
using BaseClass::switchRingBuffersNeeded;

shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2023 Intel Corporation
2+
* Copyright (C) 2020-2024 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -375,6 +375,74 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionSubmitFailWhenRingIsStartedT
375375
EXPECT_NE(0u, directSubmission.ringCommandStream.getUsed());
376376
}
377377

378+
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsStartedThenExpectNoStartCommandsDispatched) {
379+
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
380+
381+
bool ret = directSubmission.initialize(true, false);
382+
EXPECT_TRUE(ret);
383+
size_t usedSize = directSubmission.ringCommandStream.getUsed();
384+
385+
ret = directSubmission.startRingBuffer();
386+
EXPECT_TRUE(ret);
387+
EXPECT_EQ(usedSize, directSubmission.ringCommandStream.getUsed());
388+
}
389+
390+
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedThenExpectStartCommandsDispatched) {
391+
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
392+
393+
bool ret = directSubmission.initialize(false, false);
394+
EXPECT_TRUE(ret);
395+
size_t usedSize = directSubmission.ringCommandStream.getUsed();
396+
397+
ret = directSubmission.startRingBuffer();
398+
EXPECT_TRUE(ret);
399+
EXPECT_TRUE(directSubmission.ringStart);
400+
EXPECT_NE(usedSize, directSubmission.ringCommandStream.getUsed());
401+
}
402+
403+
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedSubmitFailThenExpectStartCommandsDispatchedRingNotStarted) {
404+
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
405+
406+
bool ret = directSubmission.initialize(false, false);
407+
EXPECT_TRUE(ret);
408+
size_t usedSize = directSubmission.ringCommandStream.getUsed();
409+
410+
directSubmission.submitReturn = false;
411+
ret = directSubmission.startRingBuffer();
412+
EXPECT_FALSE(ret);
413+
EXPECT_FALSE(directSubmission.ringStart);
414+
EXPECT_NE(usedSize, directSubmission.ringCommandStream.getUsed());
415+
}
416+
417+
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedAndSwitchBufferIsNeededThenExpectRingAllocationChangedStartCommandsDispatched) {
418+
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
419+
420+
bool ret = directSubmission.initialize(false, false);
421+
EXPECT_TRUE(ret);
422+
auto expectedRingBuffer = directSubmission.currentRingBuffer;
423+
GraphicsAllocation *oldRingBuffer = directSubmission.ringCommandStream.getGraphicsAllocation();
424+
425+
auto requiredSize = directSubmission.getSizeSemaphoreSection(false);
426+
if (directSubmission.miMemFenceRequired) {
427+
requiredSize += directSubmission.getSizeSystemMemoryFenceAddress();
428+
}
429+
if (directSubmission.isRelaxedOrderingEnabled()) {
430+
requiredSize += RelaxedOrderingHelper::getSizeRegistersInit<FamilyType>();
431+
}
432+
433+
directSubmission.ringCommandStream.getSpace(directSubmission.ringCommandStream.getAvailableSpace() - requiredSize);
434+
435+
ret = directSubmission.startRingBuffer();
436+
auto actualRingBuffer = directSubmission.currentRingBuffer;
437+
438+
EXPECT_TRUE(ret);
439+
EXPECT_TRUE(directSubmission.ringStart);
440+
EXPECT_NE(oldRingBuffer, directSubmission.ringCommandStream.getGraphicsAllocation());
441+
EXPECT_EQ(requiredSize, directSubmission.ringCommandStream.getUsed());
442+
443+
EXPECT_NE(expectedRingBuffer, actualRingBuffer);
444+
}
445+
378446
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStopWhenStopRingIsCalledThenExpectStopCommandDispatched) {
379447
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);
380448

0 commit comments

Comments
 (0)