Skip to content

Commit e960802

Browse files
Add pipeline select state tracking
This optimization removes the pipeline select command from the command list preamble and hands it over to the command queue, which programs it whenever a pipeline select state update is necessary. The code is disabled by default and can be enabled with a debug key (EnablePipelineSelectTracking). Related-To: NEO-5019 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
1 parent 7aecea5 commit e960802

25 files changed

+513
-180
lines changed

level_zero/core/source/cmdlist/cmdlist.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,10 @@ struct CommandList : _ze_command_list_handle_t {
276276
void makeResidentAndMigrate(bool);
277277
void migrateSharedAllocations();
278278

279+
bool getSystolicModeSupport() const {
280+
return systolicModeSupport;
281+
}
282+
279283
ze_context_handle_t hContext = nullptr;
280284
std::vector<Kernel *> printfKernelContainer;
281285
CommandQueue *cmdQImmediate = nullptr;
@@ -318,6 +322,7 @@ struct CommandList : _ze_command_list_handle_t {
318322
bool performMemoryPrefetch = false;
319323
bool multiReturnPointCommandList = false;
320324
bool systolicModeSupport = false;
325+
bool pipelineSelectStateTracking = false;
321326
};
322327

323328
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
140140
commandContainer.setReservedSshSize(getReserveSshSize());
141141
DeviceImp *deviceImp = static_cast<DeviceImp *>(device);
142142
auto returnValue = commandContainer.initialize(deviceImp->getActiveDevice(), deviceImp->allocationsForReuse.get(), !isCopyOnly());
143-
commandContainer.systolicModeSupport = this->systolicModeSupport;
143+
if (!this->pipelineSelectStateTracking) {
144+
// allow systolic support set in container when tracking disabled
145+
// setting systolic support allows dispatching untracked command in legacy mode
146+
commandContainer.systolicModeSupport = this->systolicModeSupport;
147+
}
144148

145149
ze_result_t returnType = parseErrorCode(returnValue);
146150
if (returnType == ZE_RESULT_SUCCESS) {
@@ -2323,10 +2327,21 @@ void CommandListCoreFamily<gfxCoreFamily>::updateStreamProperties(Kernel &kernel
23232327
containsAnyKernel = true;
23242328
}
23252329

2326-
finalStreamState.frontEndState.setProperties(isCooperative, kernelAttributes.flags.requiresDisabledEUFusion, disableOverdispatch, -1, hwInfo);
2327-
bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
23282330
auto logicalStateHelperBlock = !getLogicalStateHelper();
23292331

2332+
finalStreamState.pipelineSelect.setProperties(true, false, kernelAttributes.flags.usesSystolicPipelineSelectMode, hwInfo);
2333+
if (this->pipelineSelectStateTracking && finalStreamState.pipelineSelect.isDirty() && logicalStateHelperBlock) {
2334+
NEO::PipelineSelectArgs pipelineSelectArgs;
2335+
pipelineSelectArgs.systolicPipelineSelectMode = kernelAttributes.flags.usesSystolicPipelineSelectMode;
2336+
pipelineSelectArgs.systolicPipelineSelectSupport = this->systolicModeSupport;
2337+
2338+
NEO::PreambleHelper<GfxFamily>::programPipelineSelect(commandContainer.getCommandStream(),
2339+
pipelineSelectArgs,
2340+
hwInfo);
2341+
}
2342+
2343+
finalStreamState.frontEndState.setProperties(isCooperative, kernelAttributes.flags.requiresDisabledEUFusion, disableOverdispatch, -1, hwInfo);
2344+
bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
23302345
if (finalStreamState.frontEndState.isDirty() && logicalStateHelperBlock) {
23312346
if (isPatchingVfeStateAllowed) {
23322347
auto pVfeStateAddress = NEO::PreambleHelper<GfxFamily>::getSpaceForVfeState(commandContainer.getCommandStream(), hwInfo, engineGroupType);
@@ -2348,14 +2363,15 @@ void CommandListCoreFamily<gfxCoreFamily>::updateStreamProperties(Kernel &kernel
23482363
}
23492364

23502365
finalStreamState.stateComputeMode.setProperties(false, kernelAttributes.numGrfRequired, kernelAttributes.threadArbitrationPolicy, device->getDevicePreemptionMode(), hwInfo);
2351-
23522366
if (finalStreamState.stateComputeMode.isDirty() && logicalStateHelperBlock) {
23532367
bool isRcs = (this->engineGroupType == NEO::EngineGroupType::RenderCompute);
2368+
NEO::PipelineSelectArgs pipelineSelectArgs;
2369+
pipelineSelectArgs.systolicPipelineSelectMode = kernelAttributes.flags.usesSystolicPipelineSelectMode;
2370+
pipelineSelectArgs.systolicPipelineSelectSupport = this->systolicModeSupport;
2371+
23542372
NEO::EncodeComputeMode<GfxFamily>::programComputeModeCommandWithSynchronization(
2355-
*commandContainer.getCommandStream(), finalStreamState.stateComputeMode, {}, false, hwInfo, isRcs, nullptr);
2373+
*commandContainer.getCommandStream(), finalStreamState.stateComputeMode, pipelineSelectArgs, false, hwInfo, isRcs, nullptr);
23562374
}
2357-
2358-
finalStreamState.pipelineSelect.setProperties(true, false, kernelAttributes.flags.usesSystolicPipelineSelectMode, hwInfo);
23592375
}
23602376

23612377
template <GFXCORE_FAMILY gfxCoreFamily>

level_zero/core/source/cmdlist/cmdlist_imp.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ namespace L0 {
2828

2929
CommandList::CommandList(uint32_t numIddsPerBlock) : commandContainer(numIddsPerBlock) {
3030
multiReturnPointCommandList = L0HwHelper::enableMultiReturnPointCommandList();
31+
pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking();
3132
}
3233

3334
CommandListAllocatorFn commandListFactory[IGFX_MAX_PRODUCT] = {};

level_zero/core/source/cmdqueue/cmdqueue.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr
4545
}
4646

4747
multiReturnPointCommandList = L0HwHelper::enableMultiReturnPointCommandList();
48+
pipelineSelectStateTracking = L0HwHelper::enablePipelineSelectStateTracking();
4849
}
4950

5051
ze_result_t CommandQueueImp::destroy() {

level_zero/core/source/cmdqueue/cmdqueue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
6868
bool isCopyOnlyCommandQueue = false;
6969
bool internalUsage = false;
7070
bool multiReturnPointCommandList = false;
71+
bool pipelineSelectStateTracking = false;
7172
};
7273

7374
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,

level_zero/core/source/cmdqueue/cmdqueue_hw.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ struct CommandQueueHw : public CommandQueueImp {
4444
size_t estimateFrontEndCmdSize();
4545
size_t estimateFrontEndCmdSize(bool isFrontEndDirty);
4646

47-
size_t estimatePipelineSelect();
4847
void programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &commandStream);
4948

5049
MOCKABLE_VIRTUAL void handleScratchSpace(NEO::HeapContainer &heapContainer,
@@ -171,7 +170,16 @@ struct CommandQueueHw : public CommandQueueImp {
171170
inline void updateTaskCountAndPostSync(bool isDispatchTaskCountPostSyncRequired);
172171
inline ze_result_t waitForCommandQueueCompletionAndCleanHeapContainer();
173172
inline ze_result_t handleSubmissionAndCompletionResults(NEO::SubmissionStatus submitRet, ze_result_t completionRet);
174-
inline void updatePipelineSelectState(CommandList *commandList);
173+
inline size_t estimatePipelineSelectCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrStateCopy,
174+
const NEO::StreamProperties &cmdListRequired,
175+
const NEO::StreamProperties &cmdListFinal,
176+
bool &gpgpuEnabled);
177+
inline size_t estimatePipelineSelectCmdSize();
178+
inline void programOneCmdListPipelineSelect(CommandList *commandList,
179+
NEO::LinearStream &commandStream,
180+
NEO::StreamProperties &csrState,
181+
const NEO::StreamProperties &cmdListRequired,
182+
const NEO::StreamProperties &cmdListFinal);
175183

176184
size_t alignedChildStreamPadding{};
177185
};

level_zero/core/source/cmdqueue/cmdqueue_hw.inl

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,18 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
146146
this->csr->programHardwareContext(child);
147147
this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled);
148148

149-
this->programPipelineSelectIfGpgpuDisabled(child);
149+
auto &csrStateProperties = csr->getStreamProperties();
150+
if (!this->pipelineSelectStateTracking) {
151+
this->programPipelineSelectIfGpgpuDisabled(child);
152+
} else {
153+
// Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands
154+
auto commandList = CommandList::fromHandle(phCommandLists[0]);
155+
auto &requiredStreamState = commandList->getRequiredStreamState();
156+
// Provide cmdlist required state as cmdlist final state, so csr state does not transition to final
157+
// By preserving required state in csr - keeping csr state not dirty - it will not dispatch 1st command list pipeline select/systolic in main loop
158+
// Csr state will transition to final of 1st command list in main loop
159+
this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, requiredStreamState);
160+
}
150161
this->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled(ctx.isDebugEnabled, child);
151162
this->programStateBaseAddressWithGsbaIfDirty(ctx, phCommandLists[0], child);
152163
this->programCsrBaseAddressIfPreemptionModeInitial(ctx.isPreemptionModeInitial, child);
@@ -157,7 +168,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
157168

158169
this->programActivePartitionConfig(ctx.isProgramActivePartitionConfigRequired, child);
159170
this->encodeKernelArgsBufferAndMakeItResident();
160-
auto &csrStateProperties = csr->getStreamProperties();
171+
161172
bool shouldProgramVfe = this->csr->getLogicalStateHelper() && ctx.frontEndStateDirty;
162173
this->programFrontEndAndClearDirtyFlag(shouldProgramVfe, ctx, child, csrStateProperties);
163174

@@ -171,7 +182,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
171182
auto &finalStreamState = commandList->getFinalStreamState();
172183

173184
this->updateOneCmdListPreemptionModeAndCtxStatePreemption(ctx, commandList->getCommandListPreemptionMode(), child);
174-
this->updatePipelineSelectState(commandList);
185+
this->programOneCmdListPipelineSelect(commandList, child, csrStateProperties, requiredStreamState, finalStreamState);
175186
this->programOneCmdListFrontEndIfDirty(ctx, child, csrStateProperties, requiredStreamState, finalStreamState);
176187

177188
this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress());
@@ -388,11 +399,6 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandL
388399
return estimatedSize;
389400
}
390401

391-
template <GFXCORE_FAMILY gfxCoreFamily>
392-
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelect() {
393-
return NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getHwInfo());
394-
}
395-
396402
template <GFXCORE_FAMILY gfxCoreFamily>
397403
void CommandQueueHw<gfxCoreFamily>::programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &cmdStream) {
398404
bool gpgpuEnabled = this->csr->getPreambleSetFlag();
@@ -647,24 +653,22 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
647653
uint32_t numCommandLists) {
648654

649655
size_t linearStreamSizeEstimate = 0u;
650-
bool gpgpuEnabled = csr->getPreambleSetFlag();
651-
652-
if (!gpgpuEnabled) {
653-
linearStreamSizeEstimate += estimatePipelineSelect();
654-
}
655656

656657
linearStreamSizeEstimate += estimateFrontEndCmdSize(ctx.frontEndStateDirty);
658+
linearStreamSizeEstimate += estimatePipelineSelectCmdSize();
657659

658-
if (frontEndTrackingEnabled()) {
660+
if (this->pipelineSelectStateTracking || frontEndTrackingEnabled()) {
659661
bool frontEndStateDirtyCopy = ctx.frontEndStateDirty;
660662
auto streamPropertiesCopy = csr->getStreamProperties();
663+
bool gpgpuEnabledCopy = csr->getPreambleSetFlag();
661664
for (uint32_t i = 0; i < numCommandLists; i++) {
662665
auto cmdList = CommandList::fromHandle(phCommandLists[i]);
663666
auto &requiredStreamState = cmdList->getRequiredStreamState();
664667
auto &finalStreamState = cmdList->getFinalStreamState();
665668

666669
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirtyCopy, ctx.engineInstanced, cmdList,
667670
streamPropertiesCopy, requiredStreamState, finalStreamState);
671+
linearStreamSizeEstimate += estimatePipelineSelectCmdSizeForMultipleCommandLists(streamPropertiesCopy, requiredStreamState, finalStreamState, gpgpuEnabledCopy);
668672
}
669673
}
670674

@@ -1119,14 +1123,59 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::handleSubmissionAndCompletionResults(
11191123
}
11201124

11211125
template <GFXCORE_FAMILY gfxCoreFamily>
1122-
void CommandQueueHw<gfxCoreFamily>::updatePipelineSelectState(CommandList *commandList) {
1123-
auto &streamProperties = this->csr->getStreamProperties();
1126+
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelectCmdSize() {
1127+
if (!this->pipelineSelectStateTracking) {
1128+
bool gpgpuEnabled = csr->getPreambleSetFlag();
1129+
return !gpgpuEnabled * NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getHwInfo());
1130+
}
1131+
return 0;
1132+
}
1133+
1134+
template <GFXCORE_FAMILY gfxCoreFamily>
1135+
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelectCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrStateCopy,
1136+
const NEO::StreamProperties &cmdListRequired,
1137+
const NEO::StreamProperties &cmdListFinal,
1138+
bool &gpgpuEnabled) {
1139+
if (!this->pipelineSelectStateTracking) {
1140+
return 0;
1141+
}
1142+
1143+
size_t singlePipelineSelectSize = NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getHwInfo());
1144+
size_t estimatedSize = 0;
1145+
1146+
csrStateCopy.pipelineSelect.setProperties(cmdListRequired.pipelineSelect);
1147+
if (!gpgpuEnabled || csrStateCopy.pipelineSelect.isDirty()) {
1148+
estimatedSize += singlePipelineSelectSize;
1149+
gpgpuEnabled = true;
1150+
}
1151+
1152+
csrStateCopy.pipelineSelect.setProperties(cmdListFinal.pipelineSelect);
1153+
1154+
return estimatedSize;
1155+
}
11241156

1125-
auto &requiredStreamState = commandList->getRequiredStreamState();
1126-
auto &finalStreamState = commandList->getFinalStreamState();
1157+
template <GFXCORE_FAMILY gfxCoreFamily>
1158+
void CommandQueueHw<gfxCoreFamily>::programOneCmdListPipelineSelect(CommandList *commandList, NEO::LinearStream &commandStream, NEO::StreamProperties &csrState,
1159+
const NEO::StreamProperties &cmdListRequired, const NEO::StreamProperties &cmdListFinal) {
1160+
if (!this->pipelineSelectStateTracking) {
1161+
return;
1162+
}
1163+
1164+
bool preambleSet = csr->getPreambleSetFlag();
1165+
csrState.pipelineSelect.setProperties(cmdListRequired.pipelineSelect);
1166+
1167+
if (!preambleSet || csrState.pipelineSelect.isDirty()) {
1168+
NEO::PipelineSelectArgs args = {
1169+
!!csrState.pipelineSelect.systolicMode.value,
1170+
false,
1171+
false,
1172+
commandList->getSystolicModeSupport()};
1173+
1174+
NEO::PreambleHelper<GfxFamily>::programPipelineSelect(&commandStream, args, device->getHwInfo());
1175+
csr->setPreambleSetFlag(true);
1176+
}
11271177

1128-
streamProperties.pipelineSelect.setProperties(requiredStreamState.pipelineSelect);
1129-
streamProperties.pipelineSelect.setProperties(finalStreamState.pipelineSelect);
1178+
csrState.pipelineSelect.setProperties(cmdListFinal.pipelineSelect);
11301179
}
11311180

11321181
template <GFXCORE_FAMILY gfxCoreFamily>

level_zero/core/source/hw_helpers/l0_hw_helper.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,12 @@ bool L0HwHelper::enableMultiReturnPointCommandList() {
2323
return defaultValue;
2424
}
2525

26+
bool L0HwHelper::enablePipelineSelectStateTracking() {
27+
constexpr bool defaultValue = false;
28+
if (NEO::DebugManager.flags.EnablePipelineSelectTracking.get() != -1) {
29+
return !!NEO::DebugManager.flags.EnablePipelineSelectTracking.get();
30+
}
31+
return defaultValue;
32+
}
33+
2634
} // namespace L0

level_zero/core/source/hw_helpers/l0_hw_helper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class L0HwHelper {
3131
public:
3232
static L0HwHelper &get(GFXCORE_FAMILY gfxCore);
3333
static bool enableMultiReturnPointCommandList();
34+
static bool enablePipelineSelectStateTracking();
3435
virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0;
3536
virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0;
3637

level_zero/core/test/unit_tests/fixtures/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ set(L0_FIXTURES_SOURCES
1111
${CMAKE_CURRENT_SOURCE_DIR}/aub_csr_fixture.h
1212
${CMAKE_CURRENT_SOURCE_DIR}/cmdlist_fixture.cpp
1313
${CMAKE_CURRENT_SOURCE_DIR}/cmdlist_fixture.h
14+
${CMAKE_CURRENT_SOURCE_DIR}/cmdlist_fixture.inl
1415
${CMAKE_CURRENT_SOURCE_DIR}/device_fixture.h
1516
${CMAKE_CURRENT_SOURCE_DIR}/device_fixture.cpp
1617
${CMAKE_CURRENT_SOURCE_DIR}/host_pointer_manager_fixture.h

0 commit comments

Comments
 (0)