@@ -146,7 +146,18 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
146146 this ->csr ->programHardwareContext (child);
147147 this ->makeSbaTrackingBufferResidentIfL0DebuggerEnabled (ctx.isDebugEnabled );
148148
149- this ->programPipelineSelectIfGpgpuDisabled (child);
149+ auto &csrStateProperties = csr->getStreamProperties ();
150+ if (!this ->pipelineSelectStateTracking ) {
151+ this ->programPipelineSelectIfGpgpuDisabled (child);
152+ } else {
153+ // Setting systolic/pipeline select here for 1st command list is to preserve dispatch order of hw commands
154+ auto commandList = CommandList::fromHandle (phCommandLists[0 ]);
155+ auto &requiredStreamState = commandList->getRequiredStreamState ();
156+ // Pass the cmdlist required state as both the required and the final state, so the csr state does not transition to final here
157+ // Keeping the required state in csr (csr state stays not dirty) means the main loop will not dispatch the 1st command list pipeline select/systolic again
158+ // The csr state transitions to the final state of the 1st command list in the main loop
159+ this ->programOneCmdListPipelineSelect (commandList, child, csrStateProperties, requiredStreamState, requiredStreamState);
160+ }
150161 this ->programCommandQueueDebugCmdsForSourceLevelOrL0DebuggerIfEnabled (ctx.isDebugEnabled , child);
151162 this ->programStateBaseAddressWithGsbaIfDirty (ctx, phCommandLists[0 ], child);
152163 this ->programCsrBaseAddressIfPreemptionModeInitial (ctx.isPreemptionModeInitial , child);
@@ -157,7 +168,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
157168
158169 this ->programActivePartitionConfig (ctx.isProgramActivePartitionConfigRequired , child);
159170 this ->encodeKernelArgsBufferAndMakeItResident ();
160- auto &csrStateProperties = csr-> getStreamProperties ();
171+
161172 bool shouldProgramVfe = this ->csr ->getLogicalStateHelper () && ctx.frontEndStateDirty ;
162173 this ->programFrontEndAndClearDirtyFlag (shouldProgramVfe, ctx, child, csrStateProperties);
163174
@@ -171,7 +182,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
171182 auto &finalStreamState = commandList->getFinalStreamState ();
172183
173184 this ->updateOneCmdListPreemptionModeAndCtxStatePreemption (ctx, commandList->getCommandListPreemptionMode (), child);
174- this ->updatePipelineSelectState (commandList);
185+ this ->programOneCmdListPipelineSelect (commandList, child, csrStateProperties, requiredStreamState, finalStreamState );
175186 this ->programOneCmdListFrontEndIfDirty (ctx, child, csrStateProperties, requiredStreamState, finalStreamState);
176187
177188 this ->patchCommands (*commandList, this ->csr ->getScratchSpaceController ()->getScratchPatchAddress ());
@@ -388,11 +399,6 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandL
388399 return estimatedSize;
389400}
390401
391- template <GFXCORE_FAMILY gfxCoreFamily>
392- size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelect() {
393- return NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect (device->getHwInfo ());
394- }
395-
396402template <GFXCORE_FAMILY gfxCoreFamily>
397403void CommandQueueHw<gfxCoreFamily>::programPipelineSelectIfGpgpuDisabled(NEO::LinearStream &cmdStream) {
398404 bool gpgpuEnabled = this ->csr ->getPreambleSetFlag ();
@@ -647,24 +653,22 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
647653 uint32_t numCommandLists) {
648654
649655 size_t linearStreamSizeEstimate = 0u ;
650- bool gpgpuEnabled = csr->getPreambleSetFlag ();
651-
652- if (!gpgpuEnabled) {
653- linearStreamSizeEstimate += estimatePipelineSelect ();
654- }
655656
656657 linearStreamSizeEstimate += estimateFrontEndCmdSize (ctx.frontEndStateDirty );
658+ linearStreamSizeEstimate += estimatePipelineSelectCmdSize ();
657659
658- if (frontEndTrackingEnabled ()) {
660+ if (this -> pipelineSelectStateTracking || frontEndTrackingEnabled ()) {
659661 bool frontEndStateDirtyCopy = ctx.frontEndStateDirty ;
660662 auto streamPropertiesCopy = csr->getStreamProperties ();
663+ bool gpgpuEnabledCopy = csr->getPreambleSetFlag ();
661664 for (uint32_t i = 0 ; i < numCommandLists; i++) {
662665 auto cmdList = CommandList::fromHandle (phCommandLists[i]);
663666 auto &requiredStreamState = cmdList->getRequiredStreamState ();
664667 auto &finalStreamState = cmdList->getFinalStreamState ();
665668
666669 linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists (frontEndStateDirtyCopy, ctx.engineInstanced , cmdList,
667670 streamPropertiesCopy, requiredStreamState, finalStreamState);
671+ linearStreamSizeEstimate += estimatePipelineSelectCmdSizeForMultipleCommandLists (streamPropertiesCopy, requiredStreamState, finalStreamState, gpgpuEnabledCopy);
668672 }
669673 }
670674
@@ -1119,14 +1123,59 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::handleSubmissionAndCompletionResults(
11191123}
11201124
// estimatePipelineSelectCmdSize: returns the number of command-stream bytes to reserve for the
// queue-level pipeline select command.
// - With pipelineSelectStateTracking disabled: the size reported by
//   PreambleHelper::getCmdSizeForPipelineSelect when the csr preamble flag is not set, otherwise 0.
// - With pipelineSelectStateTracking enabled: always 0; the per-command-list cost is computed by
//   estimatePipelineSelectCmdSizeForMultipleCommandLists instead.
11211125template <GFXCORE_FAMILY gfxCoreFamily>
1122- void CommandQueueHw<gfxCoreFamily>::updatePipelineSelectState(CommandList *commandList) {
1123- auto &streamProperties = this ->csr ->getStreamProperties ();
1126+ size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelectCmdSize() {
1127+ if (!this ->pipelineSelectStateTracking ) {
1128+ bool gpgpuEnabled = csr->getPreambleSetFlag ();
// The negated bool promotes to 0 or 1, so the product is the command size only when the preamble flag is not set.
1129+ return !gpgpuEnabled * NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect (device->getHwInfo ());
1130+ }
1131+ return 0 ;
1132+ }
1133+
// estimatePipelineSelectCmdSizeForMultipleCommandLists: size estimate for one command list's
// pipeline select command; follows the same sequence of steps as programOneCmdListPipelineSelect.
//
// csrStateCopy    - stream properties updated in place: first with the command list's required
//                   pipeline select state, then with its final state. The visible caller passes a
//                   copy of the csr stream properties, so the real csr state is not modified.
// cmdListRequired - state the command list requires at its start.
// cmdListFinal    - state the command list leaves behind.
// gpgpuEnabled    - in/out: read to decide whether a command is needed, and set to true once one
//                   is accounted for. The visible caller seeds it from csr->getPreambleSetFlag().
//
// Returns 0 when pipelineSelectStateTracking is disabled; otherwise either 0 or the size of a
// single pipeline select command.
1134+ template <GFXCORE_FAMILY gfxCoreFamily>
1135+ size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelectCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrStateCopy,
1136+ const NEO::StreamProperties &cmdListRequired,
1137+ const NEO::StreamProperties &cmdListFinal,
1138+ bool &gpgpuEnabled) {
1139+ if (!this ->pipelineSelectStateTracking ) {
1140+ return 0 ;
1141+ }
1142+
1143+ size_t singlePipelineSelectSize = NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect (device->getHwInfo ());
1144+ size_t estimatedSize = 0 ;
1145+
// Apply the required state first; isDirty() presumably reports whether this call changed a tracked value - confirm in StreamProperties.
1146+ csrStateCopy.pipelineSelect .setProperties (cmdListRequired.pipelineSelect );
// One command is counted when no pipeline select has been accounted for yet or the required state is dirty.
1147+ if (!gpgpuEnabled || csrStateCopy.pipelineSelect .isDirty ()) {
1148+ estimatedSize += singlePipelineSelectSize;
1149+ gpgpuEnabled = true ;
1150+ }
1151+
// Advance the tracked copy to the command list's final state so the next call starts from it.
1152+ csrStateCopy.pipelineSelect .setProperties (cmdListFinal.pipelineSelect );
1153+
1154+ return estimatedSize;
1155+ }
11241156
1125- auto &requiredStreamState = commandList->getRequiredStreamState ();
1126- auto &finalStreamState = commandList->getFinalStreamState ();
// programOneCmdListPipelineSelect: programs a pipeline select command for one command list when
// needed and updates the tracked csr pipeline select state.
//
// commandList     - command list being submitted; queried for getSystolicModeSupport().
// commandStream   - linear stream that receives the pipeline select command.
// csrState        - csr stream properties, updated in place (required state, then final state).
// cmdListRequired - state the command list requires at its start.
// cmdListFinal    - state stored in csrState on exit. A caller may pass the same object as
//                   cmdListRequired (executeCommandListsRegular does so for the first command
//                   list), which leaves csrState at the required state.
//
// Does nothing when pipelineSelectStateTracking is disabled.
1157+ template <GFXCORE_FAMILY gfxCoreFamily>
1158+ void CommandQueueHw<gfxCoreFamily>::programOneCmdListPipelineSelect(CommandList *commandList, NEO::LinearStream &commandStream, NEO::StreamProperties &csrState,
1159+ const NEO::StreamProperties &cmdListRequired, const NEO::StreamProperties &cmdListFinal) {
1160+ if (!this ->pipelineSelectStateTracking ) {
1161+ return ;
1162+ }
1163+
// Read the preamble flag before it may be set below.
1164+ bool preambleSet = csr->getPreambleSetFlag ();
1165+ csrState.pipelineSelect .setProperties (cmdListRequired.pipelineSelect );
1166+
// Program the command when the preamble flag is not set yet or the required state is dirty.
1167+ if (!preambleSet || csrState.pipelineSelect .isDirty ()) {
// First field comes from the tracked systolic mode value converted to bool; the last comes from the command list.
// NOTE(review): the meaning of the two `false` fields is not visible here - confirm against NEO::PipelineSelectArgs.
1168+ NEO::PipelineSelectArgs args = {
1169+ !!csrState.pipelineSelect .systolicMode .value ,
1170+ false ,
1171+ false ,
1172+ commandList->getSystolicModeSupport ()};
1173+
1174+ NEO::PreambleHelper<GfxFamily>::programPipelineSelect (&commandStream, args, device->getHwInfo ());
1175+ csr->setPreambleSetFlag (true );
1176+ }
11271177
1128- streamProperties.pipelineSelect .setProperties (requiredStreamState.pipelineSelect );
1129- streamProperties.pipelineSelect .setProperties (finalStreamState.pipelineSelect );
// Record the command list's final state as the current csr state.
1178+ csrState.pipelineSelect .setProperties (cmdListFinal.pipelineSelect );
11301179}
11311180
11321181template <GFXCORE_FAMILY gfxCoreFamily>
0 commit comments