@@ -101,24 +101,22 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
101101
102102template <typename GfxFamily>
103103inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
104- WALKER_HANDLE pCmdData ,
104+ WALKER_TYPE<GfxFamily> *walkerCmd ,
105105 const size_t globalOffsets[3 ],
106106 const size_t startWorkGroups[3 ],
107107 const size_t numWorkGroups[3 ],
108108 const size_t localWorkSizesIn[3 ],
109109 uint32_t simd,
110110 uint32_t workDim,
111111 bool localIdsGeneration) {
112- WALKER_TYPE<GfxFamily> *pCmd = static_cast <WALKER_TYPE<GfxFamily> *>(pCmdData);
113-
114112 auto localWorkSize = localWorkSizesIn[0 ] * localWorkSizesIn[1 ] * localWorkSizesIn[2 ];
115113
116114 auto threadsPerWorkGroup = getThreadsPerWG (simd, localWorkSize);
117- pCmd ->setThreadWidthCounterMaximum (static_cast <uint32_t >(threadsPerWorkGroup));
115+ walkerCmd ->setThreadWidthCounterMaximum (static_cast <uint32_t >(threadsPerWorkGroup));
118116
119- pCmd ->setThreadGroupIdXDimension (static_cast <uint32_t >(numWorkGroups[0 ]));
120- pCmd ->setThreadGroupIdYDimension (static_cast <uint32_t >(numWorkGroups[1 ]));
121- pCmd ->setThreadGroupIdZDimension (static_cast <uint32_t >(numWorkGroups[2 ]));
117+ walkerCmd ->setThreadGroupIdXDimension (static_cast <uint32_t >(numWorkGroups[0 ]));
118+ walkerCmd ->setThreadGroupIdYDimension (static_cast <uint32_t >(numWorkGroups[1 ]));
119+ walkerCmd ->setThreadGroupIdZDimension (static_cast <uint32_t >(numWorkGroups[2 ]));
122120
123121 // compute executionMask - to tell which SIMD lines are active within thread
124122 auto remainderSimdLanes = localWorkSize & (simd - 1 );
@@ -128,13 +126,13 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
128126
129127 using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
130128
131- pCmd ->setRightExecutionMask (static_cast <uint32_t >(executionMask));
132- pCmd ->setBottomExecutionMask (static_cast <uint32_t >(0xffffffff ));
133- pCmd ->setSimdSize (static_cast <SIMD_SIZE>(simd >> 4 ));
129+ walkerCmd ->setRightExecutionMask (static_cast <uint32_t >(executionMask));
130+ walkerCmd ->setBottomExecutionMask (static_cast <uint32_t >(0xffffffff ));
131+ walkerCmd ->setSimdSize (static_cast <SIMD_SIZE>(simd >> 4 ));
134132
135- pCmd ->setThreadGroupIdStartingX (static_cast <uint32_t >(startWorkGroups[0 ]));
136- pCmd ->setThreadGroupIdStartingY (static_cast <uint32_t >(startWorkGroups[1 ]));
137- pCmd ->setThreadGroupIdStartingResumeZ (static_cast <uint32_t >(startWorkGroups[2 ]));
133+ walkerCmd ->setThreadGroupIdStartingX (static_cast <uint32_t >(startWorkGroups[0 ]));
134+ walkerCmd ->setThreadGroupIdStartingY (static_cast <uint32_t >(startWorkGroups[1 ]));
135+ walkerCmd ->setThreadGroupIdStartingResumeZ (static_cast <uint32_t >(startWorkGroups[2 ]));
138136
139137 return localWorkSize;
140138}
@@ -432,7 +430,7 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
432430template <typename GfxFamily>
433431void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
434432 LinearStream *cmdStream,
435- WALKER_HANDLE walkerHandle ,
433+ WALKER_TYPE<GfxFamily> *walkerCmd ,
436434 TimestampPacket *timestampPacket,
437435 TimestampPacket::WriteOperationType writeOperationType) {
438436
@@ -523,8 +521,12 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
523521 indirectObjectHeap.getSpace (curbeOffset);
524522 ioh = &indirectObjectHeap;
525523
526- bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isDispatchForLocalIdsGeneration (1 , globalWorkSizes, localWorkSizes);
527- auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState (
524+ // Program the walker. Invokes execution so all state should already be programmed
525+ auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace (sizeof (GPGPU_WALKER));
526+ *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
527+
528+ bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired (1 , globalWorkSizes, localWorkSizes);
529+ KernelCommandsHelper<GfxFamily>::sendIndirectState (
528530 *commandStream,
529531 *dsh,
530532 *ioh,
@@ -535,37 +537,16 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
535537 offsetInterfaceDescriptorTable,
536538 interfaceDescriptorIndex,
537539 preemptionMode,
540+ pGpGpuWalkerCmd,
538541 nullptr ,
539542 localIdsGeneration);
540543
541544 // Implement enabling special WA DisableLSQCROPERFforOCL if needed
542545 GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, true );
543546
544- // Program the walker. Invokes execution so all state should already be programmed
545- auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace (sizeof (GPGPU_WALKER));
546- *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
547-
548547 size_t globalOffsets[3 ] = {0 , 0 , 0 };
549548 size_t workGroups[3 ] = {(scheduler.getGws () / scheduler.getLws ()), 1 , 1 };
550- auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData (pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1 , localIdsGeneration);
551-
552- pGpGpuWalkerCmd->setIndirectDataStartAddress ((uint32_t )offsetCrossThreadData);
553- DEBUG_BREAK_IF (offsetCrossThreadData % 64 != 0 );
554- pGpGpuWalkerCmd->setInterfaceDescriptorOffset (interfaceDescriptorIndex);
555-
556- auto threadPayload = scheduler.getKernelInfo ().patchInfo .threadPayload ;
557- DEBUG_BREAK_IF (nullptr == threadPayload);
558-
559- auto numChannels = PerThreadDataHelper::getNumLocalIdChannels (*threadPayload);
560- auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread (simd, numChannels);
561- localIdSizePerThread = std::max (localIdSizePerThread, sizeof (GRF));
562-
563- auto sizePerThreadDataTotal = getThreadsPerWG (simd, localWorkSize) * localIdSizePerThread;
564- DEBUG_BREAK_IF (sizePerThreadDataTotal == 0 ); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
565-
566- auto sizeCrossThreadData = scheduler.getCrossThreadDataSize ();
567- auto IndirectDataLength = alignUp ((uint32_t )(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
568- pGpGpuWalkerCmd->setIndirectDataLength (IndirectDataLength);
549+ GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData (pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1 , localIdsGeneration);
569550
570551 // Implement disabling special WA DisableLSQCROPERFforOCL if needed
571552 GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, false );
0 commit comments