Skip to content

Commit 2632b21

Browse files
Organize dispatching of thread data for better reuse of code
Change-Id: I8c156f8b5a50f6fa4dfb5218cdadb2840ff556eb
1 parent: 95cfb15 | commit: 2632b21

File tree

12 files changed: +174 additions, -155 deletions

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
#!groovy
22
neoDependenciesRev='803657-1097'
33
strategy='EQUAL'
4-
allowedCD=274
4+
allowedCD=273
55
allowedF=4

runtime/command_queue/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
3636
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
3737
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h
3838
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl
39+
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl
3940
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
4041
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
4142
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl

runtime/command_queue/gpgpu_walker.h

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,6 @@
2626

2727
namespace OCLRT {
2828

29-
using WALKER_HANDLE = void *;
30-
3129
template <typename GfxFamily>
3230
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
3331

@@ -129,7 +127,7 @@ class GpgpuWalkerHelper {
129127
static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
130128

131129
static size_t setGpgpuWalkerThreadData(
132-
WALKER_HANDLE pCmdData,
130+
WALKER_TYPE<GfxFamily> *walkerCmd,
133131
const size_t globalOffsets[3],
134132
const size_t startWorkGroups[3],
135133
const size_t numWorkGroups[3],
@@ -187,7 +185,7 @@ class GpgpuWalkerHelper {
187185

188186
static void setupTimestampPacket(
189187
LinearStream *cmdStream,
190-
WALKER_HANDLE walkerHandle,
188+
WALKER_TYPE<GfxFamily> *walkerCmd,
191189
TimestampPacket *timestampPacket,
192190
TimestampPacket::WriteOperationType writeOperationType);
193191

runtime/command_queue/gpgpu_walker.inl

Lines changed: 20 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -101,24 +101,22 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
101101

102102
template <typename GfxFamily>
103103
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
104-
WALKER_HANDLE pCmdData,
104+
WALKER_TYPE<GfxFamily> *walkerCmd,
105105
const size_t globalOffsets[3],
106106
const size_t startWorkGroups[3],
107107
const size_t numWorkGroups[3],
108108
const size_t localWorkSizesIn[3],
109109
uint32_t simd,
110110
uint32_t workDim,
111111
bool localIdsGeneration) {
112-
WALKER_TYPE<GfxFamily> *pCmd = static_cast<WALKER_TYPE<GfxFamily> *>(pCmdData);
113-
114112
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
115113

116114
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
117-
pCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
115+
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
118116

119-
pCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
120-
pCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
121-
pCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
117+
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
118+
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
119+
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
122120

123121
// compute executionMask - to tell which SIMD lines are active within thread
124122
auto remainderSimdLanes = localWorkSize & (simd - 1);
@@ -128,13 +126,13 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
128126

129127
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
130128

131-
pCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
132-
pCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
133-
pCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
129+
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
130+
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
131+
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
134132

135-
pCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
136-
pCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
137-
pCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
133+
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
134+
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
135+
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
138136

139137
return localWorkSize;
140138
}
@@ -432,7 +430,7 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
432430
template <typename GfxFamily>
433431
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
434432
LinearStream *cmdStream,
435-
WALKER_HANDLE walkerHandle,
433+
WALKER_TYPE<GfxFamily> *walkerCmd,
436434
TimestampPacket *timestampPacket,
437435
TimestampPacket::WriteOperationType writeOperationType) {
438436

@@ -523,8 +521,12 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
523521
indirectObjectHeap.getSpace(curbeOffset);
524522
ioh = &indirectObjectHeap;
525523

526-
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isDispatchForLocalIdsGeneration(1, globalWorkSizes, localWorkSizes);
527-
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
524+
// Program the walker. Invokes execution so all state should already be programmed
525+
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
526+
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
527+
528+
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
529+
KernelCommandsHelper<GfxFamily>::sendIndirectState(
528530
*commandStream,
529531
*dsh,
530532
*ioh,
@@ -535,37 +537,16 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
535537
offsetInterfaceDescriptorTable,
536538
interfaceDescriptorIndex,
537539
preemptionMode,
540+
pGpGpuWalkerCmd,
538541
nullptr,
539542
localIdsGeneration);
540543

541544
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
542545
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
543546

544-
// Program the walker. Invokes execution so all state should already be programmed
545-
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
546-
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
547-
548547
size_t globalOffsets[3] = {0, 0, 0};
549548
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
550-
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
551-
552-
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
553-
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
554-
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);
555-
556-
auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
557-
DEBUG_BREAK_IF(nullptr == threadPayload);
558-
559-
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
560-
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
561-
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
562-
563-
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
564-
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
565-
566-
auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
567-
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
568-
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
549+
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
569550

570551
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
571552
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);

runtime/command_queue/hardware_interface.h

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,8 @@ struct MultiDispatchInfo;
2626
template <class T>
2727
struct TagNode;
2828

29-
using WALKER_HANDLE = void *;
29+
template <typename GfxFamily>
30+
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
3031

3132
template <typename GfxFamily>
3233
class HardwareInterface {
@@ -57,12 +58,7 @@ class HardwareInterface {
5758
LinearStream *commandStream);
5859

5960
static INTERFACE_DESCRIPTOR_DATA *obtainInterfaceDescriptorData(
60-
WALKER_HANDLE pCmdData);
61-
62-
static void setOffsetCrossThreadData(
63-
WALKER_HANDLE pCmdData,
64-
size_t &offsetCrossThreadData,
65-
uint32_t &interfaceDescriptorIndex);
61+
WALKER_TYPE<GfxFamily> *walkerCmd);
6662

6763
static void dispatchWorkarounds(
6864
LinearStream *commandStream,
@@ -83,6 +79,9 @@ class HardwareInterface {
8379
HwPerfCounter *hwPerfCounter,
8480
LinearStream *commandStream,
8581
CommandQueue &commandQueue);
82+
83+
static WALKER_TYPE<GfxFamily> *allocateWalkerSpace(LinearStream &commandStream,
84+
const Kernel &kernel);
8685
};
8786

8887
} // namespace OCLRT

runtime/command_queue/hardware_interface.inl

Lines changed: 8 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -188,18 +188,17 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
188188
}
189189

190190
// Program the walker. Invokes execution so all state should already be programmed
191-
auto pWalkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream->getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
192-
*pWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
191+
auto walkerCmd = allocateWalkerSpace(*commandStream, kernel);
193192

194193
if (setupTimestampPacket) {
195-
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, pWalkerCmd, currentTimestampPacket,
194+
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, walkerCmd, currentTimestampPacket,
196195
TimestampPacket::WriteOperationType::AfterWalker);
197196
}
198197

199-
auto idd = obtainInterfaceDescriptorData(pWalkerCmd);
198+
auto idd = obtainInterfaceDescriptorData(walkerCmd);
200199

201-
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isDispatchForLocalIdsGeneration(dim, globalWorkSizes, localWorkSizes);
202-
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
200+
bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes);
201+
KernelCommandsHelper<GfxFamily>::sendIndirectState(
203202
*commandStream,
204203
*dsh,
205204
*ioh,
@@ -210,35 +209,15 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
210209
offsetInterfaceDescriptorTable,
211210
interfaceDescriptorIndex,
212211
preemptionMode,
212+
walkerCmd,
213213
idd,
214214
localIdsGeneration);
215215

216216
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
217217
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
218218
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
219-
auto localWorkSize = GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pWalkerCmd, globalOffsets, startWorkGroups,
220-
numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration);
221-
222-
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
223-
setOffsetCrossThreadData(pWalkerCmd, offsetCrossThreadData, interfaceDescriptorIndex);
224-
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
225-
226-
size_t sizePerThreadDataTotal = 0;
227-
if (localIdsGeneration) {
228-
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
229-
DEBUG_BREAK_IF(nullptr == threadPayload);
230-
231-
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
232-
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
233-
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
234-
235-
sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
236-
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
237-
}
238-
239-
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
240-
WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
241-
pWalkerCmd->setIndirectDataLength(indirectDataLength);
219+
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
220+
numWorkGroups, localWorkSizes, simd, dim, localIdsGeneration);
242221

243222
dispatchWorkarounds(commandStream, commandQueue, kernel, false);
244223
currentDispatchIndex++;

runtime/command_queue/hardware_interface_base.inl

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -33,22 +33,10 @@ inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
3333
template <typename GfxFamily>
3434
inline typename HardwareInterface<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *
3535
HardwareInterface<GfxFamily>::obtainInterfaceDescriptorData(
36-
WALKER_HANDLE pCmdData) {
37-
36+
WALKER_TYPE<GfxFamily> *walkerCmd) {
3837
return nullptr;
3938
}
4039

41-
template <typename GfxFamily>
42-
inline void HardwareInterface<GfxFamily>::setOffsetCrossThreadData(
43-
WALKER_HANDLE pCmdData,
44-
size_t &offsetCrossThreadData,
45-
uint32_t &interfaceDescriptorIndex) {
46-
47-
WALKER_TYPE<GfxFamily> *pCmd = static_cast<WALKER_TYPE<GfxFamily> *>(pCmdData);
48-
pCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
49-
pCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
50-
}
51-
5240
template <typename GfxFamily>
5341
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
5442
LinearStream *commandStream,
@@ -103,4 +91,12 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
10391
}
10492
}
10593

94+
template <typename GfxFamily>
95+
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
96+
const Kernel &kernel) {
97+
auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
98+
*walkerCmd = GfxFamily::cmdInitGpgpuWalker;
99+
return walkerCmd;
100+
}
101+
106102
} // namespace OCLRT

runtime/helpers/basic_math.h

Lines changed: 12 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,8 @@
11
/*
2-
* Copyright (c) 2017, Intel Corporation
2+
* Copyright (C) 2017-2018 Intel Corporation
33
*
4-
* Permission is hereby granted, free of charge, to any person obtaining a
5-
* copy of this software and associated documentation files (the "Software"),
6-
* to deal in the Software without restriction, including without limitation
7-
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8-
* and/or sell copies of the Software, and to permit persons to whom the
9-
* Software is furnished to do so, subject to the following conditions:
4+
* SPDX-License-Identifier: MIT
105
*
11-
* The above copyright notice and this permission notice shall be included
12-
* in all copies or substantial portions of the Software.
13-
*
14-
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15-
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16-
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17-
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18-
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19-
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20-
* OTHER DEALINGS IN THE SOFTWARE.
216
*/
227

238
#pragma once
@@ -164,5 +149,15 @@ inline size_t computeTotalElementsCount(const Vec3<size_t> &inputVector) {
164149
return xDim * yDim * zDim;
165150
}
166151

152+
template <typename T>
153+
bool isPow2(T val) {
154+
if (val != 0) {
155+
if ((val & (val - 1)) == 0) {
156+
return true;
157+
}
158+
}
159+
return false;
160+
}
161+
167162
} // namespace Math
168163
} // namespace OCLRT

runtime/helpers/kernel_commands.h

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,9 @@ class IndirectHeap;
2424
struct CrossThreadInfo;
2525
struct MultiDispatchInfo;
2626

27+
template <typename GfxFamily>
28+
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
29+
2730
template <typename GfxFamily>
2831
struct KernelCommandsHelper : public PerThreadDataHelper {
2932
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
@@ -88,8 +91,9 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
8891
uint32_t simd,
8992
const size_t localWorkSize[3],
9093
const uint64_t offsetInterfaceDescriptorTable,
91-
const uint32_t interfaceDescriptorIndex,
94+
uint32_t &interfaceDescriptorIndex,
9295
PreemptionMode preemptionMode,
96+
WALKER_TYPE<GfxFamily> *walkerCmd,
9397
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
9498
bool localIdsGeneration);
9599

@@ -155,6 +159,6 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
155159

156160
static bool doBindingTablePrefetch();
157161

158-
static bool isDispatchForLocalIdsGeneration(uint32_t workDim, size_t *gws, size_t *lws);
162+
static bool isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws);
159163
};
160164
} // namespace OCLRT

0 commit comments

Comments (0)