11#--- source.hlsl
2+ #define VALUE_SETS 2
3+ #define NUM_MASKS 4
4+ #define NUM_THREADS 4
5+
6+ struct MaskStruct {
7+ int mask[NUM_THREADS];
8+ };
9+
210StructuredBuffer<half4> In : register(t0);
3- RWStructuredBuffer<half4 > Out1 : register(u1); // test scalar
4- RWStructuredBuffer<half4 > Out2 : register(u2); // test half2
11+ RWStructuredBuffer<half > Out1 : register(u1); // test scalar
12+ RWStructuredBuffer<half2 > Out2 : register(u2); // test half2
513RWStructuredBuffer<half4> Out3 : register(u3); // test half3
614RWStructuredBuffer<half4> Out4 : register(u4); // test half4
715RWStructuredBuffer<half4> Out5 : register(u5); // constant folding
16+ StructuredBuffer<MaskStruct> Masks : register(t6);
17+
818
9- [numthreads(4 ,1,1)]
19+ [numthreads(NUM_THREADS ,1,1)]
1020void main(uint3 tid : SV_GroupThreadID)
1121{
12- half4 v = In[tid.x];
13-
14- // Mask per "active lane set": only >= N lanes contribute
15- half s1 = tid.x >= 3 ? WaveReadLaneFirst( v.x ) : 0;
16- half s2 = tid.x >= 2 ? WaveReadLaneFirst( v.x ) : 0;
17- half s3 = tid.x >= 1 ? WaveReadLaneFirst( v.x ) : 0;
18- half s4 = tid.x >= 0 ? WaveReadLaneFirst( v.x ) : 0;
19-
20- half2 v2_1 = tid.x >= 3 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
21- half2 v2_2 = tid.x >= 2 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
22- half2 v2_3 = tid.x >= 1 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
23- half2 v2_4 = tid.x >= 0 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
24-
25- half3 v3_1 = tid.x >= 3 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
26- half3 v3_2 = tid.x >= 2 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
27- half3 v3_3 = tid.x >= 1 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
28- half3 v3_4 = tid.x >= 0 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
29-
30- half4 v4_1 = tid.x >= 3 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
31- half4 v4_2 = tid.x >= 2 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
32- half4 v4_3 = tid.x >= 1 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
33- half4 v4_4 = tid.x >= 0 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
34-
35- half scalars[4] = { s4, s3, s2, s1 };
36- half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
37- half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
38- half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };
39-
40- Out1[tid.x].x = scalars[tid.x];
41- Out2[tid.x].xy = vec2s[tid.x];
42- Out3[tid.x].xyz = vec3s[tid.x];
43- Out4[tid.x] = vec4s[tid.x];
22+ for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { // run every mask pattern once per value set
23+ const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // index of the first element belonging to this value set
24+ for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
25+ half4 v = In[ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; // ValueSetOffset already includes ValueSet; do not scale it again
26+ const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // outputs use the same element layout as In
27+ if (Masks[MaskIdx].mask[tid.x]) { // threads whose mask entry is 0 skip the wave reads and write nothing
28+ Out1[OutIdx] = WaveReadLaneFirst( v.x );
29+ Out2[OutIdx].xy = WaveReadLaneFirst( v.xy );
30+ Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz );
31+ Out4[OutIdx] = WaveReadLaneFirst( v );
32+ }
33+ }
34+ }
4435
4536 // constant folding case
46- Out5[0] = WaveReadLaneFirst(half4(1,2,3,4));
37+ Out5[0] = WaveReadLaneFirst(half4(1,2,3,4));
4738}
4839
40+
4941//--- pipeline.yaml
5042
5143---
@@ -57,52 +49,184 @@ Buffers:
5749 - Name: In
5850 Format: Float16
5951 Stride: 8
60- # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
61- Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
52+ # 2 value sets
53+ # For each value set,
54+ # and for each specific one of the 4 thread masks in that value set,
55+ # and for each of the 4 threads in that thread mask,
56+ # there will be a unique set of 4 values, such that
57+ # none of the other threads in that thread mask share any values
58+ Data: [
59+ 0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
60+ 0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
61+ 0x3200, 0x3400, 0x3600, 0x3800,
62+ 0x3900, 0x3A00, 0x3B00, 0x3BC0,
63+ 0x2200, 0x2400, 0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
64+ 0x2C00, 0x2E00, 0x3000, 0x3200,
65+ 0x3400, 0x3600, 0x3800, 0x3900,
66+ 0x3A00, 0x3B00, 0x3BC0, 0x2000,
67+ 0x2400, 0x2800, 0x2A00, 0x2C00,
68+ 0x2E00, 0x3000, 0x3200, 0x3400,
69+ 0x3600, 0x3800, 0x3900, 0x3A00,
70+ 0x3B00, 0x3BC0, 0x2000, 0x2200,
71+ 0x2800, 0x2A00, 0x2C00, 0x2E00,
72+ 0x3000, 0x3200, 0x3400, 0x3600,
73+ 0x3800, 0x3900, 0x3A00, 0x3B00,
74+ 0x3BC0, 0x2000, 0x2200, 0x2400,
75+ 0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
76+ 0x3000, 0x2E00, 0x2C00, 0x2A00,
77+ 0x3800, 0x3600, 0x3400, 0x3200,
78+ 0x3BC0, 0x3B00, 0x3A00, 0x3900,
79+ 0x2A00, 0x2800, 0x2400, 0x2200,
80+ 0x3200, 0x3000, 0x2E00, 0x2C00,
81+ 0x3900, 0x3800, 0x3600, 0x3400,
82+ 0x2000, 0x3BC0, 0x3B00, 0x3A00,
83+ 0x2C00, 0x2A00, 0x2800, 0x2400,
84+ 0x3400, 0x3200, 0x3000, 0x2E00,
85+ 0x3A00, 0x3900, 0x3800, 0x3600,
86+ 0x2200, 0x2000, 0x3BC0, 0x3B00,
87+ 0x2E00, 0x2C00, 0x2A00, 0x2800,
88+ 0x3600, 0x3400, 0x3200, 0x3000,
89+ 0x3B00, 0x3A00, 0x3900, 0x3800,
90+ 0x2400, 0x2200, 0x2000, 0x3BC0 ]
91+
6292 - Name: Out1
6393 Format: Float16
64- Stride: 8
65- ZeroInitSize: 32
94+ Stride: 2
95+ # 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets
96+ FillSize: 64
6697 - Name: Out2
6798 Format: Float16
68- Stride: 8
69- ZeroInitSize: 32
99+ Stride: 4
100+ FillSize: 128
70101 - Name: Out3
71102 Format: Float16
72103 Stride: 8
73- ZeroInitSize: 32
104+ FillSize: 256
74105 - Name: Out4
75106 Format: Float16
76107 Stride: 8
77- ZeroInitSize: 32
108+ FillSize: 256
78109 - Name: Out5
79110 Format: Float16
80111 Stride: 8
81- ZeroInitSize: 8
112+ FillSize: 8
113+ - Name: Masks
114+ Format: Int32
115+ Stride: 16
116+ # 4 active mask sets for threads 0, 1, 2, 3:
117+ # 0 0 0 0
118+ # 1 1 1 1
119+ # 1 0 0 0
120+ # 0 1 1 0
121+ Data: [
122+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
82123 - Name: ExpectedOut1
83124 Format: Float16
84125 Stride: 8
85- # 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0
86- Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ]
126+ # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
127+ Data: [ 0x0, 0x0, 0x0, 0x0,
128+ 0x2200, 0x2200, 0x2200, 0x2200,
129+ 0x2400, 0x0, 0x0, 0x0,
130+ 0x0, 0x3000, 0x3000, 0x0,
131+ 0x0, 0x0, 0x0, 0x0,
132+ 0x2A00, 0x2A00, 0x2A00, 0x2A00,
133+ 0x2C00, 0x0, 0x0, 0x0,
134+ 0x0, 0x3600, 0x3600, 0x0]
87135 - Name: ExpectedOut2
88136 Format: Float16
89137 Stride: 8
90- # 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0
91- Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ]
138+ # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
139+ Data: [ 0x0, 0x0, 0x0, 0x0,
140+ 0x0, 0x0, 0x0, 0x0,
141+ 0x2200, 0x2400, 0x2200, 0x2400,
142+ 0x2200, 0x2400, 0x2200, 0x2400,
143+ 0x2400, 0x2800, 0x0, 0x0,
144+ 0x0, 0x0, 0x0, 0x0,
145+ 0x0, 0x0, 0x3000, 0x3200,
146+ 0x3000, 0x3200, 0x0, 0x0,
147+ 0x0, 0x0, 0x0, 0x0,
148+ 0x0, 0x0, 0x0, 0x0,
149+ 0x2A00, 0x2800, 0x2A00, 0x2800,
150+ 0x2A00, 0x2800, 0x2A00, 0x2800,
151+ 0x2C00, 0x2A00, 0x0, 0x0,
152+ 0x0, 0x0, 0x0, 0x0,
153+ 0x0, 0x0, 0x3600, 0x3400,
154+ 0x3600, 0x3400, 0x0, 0x0 ]
92155 - Name: ExpectedOut3
93156 Format: Float16
94157 Stride: 8
95- # 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0
96- Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ]
158+ # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
159+ # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
160+ Data: [ 0x0, 0x0, 0x0, 0x0,
161+ 0x0, 0x0, 0x0, 0x0,
162+ 0x0, 0x0, 0x0, 0x0,
163+ 0x0, 0x0, 0x0, 0x0,
164+ 0x2200, 0x2400, 0x2800, 0x0,
165+ 0x2200, 0x2400, 0x2800, 0x0,
166+ 0x2200, 0x2400, 0x2800, 0x0,
167+ 0x2200, 0x2400, 0x2800, 0x0,
168+ 0x2400, 0x2800, 0x2A00, 0x0,
169+ 0x0, 0x0, 0x0, 0x0,
170+ 0x0, 0x0, 0x0, 0x0,
171+ 0x0, 0x0, 0x0, 0x0,
172+ 0x0, 0x0, 0x0, 0x0,
173+ 0x3000, 0x3200, 0x3400, 0x0,
174+ 0x3000, 0x3200, 0x3400, 0x0,
175+ 0x0, 0x0, 0x0, 0x0,
176+ 0x0, 0x0, 0x0, 0x0,
177+ 0x0, 0x0, 0x0, 0x0,
178+ 0x0, 0x0, 0x0, 0x0,
179+ 0x0, 0x0, 0x0, 0x0,
180+ 0x2A00, 0x2800, 0x2400, 0x0,
181+ 0x2A00, 0x2800, 0x2400, 0x0,
182+ 0x2A00, 0x2800, 0x2400, 0x0,
183+ 0x2A00, 0x2800, 0x2400, 0x0,
184+ 0x2C00, 0x2A00, 0x2800, 0x0,
185+ 0x0, 0x0, 0x0, 0x0,
186+ 0x0, 0x0, 0x0, 0x0,
187+ 0x0, 0x0, 0x0, 0x0,
188+ 0x0, 0x0, 0x0, 0x0,
189+ 0x3600, 0x3400, 0x3200, 0x0,
190+ 0x3600, 0x3400, 0x3200, 0x0,
191+ 0x0, 0x0, 0x0, 0x0 ]
97192 - Name: ExpectedOut4
98193 Format: Float16
99194 Stride: 8
100- # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
101- Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
195+ Data: [ 0x0, 0x0, 0x0, 0x0,
196+ 0x0, 0x0, 0x0, 0x0,
197+ 0x0, 0x0, 0x0, 0x0,
198+ 0x0, 0x0, 0x0, 0x0,
199+ 0x2200, 0x2400, 0x2800, 0x2A00,
200+ 0x2200, 0x2400, 0x2800, 0x2A00,
201+ 0x2200, 0x2400, 0x2800, 0x2A00,
202+ 0x2200, 0x2400, 0x2800, 0x2A00,
203+ 0x2400, 0x2800, 0x2A00, 0x2C00,
204+ 0x0, 0x0, 0x0, 0x0,
205+ 0x0, 0x0, 0x0, 0x0,
206+ 0x0, 0x0, 0x0, 0x0,
207+ 0x0, 0x0, 0x0, 0x0,
208+ 0x3000, 0x3200, 0x3400, 0x3600,
209+ 0x3000, 0x3200, 0x3400, 0x3600,
210+ 0x0, 0x0, 0x0, 0x0,
211+ 0x0, 0x0, 0x0, 0x0,
212+ 0x0, 0x0, 0x0, 0x0,
213+ 0x0, 0x0, 0x0, 0x0,
214+ 0x0, 0x0, 0x0, 0x0,
215+ 0x2A00, 0x2800, 0x2400, 0x2200,
216+ 0x2A00, 0x2800, 0x2400, 0x2200,
217+ 0x2A00, 0x2800, 0x2400, 0x2200,
218+ 0x2A00, 0x2800, 0x2400, 0x2200,
219+ 0x2C00, 0x2A00, 0x2800, 0x2400,
220+ 0x0, 0x0, 0x0, 0x0,
221+ 0x0, 0x0, 0x0, 0x0,
222+ 0x0, 0x0, 0x0, 0x0,
223+ 0x0, 0x0, 0x0, 0x0,
224+ 0x3600, 0x3400, 0x3200, 0x3000,
225+ 0x3600, 0x3400, 0x3200, 0x3000,
226+ 0x0, 0x0, 0x0, 0x0 ]
102227 - Name: ExpectedOut5
103228 Format: Float16
104229 Stride: 8
105- # 1, 2, 3, 4
106230 Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ]
107231Results:
108232 - Result: ExpectedOut1
@@ -125,7 +249,6 @@ Results:
125249 Rule: BufferExact
126250 Actual: Out5
127251 Expected: ExpectedOut5
128-
129252DescriptorSets:
130253 - Resources:
131254 - Name: In
@@ -170,19 +293,22 @@ DescriptorSets:
170293 Space: 0
171294 VulkanBinding:
172295 Binding: 5
296+ - Name: Masks
297+ Kind: StructuredBuffer
298+ DirectXBinding:
299+ Register: 6
300+ Space: 0
301+ VulkanBinding:
302+ Binding: 6
303+
173304...
174305#--- end
175306
176- # REQUIRES: Half
177-
178- # Bug https://github.com/llvm/offload-test-suite/issues/393
179- # XFAIL: Metal
180-
181307# Bug https://github.com/llvm/llvm-project/issues/156775
182308# XFAIL: Clang
183309
184- # Bug https://github.com/llvm/offload-test-suite/issues/433
185- # XFAIL: DirectX-WARP
310+ # Bug https://github.com/llvm/offload-test-suite/issues/393
311+ # XFAIL: Metal
186312
187313# RUN: split-file %s %t
188314# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
0 commit comments