Skip to content

Commit 179162a

Browse files
committed
update wavereadlanefirst tests according to waveactivemax format
1 parent 25f7925 commit 179162a

File tree

6 files changed

+1121
-778
lines changed

6 files changed

+1121
-778
lines changed

test/WaveOps/WaveReadLaneFirst.fp16.test

Lines changed: 188 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,43 @@
11
#--- source.hlsl
2+
#define VALUE_SETS 2
3+
#define NUM_MASKS 4
4+
#define NUM_THREADS 4
5+
6+
struct MaskStruct {
7+
int mask[NUM_THREADS];
8+
};
9+
210
StructuredBuffer<half4> In : register(t0);
3-
RWStructuredBuffer<half4> Out1 : register(u1); // test scalar
4-
RWStructuredBuffer<half4> Out2 : register(u2); // test half2
11+
RWStructuredBuffer<half> Out1 : register(u1); // test scalar
12+
RWStructuredBuffer<half2> Out2 : register(u2); // test half2
513
RWStructuredBuffer<half4> Out3 : register(u3); // test half3
614
RWStructuredBuffer<half4> Out4 : register(u4); // test half4
715
RWStructuredBuffer<half4> Out5 : register(u5); // constant folding
16+
StructuredBuffer<MaskStruct> Masks : register(t6);
17+
818

9-
[numthreads(4,1,1)]
19+
[numthreads(NUM_THREADS,1,1)]
1020
void main(uint3 tid : SV_GroupThreadID)
1121
{
12-
half4 v = In[tid.x];
13-
14-
// Mask per "active lane set": only >= N lanes contribute
15-
half s1 = tid.x >= 3 ? WaveReadLaneFirst( v.x ) : 0;
16-
half s2 = tid.x >= 2 ? WaveReadLaneFirst( v.x ) : 0;
17-
half s3 = tid.x >= 1 ? WaveReadLaneFirst( v.x ) : 0;
18-
half s4 = tid.x >= 0 ? WaveReadLaneFirst( v.x ) : 0;
19-
20-
half2 v2_1 = tid.x >= 3 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
21-
half2 v2_2 = tid.x >= 2 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
22-
half2 v2_3 = tid.x >= 1 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
23-
half2 v2_4 = tid.x >= 0 ? WaveReadLaneFirst( v.xy ) : half2(0,0);
24-
25-
half3 v3_1 = tid.x >= 3 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
26-
half3 v3_2 = tid.x >= 2 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
27-
half3 v3_3 = tid.x >= 1 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
28-
half3 v3_4 = tid.x >= 0 ? WaveReadLaneFirst( v.xyz ) : half3(0,0,0);
29-
30-
half4 v4_1 = tid.x >= 3 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
31-
half4 v4_2 = tid.x >= 2 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
32-
half4 v4_3 = tid.x >= 1 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
33-
half4 v4_4 = tid.x >= 0 ? WaveReadLaneFirst( v ) : half4(0,0,0,0);
34-
35-
half scalars[4] = { s4, s3, s2, s1 };
36-
half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
37-
half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
38-
half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };
39-
40-
Out1[tid.x].x = scalars[tid.x];
41-
Out2[tid.x].xy = vec2s[tid.x];
42-
Out3[tid.x].xyz = vec3s[tid.x];
43-
Out4[tid.x] = vec4s[tid.x];
22+
for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
23+
const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
24+
for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
25+
half4 v = In[ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; // ValueSetOffset already includes the ValueSet factor; same flat index as OutIdx
26+
const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x;
27+
if (Masks[MaskIdx].mask[tid.x]) {
28+
Out1[OutIdx] = WaveReadLaneFirst( v.x );
29+
Out2[OutIdx].xy = WaveReadLaneFirst( v.xy );
30+
Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz );
31+
Out4[OutIdx] = WaveReadLaneFirst( v );
32+
}
33+
}
34+
}
4435

4536
// constant folding case
46-
Out5[0] = WaveReadLaneFirst(half4(1,2,3,4));
37+
Out5[0] = WaveReadLaneFirst(half4(1,2,3,4));
4738
}
4839

40+
4941
//--- pipeline.yaml
5042

5143
---
@@ -57,52 +49,184 @@ Buffers:
5749
- Name: In
5850
Format: Float16
5951
Stride: 8
60-
# 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
61-
Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
52+
# 2 value sets
53+
# For each value set,
54+
# and for each specific one of the 4 thread masks in that value set,
55+
# and for each of the 4 threads in that thread mask,
56+
# there will be a unique set of 4 values, such that
57+
# none of the other threads in that thread mask share any values
58+
Data: [
59+
0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
60+
0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
61+
0x3200, 0x3400, 0x3600, 0x3800,
62+
0x3900, 0x3A00, 0x3B00, 0x3BC0,
63+
0x2200, 0x2400, 0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
64+
0x2C00, 0x2E00, 0x3000, 0x3200,
65+
0x3400, 0x3600, 0x3800, 0x3900,
66+
0x3A00, 0x3B00, 0x3BC0, 0x2000,
67+
0x2400, 0x2800, 0x2A00, 0x2C00,
68+
0x2E00, 0x3000, 0x3200, 0x3400,
69+
0x3600, 0x3800, 0x3900, 0x3A00,
70+
0x3B00, 0x3BC0, 0x2000, 0x2200,
71+
0x2800, 0x2A00, 0x2C00, 0x2E00,
72+
0x3000, 0x3200, 0x3400, 0x3600,
73+
0x3800, 0x3900, 0x3A00, 0x3B00,
74+
0x3BC0, 0x2000, 0x2200, 0x2400,
75+
0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
76+
0x3000, 0x2E00, 0x2C00, 0x2A00,
77+
0x3800, 0x3600, 0x3400, 0x3200,
78+
0x3BC0, 0x3B00, 0x3A00, 0x3900,
79+
0x2A00, 0x2800, 0x2400, 0x2200,
80+
0x3200, 0x3000, 0x2E00, 0x2C00,
81+
0x3900, 0x3800, 0x3600, 0x3400,
82+
0x2000, 0x3BC0, 0x3B00, 0x3A00,
83+
0x2C00, 0x2A00, 0x2800, 0x2400,
84+
0x3400, 0x3200, 0x3000, 0x2E00,
85+
0x3A00, 0x3900, 0x3800, 0x3600,
86+
0x2200, 0x2000, 0x3BC0, 0x3B00,
87+
0x2E00, 0x2C00, 0x2A00, 0x2800,
88+
0x3600, 0x3400, 0x3200, 0x3000,
89+
0x3B00, 0x3A00, 0x3900, 0x3800,
90+
0x2400, 0x2200, 0x2000, 0x3BC0 ]
91+
6292
- Name: Out1
6393
Format: Float16
64-
Stride: 8
65-
ZeroInitSize: 32
94+
Stride: 2
95+
# 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets
96+
FillSize: 64
6697
- Name: Out2
6798
Format: Float16
68-
Stride: 8
69-
ZeroInitSize: 32
99+
Stride: 4
100+
FillSize: 128
70101
- Name: Out3
71102
Format: Float16
72103
Stride: 8
73-
ZeroInitSize: 32
104+
FillSize: 256
74105
- Name: Out4
75106
Format: Float16
76107
Stride: 8
77-
ZeroInitSize: 32
108+
FillSize: 256
78109
- Name: Out5
79110
Format: Float16
80111
Stride: 8
81-
ZeroInitSize: 8
112+
FillSize: 8
113+
- Name: Masks
114+
Format: Int32
115+
Stride: 16
116+
# 4 active mask sets for threads 0, 1, 2, 3:
117+
# 0 0 0 0
118+
# 1 1 1 1
119+
# 1 0 0 0
120+
# 0 1 1 0
121+
Data: [
122+
0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
82123
- Name: ExpectedOut1
83124
Format: Float16
84125
Stride: 8
85-
# 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0
86-
Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ]
126+
# 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
127+
Data: [ 0x0, 0x0, 0x0, 0x0,
128+
0x2200, 0x2200, 0x2200, 0x2200,
129+
0x2400, 0x0, 0x0, 0x0,
130+
0x0, 0x3000, 0x3000, 0x0,
131+
0x0, 0x0, 0x0, 0x0,
132+
0x2A00, 0x2A00, 0x2A00, 0x2A00,
133+
0x2C00, 0x0, 0x0, 0x0,
134+
0x0, 0x3600, 0x3600, 0x0]
87135
- Name: ExpectedOut2
88136
Format: Float16
89137
Stride: 8
90-
# 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0
91-
Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ]
138+
# 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
139+
Data: [ 0x0, 0x0, 0x0, 0x0,
140+
0x0, 0x0, 0x0, 0x0,
141+
0x2200, 0x2400, 0x2200, 0x2400,
142+
0x2200, 0x2400, 0x2200, 0x2400,
143+
0x2400, 0x2800, 0x0, 0x0,
144+
0x0, 0x0, 0x0, 0x0,
145+
0x0, 0x0, 0x3000, 0x3200,
146+
0x3000, 0x3200, 0x0, 0x0,
147+
0x0, 0x0, 0x0, 0x0,
148+
0x0, 0x0, 0x0, 0x0,
149+
0x2A00, 0x2800, 0x2A00, 0x2800,
150+
0x2A00, 0x2800, 0x2A00, 0x2800,
151+
0x2C00, 0x2A00, 0x0, 0x0,
152+
0x0, 0x0, 0x0, 0x0,
153+
0x0, 0x0, 0x3600, 0x3400,
154+
0x3600, 0x3400, 0x0, 0x0 ]
92155
- Name: ExpectedOut3
93156
Format: Float16
94157
Stride: 8
95-
# 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0
96-
Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ]
158+
# 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
159+
# Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
160+
Data: [ 0x0, 0x0, 0x0, 0x0,
161+
0x0, 0x0, 0x0, 0x0,
162+
0x0, 0x0, 0x0, 0x0,
163+
0x0, 0x0, 0x0, 0x0,
164+
0x2200, 0x2400, 0x2800, 0x0,
165+
0x2200, 0x2400, 0x2800, 0x0,
166+
0x2200, 0x2400, 0x2800, 0x0,
167+
0x2200, 0x2400, 0x2800, 0x0,
168+
0x2400, 0x2800, 0x2A00, 0x0,
169+
0x0, 0x0, 0x0, 0x0,
170+
0x0, 0x0, 0x0, 0x0,
171+
0x0, 0x0, 0x0, 0x0,
172+
0x0, 0x0, 0x0, 0x0,
173+
0x3000, 0x3200, 0x3400, 0x0,
174+
0x3000, 0x3200, 0x3400, 0x0,
175+
0x0, 0x0, 0x0, 0x0,
176+
0x0, 0x0, 0x0, 0x0,
177+
0x0, 0x0, 0x0, 0x0,
178+
0x0, 0x0, 0x0, 0x0,
179+
0x0, 0x0, 0x0, 0x0,
180+
0x2A00, 0x2800, 0x2400, 0x0,
181+
0x2A00, 0x2800, 0x2400, 0x0,
182+
0x2A00, 0x2800, 0x2400, 0x0,
183+
0x2A00, 0x2800, 0x2400, 0x0,
184+
0x2C00, 0x2A00, 0x2800, 0x0,
185+
0x0, 0x0, 0x0, 0x0,
186+
0x0, 0x0, 0x0, 0x0,
187+
0x0, 0x0, 0x0, 0x0,
188+
0x0, 0x0, 0x0, 0x0,
189+
0x3600, 0x3400, 0x3200, 0x0,
190+
0x3600, 0x3400, 0x3200, 0x0,
191+
0x0, 0x0, 0x0, 0x0 ]
97192
- Name: ExpectedOut4
98193
Format: Float16
99194
Stride: 8
100-
# 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
101-
Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
195+
Data: [ 0x0, 0x0, 0x0, 0x0,
196+
0x0, 0x0, 0x0, 0x0,
197+
0x0, 0x0, 0x0, 0x0,
198+
0x0, 0x0, 0x0, 0x0,
199+
0x2200, 0x2400, 0x2800, 0x2A00,
200+
0x2200, 0x2400, 0x2800, 0x2A00,
201+
0x2200, 0x2400, 0x2800, 0x2A00,
202+
0x2200, 0x2400, 0x2800, 0x2A00,
203+
0x2400, 0x2800, 0x2A00, 0x2C00,
204+
0x0, 0x0, 0x0, 0x0,
205+
0x0, 0x0, 0x0, 0x0,
206+
0x0, 0x0, 0x0, 0x0,
207+
0x0, 0x0, 0x0, 0x0,
208+
0x3000, 0x3200, 0x3400, 0x3600,
209+
0x3000, 0x3200, 0x3400, 0x3600,
210+
0x0, 0x0, 0x0, 0x0,
211+
0x0, 0x0, 0x0, 0x0,
212+
0x0, 0x0, 0x0, 0x0,
213+
0x0, 0x0, 0x0, 0x0,
214+
0x0, 0x0, 0x0, 0x0,
215+
0x2A00, 0x2800, 0x2400, 0x2200,
216+
0x2A00, 0x2800, 0x2400, 0x2200,
217+
0x2A00, 0x2800, 0x2400, 0x2200,
218+
0x2A00, 0x2800, 0x2400, 0x2200,
219+
0x2C00, 0x2A00, 0x2800, 0x2400,
220+
0x0, 0x0, 0x0, 0x0,
221+
0x0, 0x0, 0x0, 0x0,
222+
0x0, 0x0, 0x0, 0x0,
223+
0x0, 0x0, 0x0, 0x0,
224+
0x3600, 0x3400, 0x3200, 0x3000,
225+
0x3600, 0x3400, 0x3200, 0x3000,
226+
0x0, 0x0, 0x0, 0x0 ]
102227
- Name: ExpectedOut5
103228
Format: Float16
104229
Stride: 8
105-
# 1, 2, 3, 4
106230
Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ]
107231
Results:
108232
- Result: ExpectedOut1
@@ -125,7 +249,6 @@ Results:
125249
Rule: BufferExact
126250
Actual: Out5
127251
Expected: ExpectedOut5
128-
129252
DescriptorSets:
130253
- Resources:
131254
- Name: In
@@ -170,19 +293,22 @@ DescriptorSets:
170293
Space: 0
171294
VulkanBinding:
172295
Binding: 5
296+
- Name: Masks
297+
Kind: StructuredBuffer
298+
DirectXBinding:
299+
Register: 6
300+
Space: 0
301+
VulkanBinding:
302+
Binding: 6
303+
173304
...
174305
#--- end
175306

176-
# REQUIRES: Half
177-
178-
# Bug https://github.com/llvm/offload-test-suite/issues/393
179-
# XFAIL: Metal
180-
181307
# Bug https://github.com/llvm/llvm-project/issues/156775
182308
# XFAIL: Clang
183309

184-
# Bug https://github.com/llvm/offload-test-suite/issues/433
185-
# XFAIL: DirectX-WARP
310+
# Bug https://github.com/llvm/offload-test-suite/issues/393
311+
# XFAIL: Metal
186312

187313
# RUN: split-file %s %t
188314
# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl

0 commit comments

Comments
 (0)