diff --git a/test/WaveOps/WaveReadLaneFirst.128Threads.test b/test/WaveOps/WaveReadLaneFirst.128Threads.test new file mode 100644 index 00000000..e92d32e4 --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.128Threads.test @@ -0,0 +1,275 @@ +#--- source.hlsl +#define NUM_MASKS 1 +#define NUM_THREADS 128 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out4 : register(u4); // test int4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + +[WaveSize(32)] +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int4 v = In[MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(int4(1,2,3,4)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int32 + Stride: 16 + # 1 value set, 1 mask, 128 threads, each thread supplies int4(tid, tid+1, tid+2, tid+3) + Data: [ + 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10, + 8,9,10,11, 9,10,11,12, 10,11,12,13, 11,12,13,14, 12,13,14,15, 13,14,15,16, + 14,15,16,17, 15,16,17,18, 16,17,18,19, 17,18,19,20, 18,19,20,21, 19,20,21,22, + 20,21,22,23, 21,22,23,24, 22,23,24,25, 23,24,25,26, 24,25,26,27, 25,26,27,28, + 26,27,28,29, 27,28,29,30, 28,29,30,31, 29,30,31,32, 30,31,32,33, 31,32,33,34, + 32,33,34,35, 33,34,35,36, 34,35,36,37, 35,36,37,38, 36,37,38,39, 37,38,39,40, + 38,39,40,41, 39,40,41,42, 40,41,42,43, 41,42,43,44, 42,43,44,45, 43,44,45,46, + 44,45,46,47, 45,46,47,48, 46,47,48,49, 47,48,49,50, 48,49,50,51, 49,50,51,52, + 50,51,52,53, 51,52,53,54, 52,53,54,55, 53,54,55,56, 54,55,56,57, 55,56,57,58, + 56,57,58,59, 57,58,59,60, 58,59,60,61, 59,60,61,62, 60,61,62,63, 
61,62,63,64, + 62,63,64,65, 63,64,65,66, 64,65,66,67, 65,66,67,68, 66,67,68,69, 67,68,69,70, + 68,69,70,71, 69,70,71,72, 70,71,72,73, 71,72,73,74, 72,73,74,75, 73,74,75,76, + 74,75,76,77, 75,76,77,78, 76,77,78,79, 77,78,79,80, 78,79,80,81, 79,80,81,82, + 80,81,82,83, 81,82,83,84, 82,83,84,85, 83,84,85,86, 84,85,86,87, 85,86,87,88, + 86,87,88,89, 87,88,89,90, 88,89,90,91, 89,90,91,92, 90,91,92,93, 91,92,93,94, + 92,93,94,95, 93,94,95,96, 94,95,96,97, 95,96,97,98, 96,97,98,99, 97,98,99,100, + 98,99,100,101, 99,100,101,102, 100,101,102,103, 101,102,103,104, 102,103,104,105, + 103,104,105,106, 104,105,106,107, 105,106,107,108, 106,107,108,109, 107,108,109,110, + 108,109,110,111, 109,110,111,112, 110,111,112,113, 111,112,113,114, 112,113,114,115, + 113,114,115,116, 114,115,116,117, 115,116,117,118, 116,117,118,119, 117,118,119,120, + 118,119,120,121, 119,120,121,122, 120,121,122,123, 121,122,123,124, 122,123,124,125, + 123,124,125,126, 124,125,126,127, 125,126,127,128, 126,127,128,129, 127,128,129,130 + ] + + - Name: Out4 + Format: Int32 + Stride: 16 + FillSize: 2048 + - Name: Out5 + Format: Int32 + Stride: 16 + FillSize: 16 + - Name: Masks + Format: Int32 + Stride: 16 + Data: [ + 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 ] + + - Name: ExpectedOut4 + Format: Int32 + Stride: 16 + Data: [ 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 
1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 2, 3, 4, + 1, 2, 3, 4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 33, 34, 35, 36, + 33, 34, 35, 36, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 65, 66, 67, 68, + 65, 66, 67, 68, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0, + 0, 0, 0, 0, + 97, 98, 99, 100, + 97, 98, 99, 100, + 0, 0, 0, 0 ] + - Name: ExpectedOut5 + Format: Int32 + Stride: 8 + Data: [ 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + 
Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... +#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# Bug https://github.com/llvm/offload-test-suite/issues/611 +# UNSUPPORTED: Vulkan + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_6 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.fp16.test b/test/WaveOps/WaveReadLaneFirst.fp16.test new file mode 100644 index 00000000..f098e806 --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.fp16.test @@ -0,0 +1,318 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test half2 +RWStructuredBuffer Out3 : register(u3); // test half3 +RWStructuredBuffer Out4 : register(u4); // test half4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + half4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + 
Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(half4(1,2,3,4)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float16 + Stride: 8 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 0x3200, 0x3400, 0x3600, 0x3800, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x2200, 0x2400, 0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 0x2C00, 0x2E00, 0x3000, 0x3200, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x2E00, 0x3000, 0x3200, 0x3400, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x3B00, 0x3BC0, 0x2000, 0x2200, + 0x2800, 0x2A00, 0x2C00, 0x2E00, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x2000, 0x2200, 0x2400, + 0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 0x3000, 0x2E00, 0x2C00, 0x2A00, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x3200, 0x3000, 0x2E00, 0x2C00, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x3400, 0x3200, 0x3000, 0x2E00, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x2200, 0x2000, 0x3BC0, 0x3B00, + 0x2E00, 0x2C00, 0x2A00, 0x2800, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3B00, 
0x3A00, 0x3900, 0x3800, + 0x2400, 0x2200, 0x2000, 0x3BC0 ] + + - Name: Out1 + Format: Float16 + Stride: 2 + # 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 64 + - Name: Out2 + Format: Float16 + Stride: 4 + FillSize: 128 + - Name: Out3 + Format: Float16 + Stride: 8 + FillSize: 256 + - Name: Out4 + Format: Float16 + Stride: 8 + FillSize: 256 + - Name: Out5 + Format: Float16 + Stride: 8 + FillSize: 8 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Float16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x2200, 0x2200, 0x2200, 0x2200, + 0x2400, 0x0, 0x0, 0x0, + 0x0, 0x3000, 0x3000, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2A00, 0x2A00, 0x2A00, 0x2A00, + 0x2C00, 0x0, 0x0, 0x0, + 0x0, 0x3600, 0x3600, 0x0] + - Name: ExpectedOut2 + Format: Float16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2200, 0x2400, 0x2200, 0x2400, + 0x2200, 0x2400, 0x2200, 0x2400, + 0x2400, 0x2800, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3000, 0x3200, + 0x3000, 0x3200, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2A00, 0x2800, 0x2A00, 0x2800, + 0x2A00, 0x2800, 0x2A00, 0x2800, + 0x2C00, 0x2A00, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3600, 0x3400, + 0x3600, 0x3400, 0x0, 0x0 ] + - Name: ExpectedOut3 + Format: Float16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2200, 0x2400, 0x2800, 0x0, + 0x2200, 0x2400, 0x2800, 0x0, + 0x2200, 
0x2400, 0x2800, 0x0, + 0x2200, 0x2400, 0x2800, 0x0, + 0x2400, 0x2800, 0x2A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3000, 0x3200, 0x3400, 0x0, + 0x3000, 0x3200, 0x3400, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2A00, 0x2800, 0x2400, 0x0, + 0x2A00, 0x2800, 0x2400, 0x0, + 0x2A00, 0x2800, 0x2400, 0x0, + 0x2A00, 0x2800, 0x2400, 0x0, + 0x2C00, 0x2A00, 0x2800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3400, 0x3200, 0x0, + 0x3600, 0x3400, 0x3200, 0x0, + 0x0, 0x0, 0x0, 0x0 ] + - Name: ExpectedOut4 + Format: Float16 + Stride: 8 + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x0, 0x0, 0x0, 0x0 ] + - Name: ExpectedOut5 + Format: Float16 + Stride: 8 + Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: 
BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... +#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# XFAIL: WARP +# Bug https://github.com/llvm/offload-test-suite/issues/433 + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.fp32.test b/test/WaveOps/WaveReadLaneFirst.fp32.test new file mode 100644 index 00000000..7e417a2d --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.fp32.test @@ -0,0 +1,315 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test float2 +RWStructuredBuffer Out3 : register(u3); // test float3 +RWStructuredBuffer Out4 : register(u4); // test float4 
+RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + float4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(float4(1.5,2.5,3.5,4.5)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float32 + Stride: 16 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9.5, 10.5, 11.5, 12.5, + 13.5, 14.5, 15.5, 16.5, + 2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6.5, 7.5, 8.5, 9.5, + 10.5, 11.5, 12.5, 13.5, + 14.5, 15.5, 16.5, 1.5, + 3.5, 4.5, 5.5, 6.5, + 7.5, 8.5, 9.5, 10.5, + 11.5, 12.5, 13.5, 14.5, + 15.5, 16.5, 1.5, 2.5, + 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, + 12.5, 13.5, 14.5, 15.5, + 16.5, 1.5, 2.5, 3.5, + 4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8.5, 7.5, 6.5, 5.5, + 
12.5, 11.5, 10.5, 9.5, + 16.5, 15.5, 14.5, 13.5, + 5.5, 4.5, 3.5, 2.5, + 9.5, 8.5, 7.5, 6.5, + 13.5, 12.5, 11.5, 10.5, + 1.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 10.5, 9.5, 8.5, 7.5, + 14.5, 13.5, 12.5, 11.5, + 2.5, 1.5, 16.5, 15.5, + 7.5, 6.5, 5.5, 4.5, + 11.5, 10.5, 9.5, 8.5, + 15.5, 14.5, 13.5, 12.5, + 3.5, 2.5, 1.5, 16 ] + + - Name: Out1 + Format: Float32 + Stride: 4 + # 1 float is 4 bytes, * 4 floats for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 128 + - Name: Out2 + Format: Float32 + Stride: 8 + FillSize: 256 + - Name: Out3 + Format: Float32 + Stride: 16 + FillSize: 512 + - Name: Out4 + Format: Float32 + Stride: 16 + FillSize: 512 + - Name: Out5 + Format: Float32 + Stride: 16 + FillSize: 16 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Float32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 2.5, 2.5, 2.5, 2.5, + 3.5, 0, 0, 0, + 0, 8.5, 8.5, 0, + 0, 0, 0, 0, + 5.5, 5.5, 5.5, 5.5, + 6.5, 0, 0, 0, + 0, 11.5, 11.5, 0 ] + - Name: ExpectedOut2 + Format: Float32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 2.5, 3.5, + 2.5, 3.5, 2.5, 3.5, + 3.5, 4.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 8.5, 9.5, + 8.5, 9.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 5.5, 4.5, + 5.5, 4.5, 5.5, 4.5, + 6.5, 5.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 11.5, 10.5, + 11.5, 10.5, 0, 0 ] + - Name: ExpectedOut3 + Format: Float32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 4.5, 0, + 2.5, 3.5, 4.5, 0, + 2.5, 
3.5, 4.5, 0, + 2.5, 3.5, 4.5, 0, + 3.5, 4.5, 5.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8.5, 9.5, 10.5, 0, + 8.5, 9.5, 10.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 6.5, 5.5, 4.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11.5, 10.5, 9.5, 0, + 11.5, 10.5, 9.5, 0, + 0, 0, 0, 0] + - Name: ExpectedOut4 + Format: Float32 + Stride: 16 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 3.5, 4.5, 5.5, 6.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8.5, 9.5, 10.5, 11.5, + 8.5, 9.5, 10.5, 11.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 6.5, 5.5, 4.5, 3.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11.5, 10.5, 9.5, 8.5, + 11.5, 10.5, 9.5, 8.5, + 0, 0, 0, 0] + - Name: ExpectedOut5 + Format: Float32 + Stride: 8 + Data: [ 1.5, 2.5, 3.5, 4.5 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer 
+ DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... +#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.fp64.test b/test/WaveOps/WaveReadLaneFirst.fp64.test new file mode 100644 index 00000000..f9bfdc39 --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.fp64.test @@ -0,0 +1,318 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test double2 +RWStructuredBuffer Out3 : register(u3); // test double3 +RWStructuredBuffer Out4 : register(u4); // test double4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + double4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = 
WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(double4(1.5,2.5,3.5,4.5)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float64 + Stride: 32 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9.5, 10.5, 11.5, 12.5, + 13.5, 14.5, 15.5, 16.5, + 2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6.5, 7.5, 8.5, 9.5, + 10.5, 11.5, 12.5, 13.5, + 14.5, 15.5, 16.5, 1.5, + 3.5, 4.5, 5.5, 6.5, + 7.5, 8.5, 9.5, 10.5, + 11.5, 12.5, 13.5, 14.5, + 15.5, 16.5, 1.5, 2.5, + 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, + 12.5, 13.5, 14.5, 15.5, + 16.5, 1.5, 2.5, 3.5, + 4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8.5, 7.5, 6.5, 5.5, + 12.5, 11.5, 10.5, 9.5, + 16.5, 15.5, 14.5, 13.5, + 5.5, 4.5, 3.5, 2.5, + 9.5, 8.5, 7.5, 6.5, + 13.5, 12.5, 11.5, 10.5, + 1.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 10.5, 9.5, 8.5, 7.5, + 14.5, 13.5, 12.5, 11.5, + 2.5, 1.5, 16.5, 15.5, + 7.5, 6.5, 5.5, 4.5, + 11.5, 10.5, 9.5, 8.5, + 15.5, 14.5, 13.5, 12.5, + 3.5, 2.5, 1.5, 16 ] + + - Name: Out1 + Format: Float64 + Stride: 4 + # 1 double is 8 bytes, * 4 doubles for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 256 + - Name: Out2 + Format: Float64 + Stride: 8 + FillSize: 512 + - Name: Out3 + Format: Float64 + Stride: 16 + FillSize: 1024 + - Name: Out4 + Format: Float64 + Stride: 16 + FillSize: 
1024 + - Name: Out5 + Format: Float64 + Stride: 32 + FillSize: 32 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Float64 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 2.5, 2.5, 2.5, 2.5, + 3.5, 0, 0, 0, + 0, 8.5, 8.5, 0, + 0, 0, 0, 0, + 5.5, 5.5, 5.5, 5.5, + 6.5, 0, 0, 0, + 0, 11.5, 11.5, 0 ] + - Name: ExpectedOut2 + Format: Float64 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 2.5, 3.5, + 2.5, 3.5, 2.5, 3.5, + 3.5, 4.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 8.5, 9.5, + 8.5, 9.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 5.5, 4.5, + 5.5, 4.5, 5.5, 4.5, + 6.5, 5.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 11.5, 10.5, + 11.5, 10.5, 0, 0 ] + - Name: ExpectedOut3 + Format: Float64 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 4.5, 0, + 2.5, 3.5, 4.5, 0, + 2.5, 3.5, 4.5, 0, + 2.5, 3.5, 4.5, 0, + 3.5, 4.5, 5.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8.5, 9.5, 10.5, 0, + 8.5, 9.5, 10.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 5.5, 4.5, 3.5, 0, + 6.5, 5.5, 4.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11.5, 10.5, 9.5, 0, + 11.5, 10.5, 9.5, 0, + 0, 0, 0, 0] + - Name: ExpectedOut4 + Format: Float64 + Stride: 16 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 2.5, 3.5, 4.5, 5.5, + 3.5, 4.5, 
5.5, 6.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8.5, 9.5, 10.5, 11.5, + 8.5, 9.5, 10.5, 11.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 5.5, 4.5, 3.5, 2.5, + 6.5, 5.5, 4.5, 3.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11.5, 10.5, 9.5, 8.5, + 11.5, 10.5, 9.5, 8.5, + 0, 0, 0, 0] + - Name: ExpectedOut5 + Format: Float64 + Stride: 8 + Data: [ 1.5, 2.5, 3.5, 4.5 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... 
+#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# XFAIL: WARP +# Bug https://github.com/llvm/offload-test-suite/issues/433 + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.int16.test b/test/WaveOps/WaveReadLaneFirst.int16.test new file mode 100644 index 00000000..29f1b0ae --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.int16.test @@ -0,0 +1,315 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int16_t2 +RWStructuredBuffer Out3 : register(u3); // test int16_t3 +RWStructuredBuffer Out4 : register(u4); // test int16_t4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int16_t4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(int16_t4(1,2,3,4)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int16 + Stride: 8 + # 2 value sets + # For 
each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + + - Name: Out1 + Format: Int16 + Stride: 2 + # 1 int16_t is 2 bytes, * 4 int16_t values for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 64 + - Name: Out2 + Format: Int16 + Stride: 4 + FillSize: 128 + - Name: Out3 + Format: Int16 + Stride: 8 + FillSize: 256 + - Name: Out4 + Format: Int16 + Stride: 8 + FillSize: 256 + - Name: Out5 + Format: Int16 + Stride: 8 + FillSize: 8 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Int16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 2, 2, 2, 2, + 3, 0, 0, 0, + 0, 8, 8, 0, + 0, 0, 0, 0, + 5, 5, 5, 5, + 6, 0, 0, 0, + 0, 11, 11, 0 ] + - Name: ExpectedOut2 + Format: Int16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 
threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 2, 3, + 2, 3, 2, 3, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 8, 9, + 8, 9, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 5, 4, + 5, 4, 5, 4, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 11, 10, + 11, 10, 0, 0 ] + - Name: ExpectedOut3 + Format: Int16 + Stride: 8 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 0, + 8, 9, 10, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 0, + 11, 10, 9, 0, + 0, 0, 0, 0 ] + - Name: ExpectedOut4 + Format: Int16 + Stride: 8 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 8, 9, 10, 11, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 8, + 11, 10, 9, 8, + 0, 0, 0, 0 ] + - Name: ExpectedOut5 + Format: Int16 + Stride: 8 + Data: [ 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 
+DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... +#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.int32.test b/test/WaveOps/WaveReadLaneFirst.int32.test new file mode 100644 index 00000000..29680fc8 --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.int32.test @@ -0,0 +1,315 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int2 +RWStructuredBuffer Out3 : register(u3); // test int3 +RWStructuredBuffer Out4 : register(u4); // test int4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; 
ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // row offset of this value set, shared by In and Out1-Out4 + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int4 v = In[ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(int4(1,2,3,4)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int32 + Stride: 16 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + + - Name: Out1 + Format: Int32 + Stride: 4 + # 1 int is 4 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 128 + - 
Name: Out2 + Format: 
Int32 + Stride: 8 + FillSize: 256 + - Name: Out3 + Format: Int32 + Stride: 16 + FillSize: 512 + - Name: Out4 + Format: Int32 + Stride: 16 + FillSize: 512 + - Name: Out5 + Format: Int32 + Stride: 16 + FillSize: 16 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Int32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 2, 2, 2, 2, + 3, 0, 0, 0, + 0, 8, 8, 0, + 0, 0, 0, 0, + 5, 5, 5, 5, + 6, 0, 0, 0, + 0, 11, 11, 0 ] + - Name: ExpectedOut2 + Format: Int32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 2, 3, + 2, 3, 2, 3, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 8, 9, + 8, 9, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 5, 4, + 5, 4, 5, 4, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 11, 10, + 11, 10, 0, 0 ] + - Name: ExpectedOut3 + Format: Int32 + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 0, + 8, 9, 10, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 0, + 11, 10, 9, 0, + 0, 0, 0, 0 ] + - Name: ExpectedOut4 + Format: Int32 + Stride: 16 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 
11, + 8, 9, 10, 11, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 8, + 11, 10, 9, 8, + 0, 0, 0, 0 ] + - Name: ExpectedOut5 + Format: Int32 + Stride: 8 + Data: [ 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... 
+#--- end + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveReadLaneFirst.int64.test b/test/WaveOps/WaveReadLaneFirst.int64.test new file mode 100644 index 00000000..f8afaf05 --- /dev/null +++ b/test/WaveOps/WaveReadLaneFirst.int64.test @@ -0,0 +1,320 @@ +#--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int64_t2 +RWStructuredBuffer Out3 : register(u3); // test int64_t3 +RWStructuredBuffer Out4 : register(u4); // test int64_t4 +RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + + +[numthreads(NUM_THREADS,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // row offset of this value set, shared by In and Out1-Out4 + for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int64_t4 v = In[ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveReadLaneFirst( v.x ); + Out2[OutIdx].xy = WaveReadLaneFirst( v.xy ); + Out3[OutIdx].xyz = WaveReadLaneFirst( v.xyz ); + Out4[OutIdx] = WaveReadLaneFirst( v ); + } + } + } + + // constant folding case + Out5[0] = WaveReadLaneFirst(int64_t4(1,2,3,4)); +} + + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int64 + Stride: 32 + # 2 value sets + # For each value set, + # and for each 
specific one of the 4 thread masks in that 
value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + + - Name: Out1 + Format: Int64 + Stride: 8 + # 1 int is 8 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets + FillSize: 256 + - Name: Out2 + Format: Int64 + Stride: 16 + FillSize: 512 + - Name: Out3 + Format: Int64 + Stride: 32 + FillSize: 1024 + - Name: Out4 + Format: Int64 + Stride: 32 + FillSize: 1024 + - Name: Out5 + Format: Int64 + Stride: 32 + FillSize: 32 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] + - Name: ExpectedOut1 + Format: Int64 + Stride: 32 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 2, 2, 2, 2, + 3, 0, 0, 0, + 0, 8, 8, 0, + 0, 0, 0, 0, + 5, 5, 5, 5, + 6, 0, 0, 0, + 0, 11, 11, 0 ] + - Name: ExpectedOut2 + Format: Int64 + Stride: 32 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 
0, 0, 0, + 2, 3, 2, 3, + 2, 3, 2, 3, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 8, 9, + 8, 9, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 5, 4, + 5, 4, 5, 4, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 11, 10, + 11, 10, 0, 0 ] + - Name: ExpectedOut3 + Format: Int64 + Stride: 32 + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 2, 3, 4, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 0, + 8, 9, 10, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 5, 4, 3, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 0, + 11, 10, 9, 0, + 0, 0, 0, 0 ] + - Name: ExpectedOut4 + Format: Int64 + Stride: 32 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 2, 3, 4, 5, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 8, 9, 10, 11, + 8, 9, 10, 11, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 5, 4, 3, 2, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 11, 10, 9, 8, + 11, 10, 9, 8, + 0, 0, 0, 0 ] + - Name: ExpectedOut5 + Format: Int64 + Stride: 16 + Data: [ 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: 
StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + +... +#--- end + +# REQUIRES: Int64 + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + +# Bug https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal + +# XFAIL: WARP +# Bug https://github.com/llvm/offload-test-suite/issues/433 + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o