Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ struct Consts
uint32_t workItemCount;
uint32_t blockCount;
uint32_t frameCount;
uint32_t flags;
};

ConstantBuffer<Consts> Constants : register(b0);
Expand All @@ -39,7 +38,7 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re

StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);

[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=4)")]
[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
{
Expand Down Expand Up @@ -81,14 +80,16 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
if (byteIdx >= dstFrameOffsetAndSize.size)
return;

[branch] if (Constants.flags & 0x1u)
uint value;
[branch] if (!(blockRef.offs & kzstdgpu_RLEBlock_OffsetFlag))
{
const uint32_t byteOfs = blockRef.offs + byteIdx;

ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = (ZstdInCompressedData[byteOfs >> 2u] >> ((byteOfs & 3u) << 3u)) & 0xffu;
value = (ZstdInCompressedData[byteOfs >> 2u] >> ((byteOfs & 3u) << 3u)) & 0xffu;
}
else
{
ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = blockRef.offs;
value = blockRef.offs & 0xffu; // strip kzstdgpu_RLEBlock_OffsetFlag
}
ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = value;
}
2 changes: 1 addition & 1 deletion zstd/zstdgpu/Shaders/ZstdGpuParseFrames.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ ZSTDGPU_PARSE_FRAMES_SRT()
#define __XBOX_ENABLE_WAVE32 1
#endif

[RootSignature("DescriptorTable(SRV(t0, numDescriptors=2), UAV(u0, numDescriptors=17)), RootConstants(b0, num32BitConstants=3)")]
[RootSignature("DescriptorTable(SRV(t0, numDescriptors=2), UAV(u0, numDescriptors=12)), RootConstants(b0, num32BitConstants=3)")]
[numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)]
void main(uint i : SV_DispatchThreadId)
{
Expand Down
17 changes: 6 additions & 11 deletions zstd/zstdgpu/Shaders/ZstdGpuUpdateDispatchArgs.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -29,34 +29,30 @@ struct UpdateDispatchArgsConsts
uint32_t stage;

uint32_t cmpBlockCountMax;
uint32_t rawBlockCountMax;
uint32_t rleBlockCountMax;
uint32_t rrBlockCountMax;
uint32_t litByteCountMax;
uint32_t seqElemCountMax;
};

ConstantBuffer<UpdateDispatchArgsConsts> Consts : register(b0);

[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), RootConstants(b0, num32BitConstants=7)")]
[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), RootConstants(b0, num32BitConstants=6)")]
[numthreads(1, 1, 1)]
void main()
{
if (Consts.stage == 0)
{
// Block-count dependent slots (valid after Stage 0 ParseFrames :: Count Blocks)
const uint32_t cmpBlockCount = ZstdCounters[0].Blocks_CMP;
const uint32_t rawBlockCount = ZstdCounters[0].Blocks_RAW;
const uint32_t rleBlockCount = ZstdCounters[0].Blocks_RLE;
const uint32_t allBlockCount = rawBlockCount
+ rleBlockCount
const uint32_t rrBlockCount = ZstdCounters[0].Blocks_RR;
const uint32_t allBlockCount = rrBlockCount
+ cmpBlockCount;


// the arguments dependent on block counts/sizes -- these could be computed after ParseFrames
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_ComputePrefixSum, cmpBlockCount, kzstdgpu_TgSizeX_PrefixSum_LiteralCount);
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_PrefixBlockSizes, allBlockCount, kzstdgpu_TgSizeX_PrefixSum);
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemcpyRAW, ZstdCounters[0].BlocksBytes_RAW, kzstdgpu_TgSizeX_MemsetMemcpy);
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemsetRLE, ZstdCounters[0].BlocksBytes_RLE, kzstdgpu_TgSizeX_MemsetMemcpy);
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemsetMemcpyRR, ZstdCounters[0].BlocksBytes_RR, kzstdgpu_TgSizeX_MemsetMemcpy);
zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_ParseCompressedBlocks, cmpBlockCount, kzstdgpu_TgSizeX_ParseCompressedBlocks);

// Memset dispatch slots for InitResources Stage 1
Expand All @@ -67,8 +63,7 @@ void main()

const uint32_t predicateMask = 0
| (cmpBlockCount > Consts.cmpBlockCountMax ? (1u << 0u) : 0u)
| (rawBlockCount > Consts.rawBlockCountMax ? (1u << 1u) : 0u)
| (rleBlockCount > Consts.rleBlockCountMax ? (1u << 2u) : 0u);
| (rrBlockCount > Consts.rrBlockCountMax ? (1u << 1u) : 0u);

ZstdPredicate[0] = predicateMask; // lower 32-bits of Stage 1 predicate
ZstdPredicate[2] = predicateMask; // lower 32-bits of Stage 2 predicate
Expand Down
Loading