From 273d1af59100d7aea3dd60ae68dfe47cdce8fdae Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Tue, 7 Apr 2026 10:42:00 -0700
Subject: [PATCH 1/7] zstd: minor refactor/cleanup in
 zstdgpu_ShaderEntry_ParseCompressedBlocks

---
 zstd/zstdgpu/zstdgpu_shaders.h | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index 3298f68..bbe74ce 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -1231,15 +1231,6 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
 }
 
 #ifdef __hlsl_dx_compiler
-
-    // Setup default FSE-indices we are going to propagate
-    uint32_t4 fseTableIndices = uint32_t4(
-        outBlockData.fseTableIndexHufW,
-        outBlockData.fseTableIndexLLen,
-        outBlockData.fseTableIndexOffs,
-        outBlockData.fseTableIndexMLen
-    );
-
     const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);
 
     const uint32_t thisBlockIndex = WaveReadLaneFirst(threadId / blockSize);
@@ -1489,7 +1480,7 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
     static uint32_t lastOffsIndex = kzstdgpu_FseProbTableIndex_Unused;
     static uint32_t lastMLenIndex = kzstdgpu_FseProbTableIndex_Unused;
 
-    #define PROPAGATE_FSE_HUF_INDEX(name) \
+    #define CPU_PROPAGATE_FSE_INDEX(name) \
     if (outBlockData.fseTableIndex##name < kzstdgpu_FseProbTableIndex_Repeat) \
     { \
         last##name##Index = outBlockData.fseTableIndex##name; \
@@ -1499,12 +1490,12 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
         outBlockData.fseTableIndex##name = last##name##Index; \
     }
 
-    PROPAGATE_FSE_HUF_INDEX(HufW)
-    PROPAGATE_FSE_HUF_INDEX(LLen)
-    PROPAGATE_FSE_HUF_INDEX(Offs)
-    PROPAGATE_FSE_HUF_INDEX(MLen)
+    CPU_PROPAGATE_FSE_INDEX(HufW)
+    CPU_PROPAGATE_FSE_INDEX(LLen)
+    CPU_PROPAGATE_FSE_INDEX(Offs)
+    CPU_PROPAGATE_FSE_INDEX(MLen)
 
-    #undef PROPAGATE_FSE_HUF_INDEX
+    #undef CPU_PROPAGATE_FSE_INDEX
 #endif
 
     if (0 != seqCount)

From fbc1c109542da6e3028c182cfcebbf9c556996c6 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Thu, 9 Apr 2026 19:45:16 -0700
Subject: [PATCH 2/7] ZstdGpuMemsetMemcpy: Increase kzstdgpu_TgSizeX_MemsetMemcpy
 from 32 to 64.

For one thing, this enables wave64 on RDNA3/RDNA4. Also, all waves in a
threadgroup are launched on the same compute unit, which can have L0/K$
benefits, so a larger threadgroup size can help in general; performance for
the insects archive does seem to improve further when trying 128 threads.
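As a rough sketch of the dispatch arithmetic (illustrative only: ZSTDGPU_TG_COUNT
is the ceil-div helper this patch leaves in zstdgpu_structs.h, and 18,860,773
bytes is the insects-archive raw-block total quoted in the next commit):

  #include <stdint.h>
  #define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))

  /* Threadgroup counts for the memset/memcpy dispatch at both sizes: */
  /* ZSTDGPU_TG_COUNT(18860773u, 32u) == 589400 groups, each one wave32      */
  /* ZSTDGPU_TG_COUNT(18860773u, 64u) == 294700 groups; each group can now   */
  /*                                    run as a single wave64 on RDNA3/RDNA4 */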
Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX: ~= 0.60
  RTX 4080: ~= 0.67
---
 zstd/zstdgpu/zstdgpu_structs.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h
index c442363..931dbc2 100644
--- a/zstd/zstdgpu/zstdgpu_structs.h
+++ b/zstd/zstdgpu/zstdgpu_structs.h
@@ -438,11 +438,7 @@ static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 64;
 static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 256;
 #endif
 
-#if defined(_GAMING_XBOX) || defined(__XBOX_SCARLETT) || defined(__XBOX_ONE)
 static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 64;
-#else
-static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 32;
-#endif
 
 #define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))
 

From f570ccb7a9147c92e2d6231c67916f234602abff Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Thu, 9 Apr 2026 19:42:18 -0700
Subject: [PATCH 3/7] ZstdGpuMemsetMemcpy: Scalarize optimization for frame
 binary search.

The insects archive has 233 raw blocks that sum to 18,860,773 bytes. It is
quite likely then that all threads in a wave write to the same block. Detect
this and scalarize the second zstdgpu_BinarySearch that looks up the frame
index.

Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX:
  - Against last commit (tgsize already 64): ~= 0.62
  - Against upstream: ~= 0.38
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 38 ++++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index 9599aa9..32fd427 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -62,21 +62,37 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
         globalBlockGlobalOffset = ZstdInBlockSizePrefix[globalBlockIdx - 1];
     }
 
-    const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, globalBlockIdx);
-
-    const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx];
+    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
+    uint32_t dstBlockOffset;
+
+    #define GET_FRAME_INFO(optGlobalBlockIdx) { \
+        const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, optGlobalBlockIdx);\
+        const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx]; \
+        uint32_t frameFirstBlockGlobalOffset = 0; \
+        [branch] if (frameFirstGlobalBlockIdx > 0) \
+        { \
+            frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1]; \
+        } \
+        const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset; \
+        dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx]; \
+        dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset; \
+    }
 
-    uint32_t frameFirstBlockGlobalOffset = 0;
-    [branch] if (frameFirstGlobalBlockIdx > 0)
+    // Detect the (likely) case all threads store within the same block, and if so use SMEM to do the frame lookup.
+    // Waterfall loop instead of if/else might be better.
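+    // WaveActiveAllEqual is true only when globalBlockIdx matches across all active lanes in the wave,
+    // and WaveReadLaneFirst then yields a value the compiler can treat as wave-uniform for the lookup.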
+    if (WaveActiveAllEqual(globalBlockIdx))
     {
-        frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1];
+        const uint32_t uniformGlobalBlockIdx = WaveReadLaneFirst(globalBlockIdx);
+        GET_FRAME_INFO(uniformGlobalBlockIdx);
+    }
+    else
+    {
+        GET_FRAME_INFO(globalBlockIdx);
     }
 
-    const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;
-
-    const zstdgpu_OffsetAndSize dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
-
-    const uint32_t dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+    #undef GET_FRAME_INFO
 
     if (byteIdx >= dstFrameOffsetAndSize.size)
         return;

From 0600427f49c4d5c3a4b21bc88706d03926a62d59 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Fri, 10 Apr 2026 12:57:01 -0700
Subject: [PATCH 4/7] ZstdGpuMemsetMemcpy: Try to scalarize the block size
 prefix sum array too.

This is a bit fiddly, and I don't love the added `if (blockSize == 0)
continue;` in `ParseFrame`.

Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX:
  - Against last commit (tgsize already 64 and had WaveActiveAllEqual from frame lookup): ~= 0.77
  - Against upstream: ~= 0.29
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 119 +++++++++++++-----
 zstd/zstdgpu/zstdgpu_shaders.h                |  13 +-
 2 files changed, 95 insertions(+), 37 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index 32fd427..e5c5a9e 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -39,20 +39,19 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re
 StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);
 
-[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
-[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
-void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
+groupshared uint32_t lds[kzstdgpu_TgSizeX_MemsetMemcpy];
+
+inline void GetDestinationInfo(uint32_t blockIdx,
+                               uint32_t i,
+                               // These are really just `out`:
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) blockRef,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) byteIdx,
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) dstFrameOffsetAndSize,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) dstBlockOffset)
 {
-    i += zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
-
-    if (i >= Constants.workItemCount)
-        return;
+    blockRef = ZstdInBlocksRefsTyped[blockIdx];
 
-    const uint32_t blockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, i);
-
-    const zstdgpu_OffsetAndSize blockRef = ZstdInBlocksRefsTyped[blockIdx];
-
-    const uint32_t byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
+    byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
 
     const uint32_t globalBlockIdx = ZstdInGlobalBlockIndexTyped[blockIdx];
 
@@ -62,38 +61,88 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
         globalBlockGlobalOffset = ZstdInBlockSizePrefix[globalBlockIdx - 1];
     }
 
-    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
-    uint32_t dstBlockOffset;
+    const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, globalBlockIdx);
+
+    const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx];
+
+    uint32_t frameFirstBlockGlobalOffset = 0;
+    [branch] if (frameFirstGlobalBlockIdx > 0)
+    {
+        frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1];
+    }
+
+    const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;
+
+    dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
 
-    #define GET_FRAME_INFO(optGlobalBlockIdx) { \
-        const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, optGlobalBlockIdx);\
-        const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx]; \
-        uint32_t frameFirstBlockGlobalOffset = 0; \
-        [branch] if (frameFirstGlobalBlockIdx > 0) \
-        { \
-            frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1]; \
-        } \
-        const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset; \
-        dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx]; \
-        dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset; \
-    }
+    dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+}
+
+[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
+[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
+void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
+{
+    const uint32_t scaledGroupId = zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
+    const uint32_t i = scaledGroupId + threadIdInGroup;
+    const uint32_t numActiveThreads = zstdgpu_MinU32(Constants.workItemCount - scaledGroupId, kzstdgpu_TgSizeX_MemsetMemcpy);
+
+    // There are likely far fewer blocks this threadgroup will write than kzstdgpu_TgSizeX_MemsetMemcpy, and commonly a single block.
+    // Do most of the work via scalar instructions.
+    if (threadIdInGroup == 0)
+    {
+        const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
+        lds[0] = groupLeaderBlockIdx;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(lds[0]);
+
+    uint32_t iEndForGroupLeaderBlock = uint32_t(-1);
+    [branch] if (Constants.blockCount >= 2)
+    {
+        iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
    }
 
-    // Detect the (likely) case all threads store within the same block, and if so use SMEM to do the frame lookup.
-    // Waterfall loop instead of if/else might be better.
-    // WaveActiveAllEqual is true only when globalBlockIdx matches across all active lanes in the wave,
-    // and WaveReadLaneFirst then yields a value the compiler can treat as wave-uniform for the lookup.
-    if (WaveActiveAllEqual(globalBlockIdx))
+    zstdgpu_OffsetAndSize blockRef;
+    uint32_t byteIdx;
+    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
+    uint32_t dstBlockOffset;
+
+    // The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
+    if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
     {
-        const uint32_t uniformGlobalBlockIdx = WaveReadLaneFirst(globalBlockIdx);
-        GET_FRAME_INFO(uniformGlobalBlockIdx);
+        if (i >= Constants.workItemCount)
+            return;
+
+        GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
     else
     {
-        GET_FRAME_INFO(globalBlockIdx);
+        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS,
+        // which should be sufficient since we only kept nonzero-size RLE/Raw blocks.
+        lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
+        GroupMemoryBarrierWithGroupSync();
+        if (i >= Constants.workItemCount)
+            return;
+
+        const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
+        uint32_t blockIdx;
+        // Instead of a binary search or linear search from the end, this does a linear search from the beginning.
+        // This has a more complicated loop exit test than a linear search from the end, but assuming block sizes are
+        // large compared to kzstdgpu_TgSizeX_MemsetMemcpy, the loop should iterate far fewer times.
+        for (uint32_t r = 0;; ++r)
+        {
+            if ((r == numActiveBlocks - 1) ||
+                (i >= lds[r] && i < lds[r + 1]))
+            {
+                blockIdx = groupLeaderBlockIdx + r;
+                break;
+            }
+        }
+
+        GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
 
-    #undef GET_FRAME_INFO
-
+    // Shouldn't be needed for valid data since we already checked Constants.workItemCount?
     if (byteIdx >= dstFrameOffsetAndSize.size)
         return;

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index bbe74ce..9c2f41c 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -200,7 +200,9 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
     return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
 }
 
-static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t threadId)
+// Returns leftmost index i such that inserting target _after_ i would keep the array sorted ascending.
+// Assumes such an i in [start, start + count) exists.
+static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
 {
     uint32_t rangeBase = start;
     uint32_t rangeSize = count;
@@ -211,7 +213,7 @@ static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSe
         const uint32_t rangeNext = rangeBase + rangeTest;
         const uint32_t value = sortedSequence[rangeNext];
 
-        rangeBase = threadId < value ? rangeBase : rangeNext;
+        rangeBase = target < value ? rangeBase : rangeNext;
         rangeSize -= rangeTest;
     }
 
@@ -419,6 +421,13 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
     const uint32_t blockType = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 2);
     const uint32_t blockSize = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 21);
 
+    // The main zstd decompressor seems to accept 0-size RLE and Raw blocks.
+    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
+    if (blockSize == 0)
+    {
+        continue;
+    }
+
     const bool isRaw = 0 == blockType;
     const bool isRle = 1 == blockType;
     const bool isCmp = 2 == blockType;

From 2b1cc47b6526ac789853ca59c927d87ad7f0c9c3 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 14:51:01 -0700
Subject: [PATCH 5/7] Revert "zstd: minor refactor/cleanup in
 zstdgpu_ShaderEntry_ParseCompressedBlocks"

This reverts commit 273d1af59100d7aea3dd60ae68dfe47cdce8fdae.
---
 zstd/zstdgpu/zstdgpu_shaders.h | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index 9c2f41c..bfb25af 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -1240,6 +1240,15 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
 }
 
 #ifdef __hlsl_dx_compiler
+
+    // Setup default FSE-indices we are going to propagate
+    uint32_t4 fseTableIndices = uint32_t4(
+        outBlockData.fseTableIndexHufW,
+        outBlockData.fseTableIndexLLen,
+        outBlockData.fseTableIndexOffs,
+        outBlockData.fseTableIndexMLen
+    );
+
     const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);
 
     const uint32_t thisBlockIndex = WaveReadLaneFirst(threadId / blockSize);
@@ -1489,7 +1498,7 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
     static uint32_t lastOffsIndex = kzstdgpu_FseProbTableIndex_Unused;
     static uint32_t lastMLenIndex = kzstdgpu_FseProbTableIndex_Unused;
 
-    #define CPU_PROPAGATE_FSE_INDEX(name) \
+    #define PROPAGATE_FSE_HUF_INDEX(name) \
     if (outBlockData.fseTableIndex##name < kzstdgpu_FseProbTableIndex_Repeat) \
     { \
         last##name##Index = outBlockData.fseTableIndex##name; \
@@ -1499,12 +1508,12 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
         outBlockData.fseTableIndex##name = last##name##Index; \
     }
 
-    CPU_PROPAGATE_FSE_INDEX(HufW)
-    CPU_PROPAGATE_FSE_INDEX(LLen)
-    CPU_PROPAGATE_FSE_INDEX(Offs)
-    CPU_PROPAGATE_FSE_INDEX(MLen)
+    PROPAGATE_FSE_HUF_INDEX(HufW)
+    PROPAGATE_FSE_HUF_INDEX(LLen)
+    PROPAGATE_FSE_HUF_INDEX(Offs)
+    PROPAGATE_FSE_HUF_INDEX(MLen)
 
-    #undef CPU_PROPAGATE_FSE_INDEX
+    #undef PROPAGATE_FSE_HUF_INDEX
 #endif
 
     if (0 != seqCount)

From b1e07c18f847143796b4bbf6d002ee02e4501b0c Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 15:52:27 -0700
Subject: [PATCH 6/7] Fix: Move the early 0-size continue to after advancing
 past the RLE filler value.

Optimization: remove the `i >= lds[r]` check.

---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 16 +++++++++-------
 zstd/zstdgpu/zstdgpu_shaders.h                | 19 ++++++++++---------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index e5c5a9e..b1202b0 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -117,8 +117,8 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     }
     else
     {
-        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS,
-        // which should be sufficient since we only kept nonzero-size RLE/Raw blocks.
+        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS.
+        // One VMEM load should be sufficient since we only tracked nonzero-decompressed-size RLE/Raw blocks in ParseFrame().
         lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
         GroupMemoryBarrierWithGroupSync();
         if (i >= Constants.workItemCount)
@@ -126,17 +126,19 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
 
         const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
         uint32_t blockIdx;
-        // Instead of a binary search or linear search from the end, this does a linear search from the beginning.
-        // This has a more complicated loop exit test than a linear search from the end, but assuming block sizes are
-        // large compared to kzstdgpu_TgSizeX_MemsetMemcpy, the loop should iterate far fewer times.
+        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations.
+        // Find leftmost interval such that (i >= lds[r] && i < lds[r + 1]).
+        // We already know i >= lds[0] (since that is true for the i of the group leader).
+        uint32_t intervalEnd = iEndForGroupLeaderBlock;
         for (uint32_t r = 0;; ++r)
         {
-            if ((r == numActiveBlocks - 1) ||
-                (i >= lds[r] && i < lds[r + 1]))
+            const uint32_t nextIterEnd = lds[(r + 2u) % kzstdgpu_TgSizeX_MemsetMemcpy];
+            if (r == numActiveBlocks - 1 || i < intervalEnd)
             {
                 blockIdx = groupLeaderBlockIdx + r;
                 break;
             }
+            intervalEnd = nextIterEnd;
         }
 
         GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index bfb25af..f8c481d 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -200,8 +200,8 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
     return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
 }
 
-// Returns leftmost index i such that inserting target _after_ i would keep the array sorted ascending.
-// Assumes such an i in [start, start + count) exists.
+// Given sorted ascending array A where A[i] is the beginning of an interval that ends (exclusive) at A[i+1] (or at "infinity" for i+1 == N),
+// finds the (assumed to exist) index of the interval that contains target.
 static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
 {
     uint32_t rangeBase = start;
     uint32_t rangeSize = count;
@@ -421,13 +421,6 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
     const uint32_t blockType = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 2);
     const uint32_t blockSize = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 21);
 
-    // The main zstd decompressor seems to accept 0-size RLE and Raw blocks.
-    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
-    if (blockSize == 0)
-    {
-        continue;
-    }
-
     const bool isRaw = 0 == blockType;
     const bool isRle = 1 == blockType;
     const bool isCmp = 2 == blockType;
@@ -443,6 +436,14 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
         zstdgpu_Forward_BitBuffer_Skip(bits, blockSize);
     }
 
+    // The main zstd decompressor seems to accept zero-decompressed-size RLE and Raw blocks.
+    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
+    // For RLE blocks this needs to be after extracting the value byte from the stream.
+    if (blockSize == 0)
+    {
+        continue;
+    }
+
     if (0 != outputBlockInfo)
     {
         const uint32_t blockIndex = outFrameInfo.rawBlockStart

From b849491b3a694c423b734ff317c2166b64002e12 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 16:42:38 -0700
Subject: [PATCH 7/7] Modified version that doesn't need special handling for
 potential zero-decompressed-size Raw/RLE blocks.

No per-thread LDS is used (there is still some LDS: one word per group, so we
don't have to care about wave size or things of that nature). K$ vs LDS
latency is perhaps comparable.
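As a sanity check on the loop below, here is a rough scalar C model of the
tail walk (hypothetical standalone code, not part of the patch; prefix[]
stands in for ZstdInBlockSizePrefixTyped, where prefix[k] is the byte offset
at which block k starts, and FindBlockInTail is a made-up name):

  #include <stdint.h>

  /* Returns the block index whose [prefix[k], prefix[k+1]) interval contains i,
   * starting the walk at the group leader's block. Mirrors the shader loop. */
  static uint32_t FindBlockInTail(const uint32_t *prefix, uint32_t blockCount,
                                  uint32_t leaderBlock, uint32_t i)
  {
      const uint32_t tailBlockCount = blockCount - leaderBlock; /* includes leader */
      uint32_t intervalEnd = (tailBlockCount >= 2) ? prefix[leaderBlock + 1]
                                                   : 0xFFFFFFFFu; /* "infinity" */
      uint32_t r = 1; /* index of interval _end_ relative to the leader */
      while (i >= intervalEnd)
      {
          ++r;
          if (r >= tailBlockCount)
              break; /* walked off the tail: i is in the last block */
          intervalEnd = prefix[leaderBlock + r];
      }
      return leaderBlock + (r - 1);
  }

A zero-decompressed-size block k only contributes an empty interval
(prefix[k] == prefix[k + 1]) that the `i >= intervalEnd` test walks straight
past, which is presumably why no special handling is needed any more.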
I think I picked LDS at first because I had planned to do a binary search
within the tail of `min(Constants.blockCount - groupLeaderBlockIdx,
numActiveThreads)` entries, which assumed there were no empty Raw/RLE blocks
at that point.
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 51 +++++++++----------
 zstd/zstdgpu/zstdgpu_shaders.h                |  8 ---
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index b1202b0..a4cd139 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -39,7 +39,7 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re
 StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);
 
-groupshared uint32_t lds[kzstdgpu_TgSizeX_MemsetMemcpy];
+groupshared uint32_t GS_GroupLeaderBlockIdx;
 
 inline void GetDestinationInfo(uint32_t blockIdx,
                                uint32_t i,
@@ -91,13 +91,21 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     if (threadIdInGroup == 0)
     {
         const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
-        lds[0] = groupLeaderBlockIdx;
+        GS_GroupLeaderBlockIdx = groupLeaderBlockIdx;
     }
     GroupMemoryBarrierWithGroupSync();
-    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(lds[0]);
+    if (i >= Constants.workItemCount)
+        return;
+
+    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(GS_GroupLeaderBlockIdx);
 
-    uint32_t iEndForGroupLeaderBlock = uint32_t(-1);
-    [branch] if (Constants.blockCount >= 2)
+    // We do a linear search within the tail, instead of a binary search.
+    // If we did the latter and we ensured we don't track zero-decompressed-size Raw/RLE blocks,
+    // this could be min()'d with numActiveThreads:
+    const uint32_t tailBlockCount = Constants.blockCount - groupLeaderBlockIdx; // includes leader
+
+    uint32_t iEndForGroupLeaderBlock = uint32_t(-1); // "infinity"
+    [branch] if (tailBlockCount >= 2)
     {
         iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
     }
@@ -110,37 +118,28 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     zstdgpu_OffsetAndSize blockRef;
     uint32_t byteIdx;
     zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
     uint32_t dstBlockOffset;
 
     // The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
     if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
     {
-        if (i >= Constants.workItemCount)
-            return;
-
         GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
     else
     {
-        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS.
-        // One VMEM load should be sufficient since we only tracked nonzero-decompressed-size RLE/Raw blocks in ParseFrame().
-        lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
-        GroupMemoryBarrierWithGroupSync();
-        if (i >= Constants.workItemCount)
-            return;
-
-        const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
-        uint32_t blockIdx;
-        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations.
-        // Find leftmost interval such that (i >= lds[r] && i < lds[r + 1]).
-        // We already know i >= lds[0] (since that is true for the i of the group leader).
+        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations
+        // from the start of the tail.
         uint32_t intervalEnd = iEndForGroupLeaderBlock;
-        for (uint32_t r = 0;; ++r)
+        uint32_t r = 1; // index of interval _end_ relative to group leader index
+        for (;;)
         {
-            const uint32_t nextIterEnd = lds[(r + 2u) % kzstdgpu_TgSizeX_MemsetMemcpy];
-            if (r == numActiveBlocks - 1 || i < intervalEnd)
+            if (i < intervalEnd)
             {
-                blockIdx = groupLeaderBlockIdx + r;
                 break;
             }
-            intervalEnd = nextIterEnd;
+            ++r;
+            if (r >= tailBlockCount)
+            {
+                break;
+            }
+            intervalEnd = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + r];
         }
+        const uint32_t blockIdx = groupLeaderBlockIdx + (r - 1);
         GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
 
diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index f8c481d..bc23598 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -436,14 +436,6 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
         zstdgpu_Forward_BitBuffer_Skip(bits, blockSize);
     }
 
-    // The main zstd decompressor seems to accept zero-decompressed-size RLE and Raw blocks.
-    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
-    // For RLE blocks this needs to be after extracting the value byte from the stream.
-    if (blockSize == 0)
-    {
-        continue;
-    }
-
     if (0 != outputBlockInfo)
     {
         const uint32_t blockIndex = outFrameInfo.rawBlockStart