From 273d1af59100d7aea3dd60ae68dfe47cdce8fdae Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Tue, 7 Apr 2026 10:42:00 -0700
Subject: [PATCH 1/7] zstd: minor refactor/cleanup in
 zstdgpu_ShaderEntry_ParseCompressedBlocks

---
 zstd/zstdgpu/zstdgpu_shaders.h | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index 3298f68..bbe74ce 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -1231,15 +1231,6 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
 }
 
 #ifdef __hlsl_dx_compiler
-
-    // Setup default FSE-indices we are going to propagate
-    uint32_t4 fseTableIndices = uint32_t4(
-        outBlockData.fseTableIndexHufW,
-        outBlockData.fseTableIndexLLen,
-        outBlockData.fseTableIndexOffs,
-        outBlockData.fseTableIndexMLen
-    );
-
     const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);
 
     const uint32_t thisBlockIndex = WaveReadLaneFirst(threadId / blockSize);
@@ -1489,7 +1480,7 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
     static uint32_t lastOffsIndex = kzstdgpu_FseProbTableIndex_Unused;
     static uint32_t lastMLenIndex = kzstdgpu_FseProbTableIndex_Unused;
 
-    #define PROPAGATE_FSE_HUF_INDEX(name) \
+    #define CPU_PROPAGATE_FSE_INDEX(name) \
     if (outBlockData.fseTableIndex##name < kzstdgpu_FseProbTableIndex_Repeat) \
     { \
         last##name##Index = outBlockData.fseTableIndex##name; \
@@ -1499,12 +1490,12 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
         outBlockData.fseTableIndex##name = last##name##Index; \
     }
 
-    PROPAGATE_FSE_HUF_INDEX(HufW)
-    PROPAGATE_FSE_HUF_INDEX(LLen)
-    PROPAGATE_FSE_HUF_INDEX(Offs)
-    PROPAGATE_FSE_HUF_INDEX(MLen)
+    CPU_PROPAGATE_FSE_INDEX(HufW)
+    CPU_PROPAGATE_FSE_INDEX(LLen)
+    CPU_PROPAGATE_FSE_INDEX(Offs)
+    CPU_PROPAGATE_FSE_INDEX(MLen)
 
-    #undef PROPAGATE_FSE_HUF_INDEX
+    #undef CPU_PROPAGATE_FSE_INDEX
 #endif
 
     if (0 != seqCount)

From fbc1c109542da6e3028c182cfcebbf9c556996c6 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Thu, 9 Apr 2026 19:45:16 -0700
Subject: [PATCH 2/7] ZstdGpuMemsetMemcpy: Increase kzstdgpu_TgSizeX_MemsetMemcpy
 from 32 to 64.

For one thing, this enables wave64 on RDNA3/RDNA4. Also, all waves in a
threadgroup are launched on the same compute unit, which can have L0/K$
benefits, so a larger threadgroup size can help in general; performance for
the insects archive does seem to improve further when trying 128 threads.
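As a rough sketch of the dispatch arithmetic (illustrative only: ZSTDGPU_TG_COUNT
is the ceil-div helper this patch leaves in zstdgpu_structs.h, and 18,860,773
bytes is the insects-archive raw-block total quoted in the next commit):

  #include <stdint.h>
  #define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))

  /* Threadgroup counts for the memset/memcpy dispatch at both sizes: */
  /* ZSTDGPU_TG_COUNT(18860773u, 32u) == 589400 groups, each one wave32      */
  /* ZSTDGPU_TG_COUNT(18860773u, 64u) == 294700 groups; each group can now   */
  /*                                    run as a single wave64 on RDNA3/RDNA4 */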
Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX: ~= 0.60
  RTX 4080: ~= 0.67
---
 zstd/zstdgpu/zstdgpu_structs.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h
index c442363..931dbc2 100644
--- a/zstd/zstdgpu/zstdgpu_structs.h
+++ b/zstd/zstdgpu/zstdgpu_structs.h
@@ -438,11 +438,7 @@ static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 64;
 static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 256;
 #endif
 
-#if defined(_GAMING_XBOX) || defined(__XBOX_SCARLETT) || defined(__XBOX_ONE)
 static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 64;
-#else
-static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 32;
-#endif
 
 #define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))
 

From f570ccb7a9147c92e2d6231c67916f234602abff Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Thu, 9 Apr 2026 19:42:18 -0700
Subject: [PATCH 3/7] ZstdGpuMemsetMemcpy: Scalarize optimization for frame
 binary search.

The insects archive has 233 raw blocks that sum to 18,860,773 bytes. It is
quite likely then that all threads in a wave write to the same block. Detect
this and scalarize the second zstdgpu_BinarySearch that looks up the frame
index.

Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX:
  - Against last commit (tgsize already 64): ~= 0.62
  - Against upstream: ~= 0.38
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 38 ++++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index 9599aa9..32fd427 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -62,21 +62,37 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
         globalBlockGlobalOffset = ZstdInBlockSizePrefix[globalBlockIdx - 1];
     }
 
-    const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, globalBlockIdx);
-
-    const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx];
+    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
+    uint32_t dstBlockOffset;
+
+    #define GET_FRAME_INFO(optGlobalBlockIdx) { \
+        const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, optGlobalBlockIdx);\
+        const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx]; \
+        uint32_t frameFirstBlockGlobalOffset = 0; \
+        [branch] if (frameFirstGlobalBlockIdx > 0) \
+        { \
+            frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1]; \
+        } \
+        const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset; \
+        dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx]; \
+        dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset; \
+    }
 
-    uint32_t frameFirstBlockGlobalOffset = 0;
-    [branch] if (frameFirstGlobalBlockIdx > 0)
+    // Detect the (likely) case all threads store within the same block, and if so use SMEM to do the frame lookup.
+    // Waterfall loop instead of if/else might be better.
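+    // WaveActiveAllEqual is true only when globalBlockIdx matches across all active lanes in the wave,
+    // and WaveReadLaneFirst then yields a value the compiler can treat as wave-uniform for the lookup.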
+    if (WaveActiveAllEqual(globalBlockIdx))
     {
-        frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1];
+        const uint32_t uniformGlobalBlockIdx = WaveReadLaneFirst(globalBlockIdx);
+        GET_FRAME_INFO(uniformGlobalBlockIdx);
+    }
+    else
+    {
+        GET_FRAME_INFO(globalBlockIdx);
     }
 
-    const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;
-
-    const zstdgpu_OffsetAndSize dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
-
-    const uint32_t dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+    #undef GET_FRAME_INFO
 
     if (byteIdx >= dstFrameOffsetAndSize.size)
         return;

From 0600427f49c4d5c3a4b21bc88706d03926a62d59 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Fri, 10 Apr 2026 12:57:01 -0700
Subject: [PATCH 4/7] ZstdGpuMemsetMemcpy: Try to scalarize the block size
 prefix sum array too.

This is a bit fiddly, and I don't love the added `if (blockSize == 0)
continue;` in `ParseFrame`.

Performance of [Memcpy RAW blocks, Memset RLE blocks]:
new_duration/old_duration for the insects archive. Clocks were fixed at some
non-specific frequency, but timings were fairly consistent:
  7900 XTX:
  - Against last commit (tgsize already 64 and had WaveActiveAllEqual from frame lookup): ~= 0.77
  - Against upstream: ~= 0.29
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 119 +++++++++++++-----
 zstd/zstdgpu/zstdgpu_shaders.h                |  13 +-
 2 files changed, 95 insertions(+), 37 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index 32fd427..e5c5a9e 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -39,20 +39,19 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re
 StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);
 
-[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
-[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
-void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
+groupshared uint32_t lds[kzstdgpu_TgSizeX_MemsetMemcpy];
+
+inline void GetDestinationInfo(uint32_t blockIdx,
+                               uint32_t i,
+                               // These are really just `out`:
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) blockRef,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) byteIdx,
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) dstFrameOffsetAndSize,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) dstBlockOffset)
 {
-    i += zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
-
-    if (i >= Constants.workItemCount)
-        return;
+    blockRef = ZstdInBlocksRefsTyped[blockIdx];
 
-    const uint32_t blockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, i);
-
-    const zstdgpu_OffsetAndSize blockRef = ZstdInBlocksRefsTyped[blockIdx];
-
-    const uint32_t byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
+    byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
 
     const uint32_t globalBlockIdx = ZstdInGlobalBlockIndexTyped[blockIdx];
 
@@ -62,38 +61,88 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
         globalBlockGlobalOffset = ZstdInBlockSizePrefix[globalBlockIdx - 1];
     }
 
-    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
-    uint32_t dstBlockOffset;
+    const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, globalBlockIdx);
+
+    const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx];
+
+    uint32_t frameFirstBlockGlobalOffset = 0;
+    [branch] if (frameFirstGlobalBlockIdx > 0)
+    {
+        frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1];
+    }
+
+    const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;
+
+    dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
 
-    #define GET_FRAME_INFO(optGlobalBlockIdx) { \
-        const uint32_t frameIdx = zstdgpu_BinarySearch(ZstdInPerFrameBlockCountAll, 0, Constants.frameCount, optGlobalBlockIdx);\
-        const uint32_t frameFirstGlobalBlockIdx = ZstdInPerFrameBlockCountAll[frameIdx]; \
-        uint32_t frameFirstBlockGlobalOffset = 0; \
-        [branch] if (frameFirstGlobalBlockIdx > 0) \
-        { \
-            frameFirstBlockGlobalOffset = ZstdInBlockSizePrefix[frameFirstGlobalBlockIdx - 1]; \
-        } \
-        const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset; \
-        dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx]; \
-        dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset; \
-    }
+    dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+}
+
+[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
+[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
+void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
+{
+    const uint32_t scaledGroupId = zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
+    const uint32_t i = scaledGroupId + threadIdInGroup;
+    const uint32_t numActiveThreads = zstdgpu_MinU32(Constants.workItemCount - scaledGroupId, kzstdgpu_TgSizeX_MemsetMemcpy);
+
+    // There are likely far fewer blocks this threadgroup will write than kzstdgpu_TgSizeX_MemsetMemcpy, and commonly a single block.
+    // Do most of the work via scalar instructions.
+    if (threadIdInGroup == 0)
+    {
+        const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
+        lds[0] = groupLeaderBlockIdx;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(lds[0]);
+
+    uint32_t iEndForGroupLeaderBlock = uint32_t(-1);
+    [branch] if (Constants.blockCount >= 2)
+    {
+        iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
    }
 
-    // Detect the (likely) case all threads store within the same block, and if so use SMEM to do the frame lookup.
-    // Waterfall loop instead of if/else might be better.
-    // WaveActiveAllEqual is true only when globalBlockIdx matches across all active lanes in the wave,
-    // and WaveReadLaneFirst then yields a value the compiler can treat as wave-uniform for the lookup.
-    if (WaveActiveAllEqual(globalBlockIdx))
+    zstdgpu_OffsetAndSize blockRef;
+    uint32_t byteIdx;
+    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
+    uint32_t dstBlockOffset;
+
+    // The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
+    if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
     {
-        const uint32_t uniformGlobalBlockIdx = WaveReadLaneFirst(globalBlockIdx);
-        GET_FRAME_INFO(uniformGlobalBlockIdx);
+        if (i >= Constants.workItemCount)
+            return;
+
+        GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
     else
     {
-        GET_FRAME_INFO(globalBlockIdx);
+        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS,
+        // which should be sufficient since we only kept nonzero-size RLE/Raw blocks.
+        lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
+        GroupMemoryBarrierWithGroupSync();
+        if (i >= Constants.workItemCount)
+            return;
+
+        const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
+        uint32_t blockIdx;
+        // Instead of a binary search or linear search from the end, this does a linear search from the beginning.
+        // This has a more complicated loop exit test than a linear search from the end, but assuming block sizes are
+        // large compared to kzstdgpu_TgSizeX_MemsetMemcpy, the loop should iterate far fewer times.
+        for (uint32_t r = 0;; ++r)
+        {
+            if ((r == numActiveBlocks - 1) ||
+                (i >= lds[r] && i < lds[r + 1]))
+            {
+                blockIdx = groupLeaderBlockIdx + r;
+                break;
+            }
+        }
+
+        GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
 
-    #undef GET_FRAME_INFO
-
+    // Shouldn't be needed for valid data since we already checked Constants.workItemCount?
     if (byteIdx >= dstFrameOffsetAndSize.size)
         return;

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index bbe74ce..9c2f41c 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -200,7 +200,9 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
     return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
 }
 
-static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t threadId)
+// Returns leftmost index i such that inserting target _after_ i would keep the array sorted ascending.
+// Assumes such an i in [start, start + count) exists.
+static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
 {
     uint32_t rangeBase = start;
     uint32_t rangeSize = count;
@@ -211,7 +213,7 @@ static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSe
         const uint32_t rangeNext = rangeBase + rangeTest;
         const uint32_t value = sortedSequence[rangeNext];
 
-        rangeBase = threadId < value ? rangeBase : rangeNext;
+        rangeBase = target < value ? rangeBase : rangeNext;
         rangeSize -= rangeTest;
     }
 
@@ -419,6 +421,13 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
     const uint32_t blockType = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 2);
     const uint32_t blockSize = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 21);
 
+    // The main zstd decompressor seems to accept 0-size RLE and Raw blocks.
+    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
+    if (blockSize == 0)
+    {
+        continue;
+    }
+
     const bool isRaw = 0 == blockType;
     const bool isRle = 1 == blockType;
     const bool isCmp = 2 == blockType;

From 2b1cc47b6526ac789853ca59c927d87ad7f0c9c3 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 14:51:01 -0700
Subject: [PATCH 5/7] Revert "zstd: minor refactor/cleanup in
 zstdgpu_ShaderEntry_ParseCompressedBlocks"

This reverts commit 273d1af59100d7aea3dd60ae68dfe47cdce8fdae.
---
 zstd/zstdgpu/zstdgpu_shaders.h | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index 9c2f41c..bfb25af 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -1240,6 +1240,15 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
 }
 
 #ifdef __hlsl_dx_compiler
+
+    // Setup default FSE-indices we are going to propagate
+    uint32_t4 fseTableIndices = uint32_t4(
+        outBlockData.fseTableIndexHufW,
+        outBlockData.fseTableIndexLLen,
+        outBlockData.fseTableIndexOffs,
+        outBlockData.fseTableIndexMLen
+    );
+
     const uint32_t blockSize = min(WaveGetLaneCount(), kzstdgpu_TgSizeX_ParseCompressedBlocks);
 
     const uint32_t thisBlockIndex = WaveReadLaneFirst(threadId / blockSize);
@@ -1489,7 +1498,7 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
     static uint32_t lastOffsIndex = kzstdgpu_FseProbTableIndex_Unused;
     static uint32_t lastMLenIndex = kzstdgpu_FseProbTableIndex_Unused;
 
-    #define CPU_PROPAGATE_FSE_INDEX(name) \
+    #define PROPAGATE_FSE_HUF_INDEX(name) \
     if (outBlockData.fseTableIndex##name < kzstdgpu_FseProbTableIndex_Repeat) \
     { \
         last##name##Index = outBlockData.fseTableIndex##name; \
@@ -1499,12 +1508,12 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp
         outBlockData.fseTableIndex##name = last##name##Index; \
     }
 
-    CPU_PROPAGATE_FSE_INDEX(HufW)
-    CPU_PROPAGATE_FSE_INDEX(LLen)
-    CPU_PROPAGATE_FSE_INDEX(Offs)
-    CPU_PROPAGATE_FSE_INDEX(MLen)
+    PROPAGATE_FSE_HUF_INDEX(HufW)
+    PROPAGATE_FSE_HUF_INDEX(LLen)
+    PROPAGATE_FSE_HUF_INDEX(Offs)
+    PROPAGATE_FSE_HUF_INDEX(MLen)
 
-    #undef CPU_PROPAGATE_FSE_INDEX
+    #undef PROPAGATE_FSE_HUF_INDEX
 #endif
 
     if (0 != seqCount)

From b1e07c18f847143796b4bbf6d002ee02e4501b0c Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 15:52:27 -0700
Subject: [PATCH 6/7] Fix: Move the early 0-size continue to after advancing
 past the RLE filler value.

Optimization: remove the `i >= lds[r]` check.

---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 16 +++++++++-------
 zstd/zstdgpu/zstdgpu_shaders.h                | 19 ++++++++++---------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index e5c5a9e..b1202b0 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -117,8 +117,8 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     }
     else
     {
-        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS,
-        // which should be sufficient since we only kept nonzero-size RLE/Raw blocks.
+        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS.
+        // One VMEM load should be sufficient since we only tracked nonzero-decompressed-size RLE/Raw blocks in ParseFrame().
         lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
         GroupMemoryBarrierWithGroupSync();
         if (i >= Constants.workItemCount)
@@ -126,17 +126,19 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
 
         const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
         uint32_t blockIdx;
-        // Instead of a binary search or linear search from the end, this does a linear search from the beginning.
-        // This has a more complicated loop exit test than a linear search from the end, but assuming block sizes are
-        // large compared to kzstdgpu_TgSizeX_MemsetMemcpy, the loop should iterate far fewer times.
+        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations.
+        // Find leftmost interval such that (i >= lds[r] && i < lds[r + 1]).
+        // We already know i >= lds[0] (since that is true for the i of the group leader).
+        uint32_t intervalEnd = iEndForGroupLeaderBlock;
         for (uint32_t r = 0;; ++r)
         {
-            if ((r == numActiveBlocks - 1) ||
-                (i >= lds[r] && i < lds[r + 1]))
+            const uint32_t nextIterEnd = lds[(r + 2u) % kzstdgpu_TgSizeX_MemsetMemcpy];
+            if (r == numActiveBlocks - 1 || i < intervalEnd)
             {
                 blockIdx = groupLeaderBlockIdx + r;
                 break;
             }
+            intervalEnd = nextIterEnd;
         }
 
         GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);

diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index bfb25af..f8c481d 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -200,8 +200,8 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
     return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
 }
 
-// Returns leftmost index i such that inserting target _after_ i would keep the array sorted ascending.
-// Assumes such an i in [start, start + count) exists.
+// Given sorted ascending array A where A[i] is the beginning of an interval that ends (exclusive) at A[i+1] (or at "infinity" for i+1 == N),
+// finds the (assumed to exist) index of the interval that contains target.
 static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
 {
     uint32_t rangeBase = start;
     uint32_t rangeSize = count;
@@ -421,13 +421,6 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
     const uint32_t blockType = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 2);
     const uint32_t blockSize = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 21);
 
-    // The main zstd decompressor seems to accept 0-size RLE and Raw blocks.
-    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
-    if (blockSize == 0)
-    {
-        continue;
-    }
-
     const bool isRaw = 0 == blockType;
     const bool isRle = 1 == blockType;
     const bool isCmp = 2 == blockType;
@@ -443,6 +436,14 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
         zstdgpu_Forward_BitBuffer_Skip(bits, blockSize);
     }
 
+    // The main zstd decompressor seems to accept zero-decompressed-size RLE and Raw blocks.
+    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
+    // For RLE blocks this needs to be after extracting the value byte from the stream.
+    if (blockSize == 0)
+    {
+        continue;
+    }
+
     if (0 != outputBlockInfo)
     {
         const uint32_t blockIndex = outFrameInfo.rawBlockStart

From b849491b3a694c423b734ff317c2166b64002e12 Mon Sep 17 00:00:00 2001
From: Jonathan-Weinstein-AMD
Date: Mon, 13 Apr 2026 16:42:38 -0700
Subject: [PATCH 7/7] Modified version that doesn't need special handling for
 potential zero-decompressed-size Raw/RLE blocks.

No per-thread LDS is used (there is still some LDS: one word per group, so we
don't have to care about wave size or things of that nature). K$ vs LDS
latency is perhaps comparable.
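As a sanity check on the loop below, here is a rough scalar C model of the
tail walk (hypothetical standalone code, not part of the patch; prefix[]
stands in for ZstdInBlockSizePrefixTyped, where prefix[k] is the byte offset
at which block k starts, and FindBlockInTail is a made-up name):

  #include <stdint.h>

  /* Returns the block index whose [prefix[k], prefix[k+1]) interval contains i,
   * starting the walk at the group leader's block. Mirrors the shader loop. */
  static uint32_t FindBlockInTail(const uint32_t *prefix, uint32_t blockCount,
                                  uint32_t leaderBlock, uint32_t i)
  {
      const uint32_t tailBlockCount = blockCount - leaderBlock; /* includes leader */
      uint32_t intervalEnd = (tailBlockCount >= 2) ? prefix[leaderBlock + 1]
                                                   : 0xFFFFFFFFu; /* "infinity" */
      uint32_t r = 1; /* index of interval _end_ relative to the leader */
      while (i >= intervalEnd)
      {
          ++r;
          if (r >= tailBlockCount)
              break; /* walked off the tail: i is in the last block */
          intervalEnd = prefix[leaderBlock + r];
      }
      return leaderBlock + (r - 1);
  }

A zero-decompressed-size block k only contributes an empty interval
(prefix[k] == prefix[k + 1]) that the `i >= intervalEnd` test walks straight
past, which is presumably why no special handling is needed any more.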
I think I picked LDS at first because I had planned to do a binary search
within the tail of `min(Constants.blockCount - groupLeaderBlockIdx,
numActiveThreads)` entries, which assumed there were no empty Raw/RLE blocks
at that point.
---
 zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl | 51 +++++++++----------
 zstd/zstdgpu/zstdgpu_shaders.h                |  8 ---
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index b1202b0..a4cd139 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -39,7 +39,7 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re
 StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);
 
-groupshared uint32_t lds[kzstdgpu_TgSizeX_MemsetMemcpy];
+groupshared uint32_t GS_GroupLeaderBlockIdx;
 
 inline void GetDestinationInfo(uint32_t blockIdx,
                                uint32_t i,
@@ -91,13 +91,21 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     if (threadIdInGroup == 0)
     {
         const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
-        lds[0] = groupLeaderBlockIdx;
+        GS_GroupLeaderBlockIdx = groupLeaderBlockIdx;
     }
     GroupMemoryBarrierWithGroupSync();
-    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(lds[0]);
+    if (i >= Constants.workItemCount)
+        return;
+
+    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(GS_GroupLeaderBlockIdx);
 
-    uint32_t iEndForGroupLeaderBlock = uint32_t(-1);
-    [branch] if (Constants.blockCount >= 2)
+    // We do a linear search within the tail, instead of a binary search.
+    // If we did the latter and we ensured we don't track zero-decompressed-size Raw/RLE blocks,
+    // this could be min()'d with numActiveThreads:
+    const uint32_t tailBlockCount = Constants.blockCount - groupLeaderBlockIdx; // includes leader
+
+    uint32_t iEndForGroupLeaderBlock = uint32_t(-1); // "infinity"
+    [branch] if (tailBlockCount >= 2)
     {
         iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
     }
@@ -110,37 +118,28 @@ void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
     zstdgpu_OffsetAndSize blockRef;
     uint32_t byteIdx;
     zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
     uint32_t dstBlockOffset;
 
     // The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
     if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
     {
-        if (i >= Constants.workItemCount)
-            return;
-
         GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
     else
     {
-        // After using SMEM to narrow down the block range, do one VMEM load per-wave into LDS.
-        // One VMEM load should be sufficient since we only tracked nonzero-decompressed-size RLE/Raw blocks in ParseFrame().
-        lds[threadIdInGroup] = ZstdInBlockSizePrefixTyped[zstdgpu_MinU32(groupLeaderBlockIdx + threadIdInGroup, Constants.blockCount - 1)];
-        GroupMemoryBarrierWithGroupSync();
-        if (i >= Constants.workItemCount)
-            return;
-
-        const uint32_t numActiveBlocks = zstdgpu_MinU32(Constants.blockCount - groupLeaderBlockIdx, numActiveThreads);
-        uint32_t blockIdx;
-        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations.
-        // Find leftmost interval such that (i >= lds[r] && i < lds[r + 1]).
-        // We already know i >= lds[0] (since that is true for the i of the group leader).
+        // Instead of a binary search, do a linear search under the assumption it should usually have few iterations
+        // from the start of the tail.
         uint32_t intervalEnd = iEndForGroupLeaderBlock;
-        for (uint32_t r = 0;; ++r)
+        uint32_t r = 1; // index of interval _end_ relative to group leader index
+        for (;;)
         {
-            const uint32_t nextIterEnd = lds[(r + 2u) % kzstdgpu_TgSizeX_MemsetMemcpy];
-            if (r == numActiveBlocks - 1 || i < intervalEnd)
+            if (i < intervalEnd)
             {
-                blockIdx = groupLeaderBlockIdx + r;
                 break;
             }
-            intervalEnd = nextIterEnd;
+            ++r;
+            if (r >= tailBlockCount)
+            {
+                break;
+            }
+            intervalEnd = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + r];
         }
+        const uint32_t blockIdx = groupLeaderBlockIdx + (r - 1);
         GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
     }
 
diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index f8c481d..bc23598 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -436,14 +436,6 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr
         zstdgpu_Forward_BitBuffer_Skip(bits, blockSize);
     }
 
-    // The main zstd decompressor seems to accept zero-decompressed-size RLE and Raw blocks.
-    // ZstdGpuMemsetMemcpy.hlsl doesn't handle that so don't track them.
-    // For RLE blocks this needs to be after extracting the value byte from the stream.
-    if (blockSize == 0)
-    {
-        continue;
-    }
-
     if (0 != outputBlockInfo)
     {
         const uint32_t blockIndex = outFrameInfo.rawBlockStart