diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
index 9599aa9..a4cd139 100644
--- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
+++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
@@ -39,20 +39,19 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : register(t6);
 StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);
 
-[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
-[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
-void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
+groupshared uint32_t GS_GroupLeaderBlockIdx;
+
+inline void GetDestinationInfo(uint32_t blockIdx,
+                               uint32_t i,
+                               // These are really just `out`:
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) blockRef,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) byteIdx,
+                               ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) dstFrameOffsetAndSize,
+                               ZSTDGPU_PARAM_INOUT(uint32_t) dstBlockOffset)
 {
-    i += zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
-
-    if (i >= Constants.workItemCount)
-        return;
-
-    const uint32_t blockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, i);
-
-    const zstdgpu_OffsetAndSize blockRef = ZstdInBlocksRefsTyped[blockIdx];
+    blockRef = ZstdInBlocksRefsTyped[blockIdx];
 
-    const uint32_t byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
+    byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
 
     const uint32_t globalBlockIdx = ZstdInGlobalBlockIndexTyped[blockIdx];
 
@@ -74,10 +73,77 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
     const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;
 
-    const zstdgpu_OffsetAndSize dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
+    dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
+
+    dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+}
+
+[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
+[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
+void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
+{
+    const uint32_t scaledGroupId = zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
+    const uint32_t i = scaledGroupId + threadIdInGroup;
+    const uint32_t numActiveThreads = zstdgpu_MinU32(Constants.workItemCount - scaledGroupId, kzstdgpu_TgSizeX_MemsetMemcpy);
+
+    // This threadgroup will likely write far fewer blocks than kzstdgpu_TgSizeX_MemsetMemcpy, commonly just a single one.
+    // Do most of the work via scalar instructions.
+    if (threadIdInGroup == 0)
+    {
+        const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
+        GS_GroupLeaderBlockIdx = groupLeaderBlockIdx;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    if (i >= Constants.workItemCount)
+        return;
+
+    const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(GS_GroupLeaderBlockIdx);
+
+    // We do a linear search within the tail instead of a binary search.
+    // If we did the latter, and ensured we don't track zero-decompress-size Raw/RLE blocks,
+    // this could be min()'d with numActiveThreads:
+    const uint32_t tailBlockCount = Constants.blockCount - groupLeaderBlockIdx; // includes leader
+
+    uint32_t iEndForGroupLeaderBlock = uint32_t(-1); // "infinity"
+    [branch] if (tailBlockCount >= 2)
+    {
+        iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
+    }
+
+    zstdgpu_OffsetAndSize blockRef;
+    uint32_t byteIdx;
+    zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
+    uint32_t dstBlockOffset;
-    const uint32_t dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
+    // The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
+    if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
+    {
+        GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
+    }
+    else
+    {
+        // Instead of a binary search, do a linear search from the start of the tail,
+        // under the assumption that it usually needs only a few iterations.
+        uint32_t intervalEnd = iEndForGroupLeaderBlock;
+        uint32_t r = 1; // index of interval _end_ relative to group leader index
+        for (;;)
+        {
+            if (i < intervalEnd)
+            {
+                break;
+            }
+            ++r;
+            if (r >= tailBlockCount)
+            {
+                break;
+            }
+            intervalEnd = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + r];
+        }
+        const uint32_t blockIdx = groupLeaderBlockIdx + (r - 1);
+        GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
+    }
+    // Shouldn't be needed for valid data, since Constants.workItemCount was already checked?
     if (byteIdx >= dstFrameOffsetAndSize.size)
         return;
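
[Reviewer note: a minimal scalar C sketch of the mapping a threadgroup performs above, handy for checking the logic on the CPU: one binary search finds the group leader's block, then a linear walk through the prefix-sum tail resolves threads that spill into later blocks. All names here (FindLeaderBlock, FindBlockFromLeader, prefix, the 8-thread group size) are illustrative, not part of the patch, and the halving loop is assumed to match zstdgpu_BinarySearch's, whose loop header lies outside the hunks below.]

/*
 * Scalar reference for the threadgroup mapping above (illustrative names).
 * prefix[b] is the first work item of block b; block b ends (exclusively)
 * at prefix[b + 1], mirroring ZstdInBlockSizePrefixTyped.
 */
#include <stdint.h>
#include <stdio.h>

/* Assumed to use the same halving scheme as zstdgpu_BinarySearch. */
static uint32_t FindLeaderBlock(const uint32_t *prefix, uint32_t blockCount, uint32_t target)
{
    uint32_t rangeBase = 0;
    uint32_t rangeSize = blockCount;
    while (rangeSize > 1)
    {
        const uint32_t rangeTest = rangeSize / 2;
        const uint32_t rangeNext = rangeBase + rangeTest;
        rangeBase = target < prefix[rangeNext] ? rangeBase : rangeNext;
        rangeSize -= rangeTest;
    }
    return rangeBase;
}

/* Linear walk from the leader block, mirroring the shader's tail search. */
static uint32_t FindBlockFromLeader(const uint32_t *prefix, uint32_t blockCount,
                                    uint32_t leaderBlockIdx, uint32_t i)
{
    uint32_t r = leaderBlockIdx + 1;
    while (r < blockCount && i >= prefix[r])
        ++r;
    return r - 1;
}

int main(void)
{
    const uint32_t prefix[] = { 0, 5, 6 }; /* blocks of 5, 1 and "the rest" work items */
    const uint32_t blockCount = 3;
    const uint32_t tgSize = 8;             /* stand-in for kzstdgpu_TgSizeX_MemsetMemcpy */
    for (uint32_t i = 0; i < 16; ++i)
    {
        const uint32_t scaledGroupId = (i / tgSize) * tgSize;
        const uint32_t leader = FindLeaderBlock(prefix, blockCount, scaledGroupId);
        printf("work item %2u -> block %u\n", (unsigned)i,
               (unsigned)FindBlockFromLeader(prefix, blockCount, leader, i));
    }
    return 0;
}
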
diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index 3298f68..bc23598 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -200,7 +200,9 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
     return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
 }
 
-static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t threadId)
+// Given a sorted ascending array A where A[i] is the beginning of an interval whose exclusive end is A[i+1] (or "infinity" for i+1 == N),
+// finds the index of the interval that contains target. Such an interval is assumed to exist.
+static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
 {
     uint32_t rangeBase = start;
     uint32_t rangeSize = count;
@@ -211,7 +213,7 @@ static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t threadId)
         const uint32_t rangeNext = rangeBase + rangeTest;
         const uint32_t value = sortedSequence[rangeNext];
 
-        rangeBase = threadId < value ? rangeBase : rangeNext;
+        rangeBase = target < value ? rangeBase : rangeNext;
         rangeSize -= rangeTest;
     }
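
[Reviewer note: a worked example of the contract documented above: with A = {0, 5, 6}, start = 0 and count = 3, the intervals are [0, 5), [5, 6) and [6, "infinity"), so target 4 returns index 0, target 5 returns index 1, and target 9 returns index 2. This is the invariant the memset/memcpy shader relies on when it seeds the group leader's block index from the block-size prefix sums.]
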
diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h
index c442363..931dbc2 100644
--- a/zstd/zstdgpu/zstdgpu_structs.h
+++ b/zstd/zstdgpu/zstdgpu_structs.h
@@ -438,11 +438,7 @@ static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 64;
 static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 256;
 #endif
 
-#if defined(_GAMING_XBOX) || defined(__XBOX_SCARLETT) || defined(__XBOX_ONE)
 static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 64;
-#else
-static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 32;
-#endif
 
 #define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))
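
[Reviewer note: with the platform split removed, kzstdgpu_TgSizeX_MemsetMemcpy is 64 on every target, so the host side presumably dispatches ZSTDGPU_TG_COUNT(workItemCount, 64) threadgroups for this pass. A quick sanity check of the ceiling division; the macro body is copied from the header above, and the work-item counts are illustrative:]

#include <assert.h>
#include <stdint.h>

#define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))

static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 64; /* unified value from this patch */

int main(void)
{
    /* Ceiling division: 1000 work items at 64 threads/group -> 16 groups. */
    assert(ZSTDGPU_TG_COUNT(1000u, kzstdgpu_TgSizeX_MemsetMemcpy) == 16u);
    assert(ZSTDGPU_TG_COUNT(64u, kzstdgpu_TgSizeX_MemsetMemcpy) == 1u);
    assert(ZSTDGPU_TG_COUNT(65u, kzstdgpu_TgSizeX_MemsetMemcpy) == 2u);
    return 0;
}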