Skip to content
Open
94 changes: 80 additions & 14 deletions zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,19 @@ StructuredBuffer<zstdgpu_OffsetAndSize> ZstdInBlocksRefsTyped : re

StructuredBuffer<uint32_t> ZstdInGlobalBlockIndexTyped : register(t7);

[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)
// Index of the block that contains this thread group's first work item.
// Thread 0 of the group stores the result of its binary search here; the other threads
// read it back only after a GroupMemoryBarrierWithGroupSync().
groupshared uint32_t GS_GroupLeaderBlockIdx;

inline void GetDestinationInfo(uint32_t blockIdx,
uint32_t i,
// These are really just `out`:
ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) blockRef,
ZSTDGPU_PARAM_INOUT(uint32_t) byteIdx,
ZSTDGPU_PARAM_INOUT(zstdgpu_OffsetAndSize) dstFrameOffsetAndSize,
ZSTDGPU_PARAM_INOUT(uint32_t) dstBlockOffset)
{
i += zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;

if (i >= Constants.workItemCount)
return;

const uint32_t blockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, i);

const zstdgpu_OffsetAndSize blockRef = ZstdInBlocksRefsTyped[blockIdx];
blockRef = ZstdInBlocksRefsTyped[blockIdx];

const uint32_t byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];
byteIdx = i - ZstdInBlockSizePrefixTyped[blockIdx];

const uint32_t globalBlockIdx = ZstdInGlobalBlockIndexTyped[blockIdx];

Expand All @@ -74,10 +73,77 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId)

const uint32_t frameRelativeBlockOffset = globalBlockGlobalOffset - frameFirstBlockGlobalOffset;

const zstdgpu_OffsetAndSize dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];
dstFrameOffsetAndSize = ZstdInUnCompressedFramesRefs[frameIdx];

dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
}

[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")]
[numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)]
void main(uint2 groupId : SV_GroupId, uint threadIdInGroup : SV_GroupThreadId)
{
const uint32_t scaledGroupId = zstdgpu_ConvertTo32BitGroupId(groupId, Constants.tgOffset) * kzstdgpu_TgSizeX_MemsetMemcpy;
const uint32_t i = scaledGroupId + threadIdInGroup;
const uint32_t numActiveThreads = zstdgpu_MinU32(Constants.workItemCount - scaledGroupId, kzstdgpu_TgSizeX_MemsetMemcpy);

// This threadgroup will likely write far fewer blocks than kzstdgpu_TgSizeX_MemsetMemcpy, and commonly just a single block.
// So do most of the work via scalar instructions.
if (threadIdInGroup == 0)
{
const uint32_t groupLeaderBlockIdx = zstdgpu_BinarySearch(ZstdInBlockSizePrefixTyped, 0, Constants.blockCount, scaledGroupId + 0);
GS_GroupLeaderBlockIdx = groupLeaderBlockIdx;
}
GroupMemoryBarrierWithGroupSync();
if (i >= Constants.workItemCount)
return;

const uint32_t groupLeaderBlockIdx = WaveReadLaneFirst(GS_GroupLeaderBlockIdx);

// We do a linear search within the tail, instead of a binary search.
// If we did the latter and we ensured we don't track zero-decompress-size Raw/RLE blocks,
// this could be min()'d with numActiveThreads:
const uint32_t tailBlockCount = Constants.blockCount - groupLeaderBlockIdx; // includes leader

uint32_t iEndForGroupLeaderBlock = uint32_t(-1); // "infinity"
[branch] if (tailBlockCount >= 2)
{
iEndForGroupLeaderBlock = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + 1];
}

zstdgpu_OffsetAndSize blockRef;
uint32_t byteIdx;
zstdgpu_OffsetAndSize dstFrameOffsetAndSize;
uint32_t dstBlockOffset;

const uint32_t dstBlockOffset = dstFrameOffsetAndSize.offs + frameRelativeBlockOffset;
// The else path can handle any case, but try to pass a uniform blockIdx to GetDestinationInfo.
if (iEndForGroupLeaderBlock >= scaledGroupId + numActiveThreads)
{
GetDestinationInfo(groupLeaderBlockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
}
else
{
// Instead of a binary search, do a linear search under the assumption it should usually have few iterations
// from the start of the tail.
uint32_t intervalEnd = iEndForGroupLeaderBlock;
uint32_t r = 1; // index of interval _end_ relative to group leader index
for (;;)
{
if (i < intervalEnd)
{
break;
}
++r;
if (r >= tailBlockCount)
{
break;
}
intervalEnd = ZstdInBlockSizePrefixTyped[groupLeaderBlockIdx + r];
}
const uint32_t blockIdx = groupLeaderBlockIdx + (r - 1);
GetDestinationInfo(blockIdx, i, blockRef, byteIdx, dstFrameOffsetAndSize, dstBlockOffset);
}

// Shouldn't be needed for valid data since already checked Constants.workItemCount?
if (byteIdx >= dstFrameOffsetAndSize.size)
return;

Expand Down
6 changes: 4 additions & 2 deletions zstd/zstdgpu/zstdgpu_shaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,9 @@ static inline uint32_t zstdgpu_OrderedAppendIndex(ZSTDGPU_RW_BUFFER_GLC(uint32_t
return zstdgpu_GlobalExclusivePrefixSum(lookback, WavePrefixSum(threadAppendCnt), threadAppendCnt, globalThreadIdx, tgroupThreadCnt);
}

static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t threadId)
// Given a sorted ascending array A where A[i] is the beginning of an interval that ends (exclusive) at A[i+1] (or at "infinity" for i+1 == N),
// finds the index of the interval that contains target. Such an interval is assumed to exist.
static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSequence, uint32_t start, uint32_t count, uint32_t target)
{
uint32_t rangeBase = start;
uint32_t rangeSize = count;
Expand All @@ -211,7 +213,7 @@ static inline uint32_t zstdgpu_BinarySearch(ZSTDGPU_RO_BUFFER(uint32_t) sortedSe
const uint32_t rangeNext = rangeBase + rangeTest;

const uint32_t value = sortedSequence[rangeNext];
rangeBase = threadId < value ? rangeBase : rangeNext;
rangeBase = target < value ? rangeBase : rangeNext;
rangeSize -= rangeTest;
}

Expand Down
4 changes: 0 additions & 4 deletions zstd/zstdgpu/zstdgpu_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -438,11 +438,7 @@ static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 64;
static const uint32_t kzstdgpu_TgSizeX_FinaliseSequenceOffsets = 256;
#endif

#if defined(_GAMING_XBOX) || defined(__XBOX_SCARLETT) || defined(__XBOX_ONE)
static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 64;
#else
static const uint32_t kzstdgpu_TgSizeX_MemsetMemcpy = 32;
#endif

#define ZSTDGPU_TG_COUNT(elemCount, tgSize) (((elemCount) + (tgSize) - 1) / (tgSize))

Expand Down