diff --git a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl index 9599aa9..41035c8 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuMemsetMemcpy.hlsl @@ -22,7 +22,6 @@ struct Consts uint32_t workItemCount; uint32_t blockCount; uint32_t frameCount; - uint32_t flags; }; ConstantBuffer Constants : register(b0); @@ -39,7 +38,7 @@ StructuredBuffer ZstdInBlocksRefsTyped : re StructuredBuffer ZstdInGlobalBlockIndexTyped : register(t7); -[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=5)")] +[RootSignature("DescriptorTable(SRV(t0, numDescriptors=4), UAV(u0, numDescriptors=1)), SRV(t4), SRV(t5), SRV(t6), SRV(t7), RootConstants(b0, num32BitConstants=4)")] [numthreads(kzstdgpu_TgSizeX_MemsetMemcpy, 1, 1)] void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId) { @@ -81,14 +80,16 @@ void main(uint2 groupId : SV_GroupId, uint i : SV_GroupThreadId) if (byteIdx >= dstFrameOffsetAndSize.size) return; - [branch] if (Constants.flags & 0x1u) + uint value; + [branch] if (!(blockRef.offs & kzstdgpu_RLEBlock_OffsetFlag)) { const uint32_t byteOfs = blockRef.offs + byteIdx; - ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = (ZstdInCompressedData[byteOfs >> 2u] >> ((byteOfs & 3u) << 3u)) & 0xffu; + value = (ZstdInCompressedData[byteOfs >> 2u] >> ((byteOfs & 3u) << 3u)) & 0xffu; } else { - ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = blockRef.offs; + value = blockRef.offs & 0xffu; // strip kzstdgpu_RLEBlock_OffsetFlag } + ZstdInOutUnCompressedFramesData[dstBlockOffset + byteIdx] = value; } \ No newline at end of file diff --git a/zstd/zstdgpu/Shaders/ZstdGpuParseFrames.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuParseFrames.hlsl index 34835b9..2426bf7 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuParseFrames.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuParseFrames.hlsl @@ -34,7 
+34,7 @@ ZSTDGPU_PARSE_FRAMES_SRT() #define __XBOX_ENABLE_WAVE32 1 #endif -[RootSignature("DescriptorTable(SRV(t0, numDescriptors=2), UAV(u0, numDescriptors=17)), RootConstants(b0, num32BitConstants=3)")] +[RootSignature("DescriptorTable(SRV(t0, numDescriptors=2), UAV(u0, numDescriptors=12)), RootConstants(b0, num32BitConstants=3)")] [numthreads(kzstdgpu_TgSizeX_ParseCompressedBlocks, 1, 1)] void main(uint i : SV_DispatchThreadId) { diff --git a/zstd/zstdgpu/Shaders/ZstdGpuUpdateDispatchArgs.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuUpdateDispatchArgs.hlsl index 275b8c7..e4e1f45 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuUpdateDispatchArgs.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuUpdateDispatchArgs.hlsl @@ -29,15 +29,14 @@ struct UpdateDispatchArgsConsts uint32_t stage; uint32_t cmpBlockCountMax; - uint32_t rawBlockCountMax; - uint32_t rleBlockCountMax; + uint32_t rrBlockCountMax; uint32_t litByteCountMax; uint32_t seqElemCountMax; }; ConstantBuffer Consts : register(b0); -[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), RootConstants(b0, num32BitConstants=7)")] +[RootSignature("UAV(u0), UAV(u1), UAV(u2), UAV(u3), RootConstants(b0, num32BitConstants=6)")] [numthreads(1, 1, 1)] void main() { @@ -45,18 +44,15 @@ void main() { // Block-count dependent slots (valid after Stage 0 ParseFrames :: Count Blocks) const uint32_t cmpBlockCount = ZstdCounters[0].Blocks_CMP; - const uint32_t rawBlockCount = ZstdCounters[0].Blocks_RAW; - const uint32_t rleBlockCount = ZstdCounters[0].Blocks_RLE; - const uint32_t allBlockCount = rawBlockCount - + rleBlockCount + const uint32_t rrBlockCount = ZstdCounters[0].Blocks_RR; + const uint32_t allBlockCount = rrBlockCount + cmpBlockCount; // the arguments dependent on block counts/sizes -- these could be computed after ParseFrames zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_ComputePrefixSum, cmpBlockCount, kzstdgpu_TgSizeX_PrefixSum_LiteralCount); zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, 
kzstdgpu_DispatchSlot_PrefixBlockSizes, allBlockCount, kzstdgpu_TgSizeX_PrefixSum); - zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemcpyRAW, ZstdCounters[0].BlocksBytes_RAW, kzstdgpu_TgSizeX_MemsetMemcpy); - zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemsetRLE, ZstdCounters[0].BlocksBytes_RLE, kzstdgpu_TgSizeX_MemsetMemcpy); + zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_MemsetMemcpyRR, ZstdCounters[0].BlocksBytes_RR, kzstdgpu_TgSizeX_MemsetMemcpy); zstdgpu_EmitDispatch(ZstdDispatchArgs, ZstdDispatchCnts, kzstdgpu_DispatchSlot_ParseCompressedBlocks, cmpBlockCount, kzstdgpu_TgSizeX_ParseCompressedBlocks); // Memset dispatch slots for InitResources Stage 1 @@ -67,8 +63,7 @@ void main() const uint32_t predicateMask = 0 | (cmpBlockCount > Consts.cmpBlockCountMax ? (1u << 0u) : 0u) - | (rawBlockCount > Consts.rawBlockCountMax ? (1u << 1u) : 0u) - | (rleBlockCount > Consts.rleBlockCountMax ? (1u << 2u) : 0u); + | (rrBlockCount > Consts.rrBlockCountMax ? 
(1u << 1u) : 0u); ZstdPredicate[0] = predicateMask; // lower 32-bits of Stage 1 predicate ZstdPredicate[2] = predicateMask; // lower 32-bits of Stage 2 predicate diff --git a/zstd/zstdgpu/zstdgpu.cpp b/zstd/zstdgpu/zstdgpu.cpp index 0adca3f..66fb0ff 100644 --- a/zstd/zstdgpu/zstdgpu.cpp +++ b/zstd/zstdgpu/zstdgpu.cpp @@ -108,8 +108,7 @@ struct zstdgpu_BlockInfo */ static inline void zstdgpu_ParseFrame(zstdgpu_FrameInfo *outFrameInfo, zstdgpu_BlockInfo *outBlockInfo, - zstdgpu_OffsetAndSize *outBlocksRAWRefs, - zstdgpu_OffsetAndSize *outBlocksRLERefs, + zstdgpu_OffsetAndSize *outBlocksRRRefs, zstdgpu_OffsetAndSize *outBlocksCMPRefs, zstdgpu_Forward_BitBuffer & bits) { @@ -336,32 +335,33 @@ static inline void zstdgpu_ParseFrame(zstdgpu_FrameInfo *outFrameInfo, } else if (/* RAW */ 0 == blockType) { - if (outBlocksRAWRefs) + if (outBlocksRRRefs) { - outBlocksRAWRefs[outFrameInfo->rawBlockStart].offs = blockBase; + outBlocksRRRefs[outFrameInfo->rrBlockStart].offs = blockBase; // `Raw_Block` - this is an uncompressed block. `Block_Content` contains `Block_Size` bytes. - outBlocksRAWRefs[outFrameInfo->rawBlockStart].size = blockSize; + outBlocksRRRefs[outFrameInfo->rrBlockStart].size = blockSize; } - outFrameInfo->rawBlockStart += 1; - outFrameInfo->rawBlockBytesStart += blockSize; + outFrameInfo->rrBlockStart += 1; + outFrameInfo->rrBlockBytesStart += blockSize; zstdgpu_Forward_BitBuffer_Skip(bits, blockSize); } else if (/* RLE */ 1 == blockType) { - if (outBlocksRLERefs) + if (outBlocksRRRefs) { - outBlocksRLERefs[outFrameInfo->rleBlockStart].offs = zstdgpu_Forward_BitBuffer_Get(bits, 8); + outBlocksRRRefs[outFrameInfo->rrBlockStart].offs = + zstdgpu_Forward_BitBuffer_Get(bits, 8) | kzstdgpu_RLEBlock_OffsetFlag; // `RLE_Block` - this is a single byte, repeated `Block_Size` times. `Block_Content` consists of a single byte. // On the decompression side, this byte must be repeated `Block_Size` times. 
- outBlocksRLERefs[outFrameInfo->rleBlockStart].size = blockSize; + outBlocksRRRefs[outFrameInfo->rrBlockStart].size = blockSize; } else { zstdgpu_Forward_BitBuffer_Skip(bits, 1); } - outFrameInfo->rleBlockStart += 1; - outFrameInfo->rleBlockBytesStart += blockSize; + outFrameInfo->rrBlockStart += 1; + outFrameInfo->rrBlockBytesStart += blockSize; } else { @@ -380,8 +380,7 @@ static inline void zstdgpu_ParseFrame(zstdgpu_FrameInfo *outFrameInfo, void zstdgpu_CountFramesAndBlocks(zstdgpu_CountFramesAndBlocksInfo *outInfo, const void *memoryBlock, uint32_t memoryBlockSizeInBytes, uint32_t contentSizeInBytes) { - outInfo->rawBlockCount = 0; - outInfo->rleBlockCount = 0; + outInfo->rrBlockCount = 0; outInfo->cmpBlockCount = 0; outInfo->frameCount = 0; outInfo->frameByteCount = 0; @@ -407,12 +406,11 @@ void zstdgpu_CountFramesAndBlocks(zstdgpu_CountFramesAndBlocksInfo *outInfo, con if (magic == 0xFD2FB528U) { zstdgpu_FrameInfo frameInfo = {}; - zstdgpu_ParseFrame(&frameInfo, NULL, NULL, NULL, NULL, bits); + zstdgpu_ParseFrame(&frameInfo, NULL, NULL, NULL, bits); byteOfs = zstdgpu_Forward_BitBuffer_GetByteOffset(bits); - outInfo->rawBlockCount += frameInfo.rawBlockStart; - outInfo->rleBlockCount += frameInfo.rleBlockStart; + outInfo->rrBlockCount += frameInfo.rrBlockStart; outInfo->cmpBlockCount += frameInfo.cmpBlockStart; outInfo->frameCount += 1u; outInfo->frameByteCount += frameInfo.uncompSize; @@ -451,21 +449,17 @@ void zstdgpu_CollectFrames(zstdgpu_OffsetAndSize *outFrames, zstdgpu_FrameInfo * outFrames[frameId].offs = byteOfs; // store prefix - outFrameInfos[frameId].rawBlockStart = frameInfo.rawBlockStart; - outFrameInfos[frameId].rleBlockStart = frameInfo.rleBlockStart; + outFrameInfos[frameId].rrBlockStart = frameInfo.rrBlockStart; outFrameInfos[frameId].cmpBlockStart = frameInfo.cmpBlockStart; - outFrameInfos[frameId].rawBlockBytesStart = frameInfo.rawBlockBytesStart; - outFrameInfos[frameId].rleBlockBytesStart = frameInfo.rleBlockBytesStart; - - 
frameInfo.windowSize = 0; - frameInfo.uncompSize = 0; - frameInfo.dictionary = 0; - frameInfo.rawBlockStart = 0; - frameInfo.rleBlockStart = 0; - frameInfo.cmpBlockStart = 0; - frameInfo.rawBlockBytesStart = 0; - frameInfo.rleBlockBytesStart = 0; - zstdgpu_ParseFrame(&frameInfo, NULL, NULL, NULL, NULL, bits); + outFrameInfos[frameId].rrBlockBytesStart = frameInfo.rrBlockBytesStart; + + frameInfo.windowSize = 0; + frameInfo.uncompSize = 0; + frameInfo.dictionary = 0; + frameInfo.rrBlockStart = 0; + frameInfo.cmpBlockStart = 0; + frameInfo.rrBlockBytesStart = 0; + zstdgpu_ParseFrame(&frameInfo, NULL, NULL, NULL, bits); // store just retrieved data outFrameInfos[frameId].windowSize = frameInfo.windowSize; @@ -476,11 +470,9 @@ void zstdgpu_CollectFrames(zstdgpu_OffsetAndSize *outFrames, zstdgpu_FrameInfo * outFrames[frameId].size = byteOfs - outFrames[frameId].offs; // accumulate previous prefix onto current frame's block counts - frameInfo.rawBlockStart += outFrameInfos[frameId].rawBlockStart; - frameInfo.rleBlockStart += outFrameInfos[frameId].rleBlockStart; - frameInfo.cmpBlockStart += outFrameInfos[frameId].cmpBlockStart; - frameInfo.rawBlockBytesStart += outFrameInfos[frameId].rawBlockBytesStart; - frameInfo.rleBlockBytesStart += outFrameInfos[frameId].rleBlockBytesStart; + frameInfo.rrBlockStart += outFrameInfos[frameId].rrBlockStart; + frameInfo.cmpBlockStart += outFrameInfos[frameId].cmpBlockStart; + frameInfo.rrBlockBytesStart += outFrameInfos[frameId].rrBlockBytesStart; } else { @@ -489,7 +481,7 @@ void zstdgpu_CollectFrames(zstdgpu_OffsetAndSize *outFrames, zstdgpu_FrameInfo * } } -void zstdgpu_CollectBlocks(zstdgpu_OffsetAndSize *outBlocksRaw, zstdgpu_OffsetAndSize *outBlocksRLE, zstdgpu_OffsetAndSize *outBlocksCmp, const zstdgpu_OffsetAndSize *frames, const zstdgpu_FrameInfo *frameInfos, uint32_t frameIndex, uint32_t frameCount, const void *memoryBlock, uint32_t memoryBlockSizeInBytes, uint32_t contentSizeInBytes) +void 
zstdgpu_CollectBlocks(zstdgpu_OffsetAndSize *outBlocksRR, zstdgpu_OffsetAndSize *outBlocksCmp, const zstdgpu_OffsetAndSize *frames, const zstdgpu_FrameInfo *frameInfos, uint32_t frameIndex, uint32_t frameCount, const void *memoryBlock, uint32_t memoryBlockSizeInBytes, uint32_t contentSizeInBytes) { uint32_t byteOfs = 0; @@ -510,13 +502,12 @@ void zstdgpu_CollectBlocks(zstdgpu_OffsetAndSize *outBlocksRaw, zstdgpu_OffsetAn { zstdgpu_FrameInfo frameInfo = {}; - const uint32_t rawBlockStart = frameInfos[frameIndex].rawBlockStart; - const uint32_t rleBlockStart = frameInfos[frameIndex].rleBlockStart; + const uint32_t rrBlockStart = frameInfos[frameIndex].rrBlockStart; const uint32_t cmpBlockStart = frameInfos[frameIndex].cmpBlockStart; const uint32_t byteEnd = frameIndex < frameCount - 1u ? frames[frameIndex + 1u].offs : contentSizeInBytes; - zstdgpu_ParseFrame(&frameInfo, NULL, &outBlocksRaw[rawBlockStart], &outBlocksRLE[rleBlockStart], &outBlocksCmp[cmpBlockStart], bits); + zstdgpu_ParseFrame(&frameInfo, NULL, &outBlocksRR[rrBlockStart], &outBlocksCmp[cmpBlockStart], bits); byteOfs = zstdgpu_Forward_BitBuffer_GetByteOffset(bits); ZSTDGPU_ASSERT(byteOfs == byteEnd); @@ -550,7 +541,7 @@ void zstdgpu_CountCompressedLiteralsAndSequences(zstdgpu_CountLiteralAndSequence zstdgpu_FrameInfo frameInfo = {}; zstdgpu_BlockInfo blockInfo = {}; - zstdgpu_ParseFrame(&frameInfo, &blockInfo, NULL, NULL, NULL, bits); + zstdgpu_ParseFrame(&frameInfo, &blockInfo, NULL, NULL, bits); byteOfs = zstdgpu_Forward_BitBuffer_GetByteOffset(bits); ZSTDGPU_ASSERT(byteOfs == frames[frameIdx].offs + frames[frameIdx].size); @@ -927,8 +918,7 @@ struct zstdgpu_PerRequestContextImpl * memory until Zstd frames are parsed and the number of blocks read back to CPU * either through `zstdgpu_SubmitWithInteralMemory` or `zstdgpu_GetGpuMemoryRequirement` */ - uint32_t zstdRawBlockCountMax; - uint32_t zstdRleBlockCountMax; + uint32_t zstdRRBlockCountMax; uint32_t zstdCmpBlockCountMax; uint32_t 
zstdUncompressedLitByteCountMax; @@ -1172,8 +1162,7 @@ ZSTDGPU_ENUM(Status) zstdgpu_CreatePerRequestContext(zstdgpu_PerRequestContext * context->zstdUncompressedFrameCount = 0; context->zstdUncompressedFramesByteCount = 0; - context->zstdRawBlockCountMax = 0; - context->zstdRleBlockCountMax = 0; + context->zstdRRBlockCountMax = 0; context->zstdCmpBlockCountMax = 0; context->zstdUncompressedLitByteCountMax = 0; context->zstdUncompressedSeqElemCountMax = 0; @@ -1334,17 +1323,16 @@ ZSTDGPU_API ZSTDGPU_ENUM(Status) zstdgpu_SetupOutputs(zstdgpu_PerRequestContext return ZSTDGPU_ENUM_CONST(StatusInvalidArgument); } -ZSTDGPU_ENUM(Status) zstdgpu_SetupFrameInfoConstants(zstdgpu_PerRequestContext inPerRequestContext, uint32_t rawBlockCount, uint32_t rleBlockCount, uint32_t cmpBlockCount) +ZSTDGPU_ENUM(Status) zstdgpu_SetupFrameInfoConstants(zstdgpu_PerRequestContext inPerRequestContext, uint32_t rrBlockCount, uint32_t cmpBlockCount) { uint32_t proceed = 1; proceed = proceed && (inPerRequestContext->thisMemoryBlock == (void *)inPerRequestContext); - proceed = proceed && (0 != rawBlockCount + rleBlockCount + cmpBlockCount); + proceed = proceed && (0 != rrBlockCount + cmpBlockCount); ZSTDGPU_ASSERT(proceed > 0); if (proceed) { - inPerRequestContext->zstdRawBlockCountMax = rawBlockCount; - inPerRequestContext->zstdRleBlockCountMax = rleBlockCount; + inPerRequestContext->zstdRRBlockCountMax = rrBlockCount; inPerRequestContext->zstdCmpBlockCountMax = cmpBlockCount; inPerRequestContext->setupFlags |= kzstdgpu_SetupFlags_HasFrameInfoConstants; return ZSTDGPU_ENUM_CONST(StatusSuccess); @@ -1407,27 +1395,24 @@ ZSTDGPU_ENUM(Status) zstdgpu_GetGpuMemoryRequirement(uint32_t *outDefaultHeapByt } else if (stageIndex == 1) { - uint32_t cntRaw, cntRle, cntCmp; + uint32_t cntRR, cntCmp; if (zstdgpu_HasFlag(req->setupFlags, kzstdgpu_SetupFlags_HasFrameInfoConstants)) { - cntRaw = req->zstdRawBlockCountMax; - cntRle = req->zstdRleBlockCountMax; + cntRR = req->zstdRRBlockCountMax; cntCmp = 
req->zstdCmpBlockCountMax; } else { - cntRaw = CNTRS(Blocks_RAW); - cntRle = CNTRS(Blocks_RLE); + cntRR = CNTRS(Blocks_RR); cntCmp = CNTRS(Blocks_CMP); - req->zstdRawBlockCountMax = cntRaw; - req->zstdRleBlockCountMax = cntRle; + req->zstdRRBlockCountMax = cntRR; req->zstdCmpBlockCountMax = cntCmp; } - ZSTDGPU_ASSERT(0 != cntRaw + cntRle + cntCmp); + ZSTDGPU_ASSERT(0 != cntRR + cntCmp); - zstdgpu_ResourceInfo_Stage_1_Init(&req->resInfo, cntRaw, cntRle, cntCmp); + zstdgpu_ResourceInfo_Stage_1_Init(&req->resInfo, cntRR, cntCmp); } else if (stageIndex == 2) @@ -1592,27 +1577,24 @@ ZSTDGPU_ENUM(Status) zstdgpu_SubmitWithInteralMemory(zstdgpu_PerRequestContext r } else if (stageIndex == 1) { - uint32_t cntRaw, cntRle, cntCmp; + uint32_t cntRR, cntCmp; if (zstdgpu_HasFlag(req->setupFlags, kzstdgpu_SetupFlags_HasFrameInfoConstants)) { - cntRaw = req->zstdRawBlockCountMax; - cntRle = req->zstdRleBlockCountMax; + cntRR = req->zstdRRBlockCountMax; cntCmp = req->zstdCmpBlockCountMax; } else { - cntRaw = CNTRS(Blocks_RAW); - cntRle = CNTRS(Blocks_RLE); + cntRR = CNTRS(Blocks_RR); cntCmp = CNTRS(Blocks_CMP); - req->zstdRawBlockCountMax = cntRaw; - req->zstdRleBlockCountMax = cntRle; + req->zstdRRBlockCountMax = cntRR; req->zstdCmpBlockCountMax = cntCmp; } - ZSTDGPU_ASSERT(0 != cntRaw + cntRle + cntCmp); + ZSTDGPU_ASSERT(0 != cntRR + cntCmp); - zstdgpu_ResourceInfo_Stage_1_Init(&req->resInfo, cntRaw, cntRle, cntCmp); + zstdgpu_ResourceInfo_Stage_1_Init(&req->resInfo, cntRR, cntCmp); } else if (stageIndex == 2) { @@ -1897,17 +1879,13 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->SetComputeRoot32BitConstant(1, lookbackCount /* workItemCount */, 1); cmdList->SetComputeRoot32BitConstant(1, 0 /* value */, 2); - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRAW->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); - cmdList->Dispatch(tgCount, 1, 1); - 
cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRLE->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); + cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRR->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCount, 1, 1); cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountCMP->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCount, 1, 1); cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountAll->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCount, 1, 1); - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRAW->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); - cmdList->Dispatch(tgCount, 1, 1); - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRLE->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); + cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRR->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCount, 1, 1); PIXEndEvent(cmdList); @@ -1915,7 +1893,7 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi { PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"Barrier for [Parse Frames :: Count Blocks]"); - D3D12_RESOURCE_BARRIER barriers[8]; + D3D12_RESOURCE_BARRIER barriers[6]; uint32_t bc = 0; // last written by [Init Resources :: Stage 0] @@ -1926,12 +1904,10 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameSeqStreamMinIdx); // last written by [InitResources :: Memset :: Stage 0] // next written by [Parse Frames :: Block Counts] - setResourceUavSync(barriers, bc ++, 
req->resData.gpuOnly.PerFrameBlockCountRAW); - setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockCountRLE); + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockCountRR); setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockCountCMP); setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockCountAll); - setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockSizesRAW); - setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockSizesRLE); + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerFrameBlockSizesRR); ZSTDGPU_ASSERT(bc == _countof(barriers)); cmdList->ResourceBarrier(bc, barriers); @@ -1952,19 +1928,17 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi { PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"Barrier for [PrefixSum :: Block Counts]"); - D3D12_RESOURCE_BARRIER barriers[7]; + D3D12_RESOURCE_BARRIER barriers[5]; // next written/atomically updated by [Parse Frames :: Count Blocks] // next written by [PrefixSum :: Block Counts] to store prefix sum instead of counts - setResourceUavSync(barriers, 0, req->resData.gpuOnly.PerFrameBlockCountRAW); - setResourceUavSync(barriers, 1, req->resData.gpuOnly.PerFrameBlockCountRLE); - setResourceUavSync(barriers, 2, req->resData.gpuOnly.PerFrameBlockCountCMP); - setResourceUavSync(barriers, 3, req->resData.gpuOnly.PerFrameBlockCountAll); - setResourceUavSync(barriers, 4, req->resData.gpuOnly.PerFrameBlockSizesRAW); - setResourceUavSync(barriers, 5, req->resData.gpuOnly.PerFrameBlockSizesRLE); + setResourceUavSync(barriers, 0, req->resData.gpuOnly.PerFrameBlockCountRR); + setResourceUavSync(barriers, 1, req->resData.gpuOnly.PerFrameBlockCountCMP); + setResourceUavSync(barriers, 2, req->resData.gpuOnly.PerFrameBlockCountAll); + setResourceUavSync(barriers, 3, req->resData.gpuOnly.PerFrameBlockSizesRR); // last written by [Parse Frames :: Count Blocks] // next read by [Update Dispatch 
Args :: Stage 0] as RWStructuredBuffer (read-only) // NOTE: stays in UNORDERED_ACCESS because UpdateDispatchArgs binds Counters as UAV - setResourceUavSync(barriers, 6, req->resData.gpuOnly.Counters); + setResourceUavSync(barriers, 4, req->resData.gpuOnly.Counters); cmdList->ResourceBarrier(_countof(barriers), barriers); PIXEndEvent(cmdList); } @@ -1984,12 +1958,8 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi ZSTDGPU_KERNEL_SCOPE(PrefixSum, cmdList, { - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRAW->GetGPUVirtualAddress()); - cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockCountRAW->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); - cmdList->Dispatch(tgCountX, 1, 1); - - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRLE->GetGPUVirtualAddress()); - cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockCountRLE->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); + cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountRR->GetGPUVirtualAddress()); + cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockCountRR->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCountX, 1, 1); cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockCountCMP->GetGPUVirtualAddress()); @@ -2000,12 +1970,8 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockCountAll->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCountX, 1, 1); - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRAW->GetGPUVirtualAddress()); - cmdList->SetComputeRootUnorderedAccessView(1, 
req->resData.gpuOnly.PerFrameBlockSizesRAW->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); - cmdList->Dispatch(tgCountX, 1, 1); - - cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRLE->GetGPUVirtualAddress()); - cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockSizesRLE->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); + cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.PerFrameBlockSizesRR->GetGPUVirtualAddress()); + cmdList->SetComputeRootUnorderedAccessView(1, req->resData.gpuOnly.PerFrameBlockSizesRR->GetGPUVirtualAddress() + req->zstdFrameCount * sizeof(uint32_t)); cmdList->Dispatch(tgCountX, 1, 1); }); @@ -2022,10 +1988,9 @@ void zstdgpu_SubmitStage0(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->SetComputeRoot32BitConstant(4, 0 /* stage */, 1); cmdList->SetComputeRoot32BitConstant(4, req->zstdCmpBlockCountMax, 2); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRawBlockCountMax, 3); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRleBlockCountMax, 4); + cmdList->SetComputeRoot32BitConstant(4, req->zstdRRBlockCountMax, 3); + cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 0*/, 4); cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 0*/, 5); - cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 0*/, 6); ZSTDGPU_KERNEL_SCOPE(UpdateDispatchArgs_Stage0, cmdList, cmdList->Dispatch(1, 1, 1); ); @@ -2091,8 +2056,7 @@ void zstdgpu_SubmitStage1(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi } { const uint32_t initResourcesStage = 1; - const uint32_t allBlockCount = req->zstdRawBlockCountMax - + req->zstdRleBlockCountMax + const uint32_t allBlockCount = req->zstdRRBlockCountMax + req->zstdCmpBlockCountMax; PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"[Init Resources :: Stage 1]"); @@ -2139,8 +2103,7 @@ void zstdgpu_SubmitStage1(zstdgpu_PerRequestContext req, 
ID3D12GraphicsCommandLi // Group 4: BlockSizePrefix lookback (allBlockCount-sized) { - const uint32_t allBlockCount = req->zstdRawBlockCountMax - + req->zstdRleBlockCountMax + const uint32_t allBlockCount = req->zstdRRBlockCountMax + req->zstdCmpBlockCountMax; cmdList->SetComputeRootUnorderedAccessView(0, req->resData.gpuOnly.BlockSizePrefix->GetGPUVirtualAddress() + allBlockCount * sizeof(uint32_t)); } @@ -2172,7 +2135,7 @@ void zstdgpu_SubmitStage1(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi } { PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"Barrier with Resources for [Parse Compressed Blocks] and [Memcpy RAW blocks, Memset RLE blocks]"); - D3D12_RESOURCE_BARRIER barriers[15]; + D3D12_RESOURCE_BARRIER barriers[14]; uint32_t bc = 0; if (req->zstdCmpBlockCountMax > 0) @@ -2226,14 +2189,9 @@ void zstdgpu_SubmitStage1(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi // last written by [Parse Frames :: Collect Blocks] // next read by [Memcpy RAW blocks, Memset RLE blocks] - if (req->zstdRawBlockCountMax > 0) + if (req->zstdRRBlockCountMax > 0) { - setResourceUavToSrvSync(barriers, bc ++, req->resData.gpuOnly.BlocksRAWRefs); - } - - if (req->zstdRleBlockCountMax > 0) - { - setResourceUavToSrvSync(barriers, bc ++, req->resData.gpuOnly.BlocksRLERefs); + setResourceUavToSrvSync(barriers, bc ++, req->resData.gpuOnly.BlocksRRRefs); } cmdList->ResourceBarrier(bc, barriers); @@ -2314,10 +2272,9 @@ void zstdgpu_SubmitStage1(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->SetComputeRoot32BitConstant(4, 1 /* stage */, 1); cmdList->SetComputeRoot32BitConstant(4, req->zstdCmpBlockCountMax, 2); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRawBlockCountMax, 3); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRleBlockCountMax, 4); - cmdList->SetComputeRoot32BitConstant(4, req->zstdUncompressedLitByteCountMax, 5); - cmdList->SetComputeRoot32BitConstant(4, req->zstdUncompressedSeqElemCountMax, 6); + cmdList->SetComputeRoot32BitConstant(4, 
req->zstdRRBlockCountMax, 3); + cmdList->SetComputeRoot32BitConstant(4, req->zstdUncompressedLitByteCountMax, 4); + cmdList->SetComputeRoot32BitConstant(4, req->zstdUncompressedSeqElemCountMax, 5); ZSTDGPU_KERNEL_SCOPE(UpdateDispatchArgs, cmdList, cmdList->Dispatch(1, 1, 1); ); @@ -2420,10 +2377,9 @@ void zstdgpu_SubmitStage2(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->SetComputeRoot32BitConstant(4, 2 /* stage */, 1); cmdList->SetComputeRoot32BitConstant(4, req->zstdCmpBlockCountMax, 2); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRawBlockCountMax, 3); - cmdList->SetComputeRoot32BitConstant(4, req->zstdRleBlockCountMax, 4); + cmdList->SetComputeRoot32BitConstant(4, req->zstdRRBlockCountMax, 3); + cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 2 */, 4); cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 2 */, 5); - cmdList->SetComputeRoot32BitConstant(4, 0 /* unused for stage == 2 */, 6); ZSTDGPU_KERNEL_SCOPE(UpdateDispatchArgs_DecompressLiterals, cmdList, cmdList->Dispatch(1, 1, 1); ); @@ -2685,33 +2641,34 @@ void zstdgpu_SubmitStage2(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi uint32_t bc = 0; // last written/updated by [Decompress Sequences] // next written/updated by [Prefix Block Sizes] - setResourceUavSync(barriers, bc + 0, req->resData.gpuOnly.BlockSizePrefix); + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.BlockSizePrefix); // last written by [Decompress Sequences] // next read/written by [Prefix Sequence Offsets] - setResourceUavSync(barriers, bc + 1, req->resData.gpuOnly.PerSeqStreamFinalOffset1); - setResourceUavSync(barriers, bc + 2, req->resData.gpuOnly.PerSeqStreamFinalOffset2); - setResourceUavSync(barriers, bc + 3, req->resData.gpuOnly.PerSeqStreamFinalOffset3); - // last written/updated by [Decompress Sequences] - // next written/updated by [Finalise Sequence Offsets] - setResourceUavSync(barriers, bc + 4, req->resData.gpuOnly.DecompressedSequenceOffs); - // last 
written/updated by [Decompress Sequences] - // next read by [Execute Sequences] - setResourceUavToSrvSync(barriers, bc + 5, req->resData.gpuOnly.DecompressedSequenceLLen); - setResourceUavToSrvSync(barriers, bc + 6, req->resData.gpuOnly.DecompressedSequenceMLen); - bc += 7; + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerSeqStreamFinalOffset1); + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerSeqStreamFinalOffset2); + setResourceUavSync(barriers, bc ++, req->resData.gpuOnly.PerSeqStreamFinalOffset3); + // not all CMP blocks have sequences + if (req->resData.gpuOnly.DecompressedSequenceOffs != nullptr) + { + // last written/updated by [Decompress Sequences] + // next written/updated by [Finalise Sequence Offsets] + setResourceUavSync(barriers, bc++, req->resData.gpuOnly.DecompressedSequenceOffs); + // last written/updated by [Decompress Sequences] + // next read by [Execute Sequences] + setResourceUavToSrvSync(barriers, bc++, req->resData.gpuOnly.DecompressedSequenceLLen); + setResourceUavToSrvSync(barriers, bc++, req->resData.gpuOnly.DecompressedSequenceMLen); + } // last written/updated by [Init Huffman Table and Decompress Literals] // next read by [Execute Sequences] if (req->zstdUncompressedLitByteCountMax > 0) { - setResourceUavToSrvSync(barriers, bc + 0, req->resData.gpuOnly.DecompressedLiterals); - bc += 1; + setResourceUavToSrvSync(barriers, bc ++, req->resData.gpuOnly.DecompressedLiterals); } cmdList->ResourceBarrier(bc, barriers); PIXEndEvent(cmdList); } { - const uint32_t allBlockCount = req->zstdRawBlockCountMax - + req->zstdRleBlockCountMax + const uint32_t allBlockCount = req->zstdRRBlockCountMax + req->zstdCmpBlockCountMax; PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"[Prefix Block Sizes]"); @@ -2785,7 +2742,7 @@ void zstdgpu_SubmitStage2(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi PIXEndEvent(cmdList); } - if (req->zstdCmpBlockCountMax > 0) + if (req->zstdCmpBlockCountMax > 0 && 
req->resData.gpuOnly.DecompressedSequenceOffs != nullptr) // not all CMP blocks have sequences { PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"Barrier with Resources for [Memcpy RAW blocks, Memset RLE blocks] and [Execute Sequences]"); D3D12_RESOURCE_BARRIER barriers[1]; @@ -2795,39 +2752,25 @@ void zstdgpu_SubmitStage2(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi cmdList->ResourceBarrier(_countof(barriers), barriers); PIXEndEvent(cmdList); } - if (req->zstdRawBlockCountMax > 0 || req->zstdRleBlockCountMax > 0) + if (req->zstdRRBlockCountMax > 0) { PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"[Memcpy RAW blocks, Memset RLE blocks]"); d3d12aid_ComputeRsPs_Set(&req->MemsetMemcpy, cmdList); cmdList->SetDescriptorHeaps(1, &req->srts.heap); cmdList->SetComputeRootDescriptorTable(0, req->srts.MemsetMemcpyGpuHandle); - ZSTDGPU_KERNEL_SCOPE(MemcpyRAW_MemsetRLE, cmdList, + + ZSTDGPU_KERNEL_SCOPE(MemsetMemcpy, cmdList, { - if (req->zstdRawBlockCountMax > 0) - { - cmdList->SetComputeRootShaderResourceView(1, req->resData.gpuOnly.RawBlockSizePrefix->GetGPUVirtualAddress()); - cmdList->SetComputeRootShaderResourceView(2, req->resData.gpuOnly.PerFrameBlockSizesRAW->GetGPUVirtualAddress()); - cmdList->SetComputeRootShaderResourceView(3, req->resData.gpuOnly.BlocksRAWRefs->GetGPUVirtualAddress()); - cmdList->SetComputeRootShaderResourceView(4, req->resData.gpuOnly.GlobalBlockIndexPerRawBlock->GetGPUVirtualAddress()); - // NOTE: Slots 0 (tgOffset) and 1 (workItemCount) are set by command signature via indirect dispatch - cmdList->SetComputeRoot32BitConstant(5, req->zstdRawBlockCountMax, 2); - cmdList->SetComputeRoot32BitConstant(5, req->zstdFrameCount, 3); - cmdList->SetComputeRoot32BitConstant(5, 1 /* flags */, 4); - zstdgpu_DispatchIndirect(cmdList, MemsetMemcpy, MemcpyRAW); - } - if (req->zstdRleBlockCountMax > 0) - { - cmdList->SetComputeRootShaderResourceView(1, req->resData.gpuOnly.RleBlockSizePrefix->GetGPUVirtualAddress()); - 
cmdList->SetComputeRootShaderResourceView(2, req->resData.gpuOnly.PerFrameBlockSizesRLE->GetGPUVirtualAddress()); - cmdList->SetComputeRootShaderResourceView(3, req->resData.gpuOnly.BlocksRLERefs->GetGPUVirtualAddress()); - cmdList->SetComputeRootShaderResourceView(4, req->resData.gpuOnly.GlobalBlockIndexPerRleBlock->GetGPUVirtualAddress()); - // NOTE: Slots 0 (tgOffset) and 1 (workItemCount) are set by command signature via indirect dispatch - cmdList->SetComputeRoot32BitConstant(5, req->zstdRleBlockCountMax, 2); - cmdList->SetComputeRoot32BitConstant(5, req->zstdFrameCount, 3); - cmdList->SetComputeRoot32BitConstant(5, 0 /* flags */, 4); - zstdgpu_DispatchIndirect(cmdList, MemsetMemcpy, MemsetRLE); - } + cmdList->SetComputeRootShaderResourceView(1, req->resData.gpuOnly.RRBlockSizePrefix->GetGPUVirtualAddress()); + cmdList->SetComputeRootShaderResourceView(2, req->resData.gpuOnly.PerFrameBlockSizesRR->GetGPUVirtualAddress()); + cmdList->SetComputeRootShaderResourceView(3, req->resData.gpuOnly.BlocksRRRefs->GetGPUVirtualAddress()); + cmdList->SetComputeRootShaderResourceView(4, req->resData.gpuOnly.GlobalBlockIndexPerRRBlock->GetGPUVirtualAddress()); + // NOTE: Slots 0 (tgOffset) and 1 (workItemCount) are set by command signature via indirect dispatch + cmdList->SetComputeRoot32BitConstant(5, req->zstdRRBlockCountMax, 2); + cmdList->SetComputeRoot32BitConstant(5, req->zstdFrameCount, 3); + zstdgpu_DispatchIndirect(cmdList, MemsetMemcpy, MemsetMemcpyRR); }); + PIXEndEvent(cmdList); } if (req->zstdCmpBlockCountMax > 0) @@ -2835,7 +2778,7 @@ void zstdgpu_SubmitStage2(zstdgpu_PerRequestContext req, ID3D12GraphicsCommandLi PIXBeginEvent(cmdList, PIX_COLOR_DEFAULT, L"Barrier with Resources for [Execute Sequences]"); D3D12_RESOURCE_BARRIER barriers[2]; uint32_t bc = 0; - if (req->zstdRawBlockCountMax > 0 || req->zstdRleBlockCountMax > 0) + if (req->zstdRRBlockCountMax > 0) { // in case if the number of RAW+RLE blocks > 0, [Memcpy RAW blocks, Memset RLE blocks] has written 
to 'UnCompressedFramesData' // next read by [Execute Sequences] @@ -2894,34 +2837,26 @@ ZSTDGPU_API void zstdgpu_ReadbackGpuResults(zstdgpu_PerRequestContext req, ID3D1 // Read-only resource from the last stage (== 2) get a NON_PS_RESOURCE state as a result of promotion from COMMON state // (which happens in case if the stage prior to it (==1) is submitted in a separate CommandList/ExecuteCommandList) // and then used as COPY_SOURCE for debug readback - D3D12_RESOURCE_BARRIER barriers[13]; + D3D12_RESOURCE_BARRIER barriers[9]; uint32_t bc = 0; if (zstdgpu_IsReadbackRequired(req, 1)) { setResourceState(barriers, 0, req->resData.gpuOnly.PerFrameBlockCountCMP, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); setResourceState(barriers, 1, req->resData.gpuOnly.PerFrameBlockCountAll, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, 2, req->resData.gpuOnly.PerFrameBlockSizesRAW, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, 3, req->resData.gpuOnly.PerFrameBlockSizesRLE, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, 4, req->resData.gpuOnly.PerFrameSeqStreamMinIdx, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - bc += 5; + setResourceState(barriers, 2, req->resData.gpuOnly.PerFrameBlockSizesRR, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); + setResourceState(barriers, 3, req->resData.gpuOnly.PerFrameSeqStreamMinIdx, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); + bc += 4; if (req->zstdCmpBlockCountMax > 0) { setResourceState(barriers, bc + 0, req->resData.gpuOnly.GlobalBlockIndexPerCmpBlock, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); setResourceState(barriers, bc + 1, req->resData.gpuOnly.PerSeqStreamSeqStart, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); bc += 2; } - if (req->zstdRawBlockCountMax > 0) - { - setResourceState(barriers, bc + 0, req->resData.gpuOnly.GlobalBlockIndexPerRawBlock, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, bc + 1, req->resData.gpuOnly.RawBlockSizePrefix, NON_PIXEL_SHADER_RESOURCE, 
COPY_SOURCE); - setResourceState(barriers, bc + 2, req->resData.gpuOnly.BlocksRAWRefs, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - bc += 3; - } - if (req->zstdRleBlockCountMax > 0) + if (req->zstdRRBlockCountMax > 0) { - setResourceState(barriers, bc + 0, req->resData.gpuOnly.GlobalBlockIndexPerRleBlock, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, bc + 1, req->resData.gpuOnly.RleBlockSizePrefix, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); - setResourceState(barriers, bc + 2, req->resData.gpuOnly.BlocksRLERefs, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); + setResourceState(barriers, bc + 0, req->resData.gpuOnly.GlobalBlockIndexPerRRBlock, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); + setResourceState(barriers, bc + 1, req->resData.gpuOnly.RRBlockSizePrefix, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); + setResourceState(barriers, bc + 2, req->resData.gpuOnly.BlocksRRRefs, NON_PIXEL_SHADER_RESOURCE, COPY_SOURCE); bc += 3; } } @@ -2943,8 +2878,7 @@ ZSTDGPU_API void zstdgpu_RetrieveGpuResults(zstdgpu_ResourceDataCpu *outGpuResou zstdgpu_ResourceDataCpu_InitFromResourceDataGpu(outGpuResources, &req->resData); // HACK (pamartis): we copy block counts for now, because validation needs it, but we never compute block count on GPU - outGpuResources->Counters->Blocks_RAW = req->zstdRawBlockCountMax; - outGpuResources->Counters->Blocks_RLE = req->zstdRleBlockCountMax; + outGpuResources->Counters->Blocks_RR = req->zstdRRBlockCountMax; outGpuResources->Counters->Blocks_CMP = req->zstdCmpBlockCountMax; } diff --git a/zstd/zstdgpu/zstdgpu.h b/zstd/zstdgpu/zstdgpu.h index 4ff36b8..95fa57d 100644 --- a/zstd/zstdgpu/zstdgpu.h +++ b/zstd/zstdgpu/zstdgpu.h @@ -24,8 +24,7 @@ struct zstdgpu_CountFramesAndBlocksInfo { - uint32_t rawBlockCount; - uint32_t rleBlockCount; + uint32_t rrBlockCount; uint32_t cmpBlockCount; uint32_t frameCount; uint64_t frameByteCount; @@ -92,7 +91,7 @@ ZSTDGPU_API void zstdgpu_CollectFrames(zstdgpu_OffsetAndSize *outFrames, zstdgpu * NB2: For RLE 
block `zstdgpu_OffsetAndSize::offs` stores the actual 8-bit symbol. At the same time `zstdgpu_OffsetAndSize::size` * stores the number of times the symbol has to be repeated in the decompressed stream. */ -ZSTDGPU_API void zstdgpu_CollectBlocks(zstdgpu_OffsetAndSize *outBlocksRaw, zstdgpu_OffsetAndSize *outBlocksRLE, zstdgpu_OffsetAndSize *outBlocksCmp, const zstdgpu_OffsetAndSize *frames, const zstdgpu_FrameInfo *frameInfos, uint32_t frameIndex, uint32_t frameCount, const void *memoryBlock, uint32_t memoryBlockSizeInBytes, uint32_t contentSizeInBytes); +ZSTDGPU_API void zstdgpu_CollectBlocks(zstdgpu_OffsetAndSize *outBlocksRR, zstdgpu_OffsetAndSize *outBlocksCmp, const zstdgpu_OffsetAndSize *frames, const zstdgpu_FrameInfo *frameInfos, uint32_t frameIndex, uint32_t frameCount, const void *memoryBlock, uint32_t memoryBlockSizeInBytes, uint32_t contentSizeInBytes); struct zstdgpu_CountLiteralAndSequenceInfo { @@ -195,7 +194,7 @@ ZSTDGPU_API zstdgpu_Status zstdgpu_SetupOutputs(zstdgpu_PerRequestContext inPerR * the same command list without a CPU fence. * Can be called before or after `zstdgpu_SetupInputs*` functions. */ -ZSTDGPU_API zstdgpu_Status zstdgpu_SetupFrameInfoConstants(zstdgpu_PerRequestContext inPerRequestContext, uint32_t rawBlockCount, uint32_t rleBlockCount, uint32_t cmpBlockCount); +ZSTDGPU_API zstdgpu_Status zstdgpu_SetupFrameInfoConstants(zstdgpu_PerRequestContext inPerRequestContext, uint32_t rrBlockCount, uint32_t cmpBlockCount); /** * @brief Specifies the total decoded literal byte count and sequence count. 
diff --git a/zstd/zstdgpu/zstdgpu_reference_store.cpp b/zstd/zstdgpu/zstdgpu_reference_store.cpp index e86a9c6..54f192d 100644 --- a/zstd/zstdgpu/zstdgpu_reference_store.cpp +++ b/zstd/zstdgpu/zstdgpu_reference_store.cpp @@ -26,15 +26,13 @@ typedef uint32_t (*zstdgpu_FseElemOffsetFn)(uint32_t fseTableIndex, uint32_t cmp static uint32_t GFrameCount = 0; -static uint32_t GBlockCountRAW = 0; -static uint32_t GBlockCountRLE = 0; +static uint32_t GBlockCountRR = 0; static uint32_t GBlockCountCMP = 0; static uint32_t GZstdDataSize = 0; static zstdgpu_ResourceDataCpu GZstd; -static uint32_t GBlockIndexRAW = 0; -static uint32_t GBlockIndexRLE = 0; +static uint32_t GBlockIndexRR = 0; static uint32_t GBlockIndexCMP = 0; static uint32_t GHufCompressedLiteralCount = 0; @@ -78,9 +76,10 @@ void zstdgpu_ReferenceStore_Report_ChunkBase(const void *base) void zstdgpu_ReferenceStore_Report_FrameAndBlockCount(uint32_t frameCount, uint32_t rawBlockCount, uint32_t rleBlockCount, uint32_t cmpBlockCount, uint32_t zstdDataSize) { + const uint32_t rrBlockCount = rawBlockCount + rleBlockCount; + GFrameCount = frameCount; - GBlockCountRAW = rawBlockCount; - GBlockCountRLE = rleBlockCount; + GBlockCountRR = rrBlockCount; GBlockCountCMP = cmpBlockCount; GZstdDataSize = zstdDataSize; } @@ -90,7 +89,7 @@ static zstdgpu_ResourceInfo GZstdInfo; void zstdgpu_ReferenceStore_AllocateMemory(void) { zstdgpu_ResourceInfo_Stage_0_Init(&GZstdInfo, GFrameCount, GZstdDataSize, 0); - zstdgpu_ResourceInfo_Stage_1_Init(&GZstdInfo, GBlockCountRAW, GBlockCountRLE, GBlockCountCMP); + zstdgpu_ResourceInfo_Stage_1_Init(&GZstdInfo, GBlockCountRR, GBlockCountCMP); zstdgpu_ResourceInfo_Stage_2_Init(&GZstdInfo, 4 * 1024 * 1024 /*literal count*/, 4 * 1024 * 1024 /*sequence count*/, 0, 0); zstdgpu_ResourceDataCpu_InitZero(&GZstd); @@ -111,8 +110,7 @@ void zstdgpu_ReferenceStore_FreeMemory(void) static uint32_t zstdgpu_GetLastBlockIndex(void) { - const uint32_t allBlockCount = GBlockIndexRAW - + GBlockIndexRLE + const 
uint32_t allBlockCount = GBlockIndexRR + GBlockIndexCMP; ZSTDGPU_ASSERT(allBlockCount >= 1u); @@ -140,18 +138,22 @@ static void zstdgpu_AppendLastBlockSize(uint32_t size) void zstdgpu_ReferenceStore_Report_Block(const void *base, uint32_t size, ZSTDGPU_ENUM(ReferenceStore_BlockType) type) { -#define APPEND(TYPE, type, base, size) \ - if (type == kzstdgpu_ReferenceStore_Block##TYPE) \ - { \ - GZstd.Blocks##TYPE##Refs[GBlockIndex##TYPE].offs = izstdgpu_ReferenceStore_PtrToOffs(base);\ - GZstd.Blocks##TYPE##Refs[GBlockIndex##TYPE].size = size; \ - GBlockIndex##TYPE += 1; \ + const uint32_t actualOffset = izstdgpu_ReferenceStore_PtrToOffs(base); + + if (type == ZSTDGPU_ENUM_CONST(ReferenceStore_BlockCMP)) + { + GZstd.BlocksCMPRefs[GBlockIndexCMP++] = { actualOffset, size }; + } + else // Raw/RLE: + { + uint32_t offs = actualOffset; + if (type == ZSTDGPU_ENUM_CONST(ReferenceStore_BlockRLE)) + { + offs = (*(const uint8_t*)base) | kzstdgpu_RLEBlock_OffsetFlag; + } + GZstd.BlocksRRRefs[GBlockIndexRR++] = { offs, size }; } - APPEND(RAW, type, base, size); - APPEND(RLE, type, base, size); - APPEND(CMP, type, base, size); -#undef APPEND uint32_t lastBlockIndex = zstdgpu_SetLastBlockSize(type == ZSTDGPU_ENUM_CONST(ReferenceStore_BlockCMP) ? 0 : size); if (type == ZSTDGPU_ENUM_CONST(ReferenceStore_BlockCMP)) { @@ -768,6 +770,7 @@ void zstdgpu_ReferenceStore_Report_ResolvedOffset(size_t offset) static ZSTDGPU_ENUM(Validate_Result) izstdgpu_ReferenceStore_Validate_OffsetAndSize(const zstdgpu_OffsetAndSize *ref, const zstdgpu_OffsetAndSize *tst) { + // For RLE blocks, offs won't be an actual offset. 
if (ref->offs == tst->offs && ref->size == tst->size) return ZSTDGPU_ENUM_CONST(Validate_Success); else @@ -792,10 +795,7 @@ static ZSTDGPU_ENUM(Validate_Result) izstdgpu_ReferenceStore_Validate_OffsetsAnd ZSTDGPU_ENUM(Validate_Result) zstdgpu_ReferenceStore_Validate_Blocks(const zstdgpu_ResourceDataCpu *resourceDataCpu) { - if (resourceDataCpu->Counters->Blocks_RAW != GBlockIndexRAW) - return ZSTDGPU_ENUM_CONST(Validate_Failed); - - if (resourceDataCpu->Counters->Blocks_RLE != GBlockIndexRLE) + if (resourceDataCpu->Counters->Blocks_RR != GBlockIndexRR) return ZSTDGPU_ENUM_CONST(Validate_Failed); if (resourceDataCpu->Counters->Blocks_CMP != GBlockIndexCMP) @@ -804,10 +804,9 @@ ZSTDGPU_ENUM(Validate_Result) zstdgpu_ReferenceStore_Validate_Blocks(const zstdg #define VALIDATE_BLOCKS(name) \ izstdgpu_ReferenceStore_Validate_OffsetsAndSizes(GZstd.Blocks##name##Refs, GBlockCount##name, resourceDataCpu->Blocks##name##Refs, resourceDataCpu->Counters->Blocks_##name) - if (ZSTDGPU_ENUM_CONST(Validate_Success) != VALIDATE_BLOCKS(RAW)) + if (ZSTDGPU_ENUM_CONST(Validate_Success) != VALIDATE_BLOCKS(RR)) return ZSTDGPU_ENUM_CONST(Validate_Failed); - //VALIDATE_BLOCKS(RLE); if (ZSTDGPU_ENUM_CONST(Validate_Success) != VALIDATE_BLOCKS(CMP)) return ZSTDGPU_ENUM_CONST(Validate_Failed); diff --git a/zstd/zstdgpu/zstdgpu_resources.h b/zstd/zstdgpu/zstdgpu_resources.h index 71e0adc..c47f98f 100644 --- a/zstd/zstdgpu/zstdgpu_resources.h +++ b/zstd/zstdgpu/zstdgpu_resources.h @@ -30,12 +30,10 @@ #define ZSTDGPU_BUFFERS_LIST_READBACK_STAGE_0() \ ZSTDGPU_BUFFER(zstdgpu_Counters , Counters ) \ - ZSTDGPU_BUFFER(uint32_t , PerFrameBlockCountRAW ) \ - ZSTDGPU_BUFFER(uint32_t , PerFrameBlockCountRLE ) \ + ZSTDGPU_BUFFER(uint32_t , PerFrameBlockCountRR ) \ ZSTDGPU_BUFFER(uint32_t , PerFrameBlockCountCMP ) \ ZSTDGPU_BUFFER(uint32_t , PerFrameBlockCountAll ) \ - ZSTDGPU_BUFFER(uint32_t , PerFrameBlockSizesRAW ) \ - ZSTDGPU_BUFFER(uint32_t , PerFrameBlockSizesRLE ) \ + ZSTDGPU_BUFFER(uint32_t , 
PerFrameBlockSizesRR ) \ ZSTDGPU_BUFFER(uint32_t , PerFrameSeqStreamMinIdx ) \ ZSTDGPU_BUFFER(zstdgpu_FrameInfo , Frames ) @@ -63,16 +61,14 @@ #define ZSTDGPU_BUFFERS_LIST_READBACK_STAGE_1() \ ZSTDGPU_BUFFER(uint32_t , BlockSizePrefix ) \ - ZSTDGPU_BUFFER(uint32_t , GlobalBlockIndexPerRawBlock ) \ - ZSTDGPU_BUFFER(uint32_t , GlobalBlockIndexPerRleBlock ) \ + ZSTDGPU_BUFFER(uint32_t , GlobalBlockIndexPerRRBlock ) \ ZSTDGPU_BUFFER(uint32_t , GlobalBlockIndexPerCmpBlock ) \ ZSTDGPU_BUFFER(uint32_t , PerSeqStreamFinalOffset1 ) \ ZSTDGPU_BUFFER(uint32_t , PerSeqStreamFinalOffset2 ) \ ZSTDGPU_BUFFER(uint32_t , PerSeqStreamFinalOffset3 ) \ ZSTDGPU_BUFFER(uint32_t , PerSeqStreamSeqStart ) \ \ - ZSTDGPU_BUFFER(uint32_t , RawBlockSizePrefix ) \ - ZSTDGPU_BUFFER(uint32_t , RleBlockSizePrefix ) \ + ZSTDGPU_BUFFER(uint32_t , RRBlockSizePrefix ) \ \ ZSTDGPU_BUFFER(int16_t , FseProbs ) \ ZSTDGPU_BUFFER(zstdgpu_FseInfo , FseInfos ) \ @@ -95,8 +91,7 @@ ZSTDGPU_BUFFER(uint32_t , LitStreamEndPerHuffmanTable ) \ ZSTDGPU_BUFFER(uint32_t , LitGroupEndPerHuffmanTable ) \ \ - ZSTDGPU_BUFFER(zstdgpu_OffsetAndSize , BlocksRLERefs ) \ - ZSTDGPU_BUFFER(zstdgpu_OffsetAndSize , BlocksRAWRefs ) \ + ZSTDGPU_BUFFER(zstdgpu_OffsetAndSize , BlocksRRRefs ) \ ZSTDGPU_BUFFER(zstdgpu_OffsetAndSize , BlocksCMPRefs ) #define ZSTDGPU_BUFFERS_LIST_STAGE_1() \ @@ -262,12 +257,10 @@ static void zstdgpu_ResourceInfo_Stage_0_InitSize(zstdgpu_ResourceInfo *outInfo, const uint32_t FramesRefs_Count = frameCount; const uint32_t CompressedData_Count = (dataCount + 3) / 4; // because CompressedData is in uint32_t const uint32_t Counters_Count = 1; - const uint32_t PerFrameBlockCountRAW_Count = frameCount + zstdgpu_GetLookbackBlockCount(frameCount); - const uint32_t PerFrameBlockCountRLE_Count = PerFrameBlockCountRAW_Count; - const uint32_t PerFrameBlockCountCMP_Count = PerFrameBlockCountRAW_Count; - const uint32_t PerFrameBlockCountAll_Count = PerFrameBlockCountRAW_Count; - const uint32_t PerFrameBlockSizesRAW_Count 
= PerFrameBlockCountRAW_Count; - const uint32_t PerFrameBlockSizesRLE_Count = PerFrameBlockCountRLE_Count; + const uint32_t PerFrameBlockCountRR_Count = frameCount + zstdgpu_GetLookbackBlockCount(frameCount); + const uint32_t PerFrameBlockCountCMP_Count = PerFrameBlockCountRR_Count; + const uint32_t PerFrameBlockCountAll_Count = PerFrameBlockCountRR_Count; + const uint32_t PerFrameBlockSizesRR_Count = PerFrameBlockCountRR_Count; const uint32_t PerFrameSeqStreamMinIdx_Count = frameCount; const uint32_t DispatchArgs_Count = kzstdgpu_DispatchSlot_Count * kzstdgpu_DispatchSlot_StrideInUInt32; const uint32_t DispatchCnts_Count = kzstdgpu_DispatchSlot_Count; @@ -276,20 +269,17 @@ static void zstdgpu_ResourceInfo_Stage_0_InitSize(zstdgpu_ResourceInfo *outInfo, ZSTDGPU_ALL_BUFFERS_LIST_STAGE_0() } -static void zstdgpu_ResourceInfo_Stage_1_InitSize(zstdgpu_ResourceInfo *outInfo, uint32_t rawBlockCount, uint32_t rleBlockCount, uint32_t cmpBlockCount) +static void zstdgpu_ResourceInfo_Stage_1_InitSize(zstdgpu_ResourceInfo *outInfo, uint32_t rrBlockCount, uint32_t cmpBlockCount) { - const uint32_t BlocksRAWRefs_Count = rawBlockCount; - const uint32_t BlocksRLERefs_Count = rleBlockCount; + const uint32_t BlocksRRRefs_Count = rrBlockCount; const uint32_t BlocksCMPRefs_Count = cmpBlockCount; - const uint32_t allBlockCount = rawBlockCount + rleBlockCount + cmpBlockCount; + const uint32_t allBlockCount = rrBlockCount + cmpBlockCount; - const uint32_t RawBlockSizePrefix_Count = rawBlockCount; - const uint32_t RleBlockSizePrefix_Count = rleBlockCount; + const uint32_t RRBlockSizePrefix_Count = rrBlockCount; // TODO: this must a total of all blocks (including RLE and RAW) const uint32_t BlockSizePrefix_Count = allBlockCount + zstdgpu_GetLookbackBlockCount(allBlockCount); - const uint32_t GlobalBlockIndexPerRawBlock_Count = rawBlockCount; - const uint32_t GlobalBlockIndexPerRleBlock_Count = rleBlockCount; + const uint32_t GlobalBlockIndexPerRRBlock_Count = rrBlockCount; const uint32_t 
GlobalBlockIndexPerCmpBlock_Count = cmpBlockCount; const uint32_t PerSeqStreamFinalOffset1_Count = cmpBlockCount + zstdgpu_GetLookbackBlockCount(cmpBlockCount); @@ -443,9 +433,9 @@ static void zstdgpu_ResourceInfo_Stage_0_Init(zstdgpu_ResourceInfo *outInfo, uin zstdgpu_ResourceInfo_Stage_0_InitOffsetGpu2Cpu(outInfo); } -static void zstdgpu_ResourceInfo_Stage_1_Init(zstdgpu_ResourceInfo *outInfo, uint32_t rawBlockCount, uint32_t rleBlockCount, uint32_t cmpBlockCount) +static void zstdgpu_ResourceInfo_Stage_1_Init(zstdgpu_ResourceInfo *outInfo, uint32_t rrBlockCount, uint32_t cmpBlockCount) { - zstdgpu_ResourceInfo_Stage_1_InitSize(outInfo, rawBlockCount, rleBlockCount, cmpBlockCount); + zstdgpu_ResourceInfo_Stage_1_InitSize(outInfo, rrBlockCount, cmpBlockCount); zstdgpu_ResourceInfo_Stage_1_InitOffsetGpuOnly(outInfo); zstdgpu_ResourceInfo_Stage_1_InitOffsetCpu2Gpu(outInfo); zstdgpu_ResourceInfo_Stage_1_InitOffsetGpu2Cpu(outInfo); diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h index 83e2cd8..7c01bfd 100644 --- a/zstd/zstdgpu/zstdgpu_shaders.h +++ b/zstd/zstdgpu/zstdgpu_shaders.h @@ -288,15 +288,12 @@ static inline void zstdgpu_GroupBallotLdsStore(uint32_t laneCnt, uint32_t VGPR, } static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_FrameInfo) outFrameInfo, - ZSTDGPU_RW_BUFFER(zstdgpu_OffsetAndSize) outBlocksRAWRefs, - ZSTDGPU_RW_BUFFER(zstdgpu_OffsetAndSize) outBlocksRLERefs, + ZSTDGPU_RW_BUFFER(zstdgpu_OffsetAndSize) outBlocksRRRefs, ZSTDGPU_RW_BUFFER(zstdgpu_OffsetAndSize) outBlocksCMPRefs, ZSTDGPU_RW_BUFFER(uint32_t) outPerBlockUncompressedSize, - ZSTDGPU_RW_BUFFER(uint32_t) outGlobalBlockIndexPerRawBlock, - ZSTDGPU_RW_BUFFER(uint32_t) outGlobalBlockIndexPerRleBlock, + ZSTDGPU_RW_BUFFER(uint32_t) outGlobalBlockIndexPerRRBlock, ZSTDGPU_RW_BUFFER(uint32_t) outGlobalBlockIndexPerCmpBlock, - ZSTDGPU_RW_BUFFER(uint32_t) outRawBlockSizes, - ZSTDGPU_RW_BUFFER(uint32_t) outRleBlockSizes, + ZSTDGPU_RW_BUFFER(uint32_t) 
outRRBlockSizes, ZSTDGPU_PARAM_INOUT(zstdgpu_Forward_BitBuffer) bits, uint32_t outputBlockInfo) { @@ -419,14 +416,14 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr const uint32_t blockType = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 2); const uint32_t blockSize = zstdgpu_Forward_BitBuffer_GetNoRefill(bits, 21); - const bool isRaw = 0 == blockType; - const bool isRle = 1 == blockType; + const bool isRR = 2 != blockType; const bool isCmp = 2 == blockType; uint32_t blockOffs = 0; - if (isRle) + if (1 == blockType) // RLE { - blockOffs = zstdgpu_Forward_BitBuffer_Get(bits, 8); + + blockOffs = zstdgpu_Forward_BitBuffer_Get(bits, 8) | kzstdgpu_RLEBlock_OffsetFlag; } else { @@ -436,30 +433,11 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr if (0 != outputBlockInfo) { - const uint32_t blockIndex = outFrameInfo.rawBlockStart - + outFrameInfo.rleBlockStart + const uint32_t blockIndex = outFrameInfo.rrBlockStart + outFrameInfo.cmpBlockStart; // NOTE(pamartis): Without branch, there's out-of-bounds access detected by validation layer when outBlock{Type}Refs aren't bound // so, it's either DXC or IHV compiler not preserving branches with memory accesses. 
- ZSTDGPU_BRANCH if (isRaw) - { - outBlocksRAWRefs[outFrameInfo.rawBlockStart].offs = blockOffs; - outBlocksRAWRefs[outFrameInfo.rawBlockStart].size = blockSize; - - outRawBlockSizes[outFrameInfo.rawBlockStart] = outFrameInfo.rawBlockBytesStart; - outGlobalBlockIndexPerRawBlock[outFrameInfo.rawBlockStart] = blockIndex; - } - - ZSTDGPU_BRANCH if (isRle) - { - outBlocksRLERefs[outFrameInfo.rleBlockStart].offs = blockOffs; - outBlocksRLERefs[outFrameInfo.rleBlockStart].size = blockSize; - - outRleBlockSizes[outFrameInfo.rleBlockStart] = outFrameInfo.rleBlockBytesStart; - outGlobalBlockIndexPerRleBlock[outFrameInfo.rleBlockStart] = blockIndex; - } - ZSTDGPU_BRANCH if (isCmp) { outBlocksCMPRefs[outFrameInfo.cmpBlockStart].offs = blockOffs; @@ -467,23 +445,31 @@ static inline void zstdgpu_ShaderEntry_ParseFrame(ZSTDGPU_PARAM_INOUT(zstdgpu_Fr outGlobalBlockIndexPerCmpBlock[outFrameInfo.cmpBlockStart] = blockIndex; } + else // RLE/Raw ("RR") + { + outBlocksRRRefs[outFrameInfo.rrBlockStart].offs = blockOffs; + outBlocksRRRefs[outFrameInfo.rrBlockStart].size = blockSize; + + outRRBlockSizes[outFrameInfo.rrBlockStart] = outFrameInfo.rrBlockBytesStart; + outGlobalBlockIndexPerRRBlock[outFrameInfo.rrBlockStart] = blockIndex; + } outPerBlockUncompressedSize[blockIndex] = isCmp ? 0 : blockSize; } // `Raw_Block` - this is an uncompressed block. `Block_Content` contains `Block_Size` bytes. - outFrameInfo.rawBlockStart += (isRaw) ? 1 : 0; - + // + // or: + // // `RLE_Block` - this is a single byte, repeated `Block_Size` times. `Block_Content` consists of a single byte. // On the decompression side, this byte must be repeated `Block_Size` times. - outFrameInfo.rleBlockStart += (isRle) ? 1 : 0; + outFrameInfo.rrBlockStart += (isRR) ? 1 : 0; // `Compressed_Block` - this is a Zstandard compressed block. `Block_Size` is the length of `Block_Content`, the compressed data. // The decompressed size is not known, but its maximum possible value is guaranteed (see below). 
outFrameInfo.cmpBlockStart += (isCmp) ? 1 : 0; - outFrameInfo.rawBlockBytesStart += (isRaw) ? blockSize : 0; - outFrameInfo.rleBlockBytesStart += (isRle) ? blockSize : 0; + outFrameInfo.rrBlockBytesStart += (isRR) ? blockSize : 0; } while (0 == lastBlock); @@ -518,33 +504,26 @@ static inline void zstdgpu_ShaderEntry_ParseFrames(ZSTDGPU_PARAM_INOUT(zstdgpu_P if (srt.countBlocksOnly > 0) { - frameInfo.rawBlockStart = 0; - frameInfo.rleBlockStart = 0; + frameInfo.rrBlockStart = 0; frameInfo.cmpBlockStart = 0; - frameInfo.rawBlockBytesStart = 0; - frameInfo.rleBlockBytesStart = 0; + frameInfo.rrBlockBytesStart = 0; } else { - frameInfo.rawBlockStart = srt.inoutPerFrameBlockCountRAW[threadId]; - frameInfo.rleBlockStart = srt.inoutPerFrameBlockCountRLE[threadId]; + frameInfo.rrBlockStart = srt.inoutPerFrameBlockCountRR[threadId]; frameInfo.cmpBlockStart = srt.inoutPerFrameBlockCountCMP[threadId]; - frameInfo.rawBlockBytesStart = srt.inoutPerFrameBlockSizesRAW[threadId]; - frameInfo.rleBlockBytesStart = srt.inoutPerFrameBlockSizesRLE[threadId]; + frameInfo.rrBlockBytesStart = srt.inoutPerFrameBlockSizesRR[threadId]; } zstdgpu_ShaderEntry_ParseFrame( frameInfo, - srt.inoutBlocksRAWRefs, - srt.inoutBlocksRLERefs, + srt.inoutBlocksRRRefs, srt.inoutBlocksCMPRefs, srt.inoutBlockSizePrefix, - srt.inoutGlobalBlockIndexPerRawBlock, - srt.inoutGlobalBlockIndexPerRleBlock, + srt.inoutGlobalBlockIndexPerRRBlock, srt.inoutGlobalBlockIndexPerCmpBlock, - srt.inoutRawBlockSizePrefix, - srt.inoutRleBlockSizePrefix, + srt.inoutRRBlockSizePrefix, bits, srt.countBlocksOnly > 0 ? 
0u : 1u ); @@ -552,32 +531,25 @@ static inline void zstdgpu_ShaderEntry_ParseFrames(ZSTDGPU_PARAM_INOUT(zstdgpu_P if (srt.countBlocksOnly > 0) { - srt.inoutPerFrameBlockCountRAW[threadId] = frameInfo.rawBlockStart; - srt.inoutPerFrameBlockCountRLE[threadId] = frameInfo.rleBlockStart; + srt.inoutPerFrameBlockCountRR[threadId] = frameInfo.rrBlockStart; srt.inoutPerFrameBlockCountCMP[threadId] = frameInfo.cmpBlockStart; - srt.inoutPerFrameBlockCountAll[threadId] = frameInfo.rawBlockStart - + frameInfo.rleBlockStart + srt.inoutPerFrameBlockCountAll[threadId] = frameInfo.rrBlockStart + frameInfo.cmpBlockStart; - srt.inoutPerFrameBlockSizesRAW[threadId] = frameInfo.rawBlockBytesStart; - srt.inoutPerFrameBlockSizesRLE[threadId] = frameInfo.rleBlockBytesStart; + srt.inoutPerFrameBlockSizesRR[threadId] = frameInfo.rrBlockBytesStart; - const uint32_t rawBlockCount = WaveActiveSum(frameInfo.rawBlockStart); - const uint32_t rleBlockCount = WaveActiveSum(frameInfo.rleBlockStart); + const uint32_t rrBlockCount = WaveActiveSum(frameInfo.rrBlockStart); const uint32_t cmpBlockCount = WaveActiveSum(frameInfo.cmpBlockStart); - const uint32_t rawBlockByteCount = WaveActiveSum(frameInfo.rawBlockBytesStart); - const uint32_t rleBlockByteCount = WaveActiveSum(frameInfo.rleBlockBytesStart); + const uint32_t rrBlockByteCount = WaveActiveSum(frameInfo.rrBlockBytesStart); const uint32_t uncompSize = (uint32_t)WaveActiveSum(frameInfo.uncompSize); const uint32_t frameCount = WaveActiveCountBits(true); if (WaveIsFirstLane()) { - InterlockedAdd(srt.inoutCounters[0].Blocks_RAW, rawBlockCount); - InterlockedAdd(srt.inoutCounters[0].Blocks_RLE, rleBlockCount); + InterlockedAdd(srt.inoutCounters[0].Blocks_RR, rrBlockCount); InterlockedAdd(srt.inoutCounters[0].Blocks_CMP, cmpBlockCount); - InterlockedAdd(srt.inoutCounters[0].BlocksBytes_RAW, rawBlockByteCount); - InterlockedAdd(srt.inoutCounters[0].BlocksBytes_RLE, rleBlockByteCount); + InterlockedAdd(srt.inoutCounters[0].BlocksBytes_RR, 
rrBlockByteCount); InterlockedAdd(srt.inoutCounters[0].Frames, frameCount); InterlockedAdd(srt.inoutCounters[0].Frames_UncompressedByteSize, uncompSize); } @@ -611,11 +583,9 @@ static void zstdgpu_ShaderEntry_InitResources(ZSTDGPU_PARAM_INOUT(zstdgpu_InitRe srt.inoutCounters[0].HUF_Streams = 0; srt.inoutCounters[0].RAW_Streams = 0; srt.inoutCounters[0].RLE_Streams = 0; - srt.inoutCounters[0].Blocks_RAW = 0; - srt.inoutCounters[0].Blocks_RLE = 0; + srt.inoutCounters[0].Blocks_RR = 0; srt.inoutCounters[0].Blocks_CMP = 0; - srt.inoutCounters[0].BlocksBytes_RAW = 0; - srt.inoutCounters[0].BlocksBytes_RLE = 0; + srt.inoutCounters[0].BlocksBytes_RR = 0; srt.inoutCounters[0].Frames = 0; srt.inoutCounters[0].Frames_UncompressedByteSize = 0; srt.inoutCounters[0].Frames_ExecuteSequences = 0; diff --git a/zstd/zstdgpu/zstdgpu_shared_structs.h b/zstd/zstdgpu/zstdgpu_shared_structs.h index 14fa1af..c591a4a 100644 --- a/zstd/zstdgpu/zstdgpu_shared_structs.h +++ b/zstd/zstdgpu/zstdgpu_shared_structs.h @@ -24,10 +24,8 @@ struct zstdgpu_FrameInfo uint64_t uncompSize; uint32_t dictionary; - uint32_t rawBlockStart; - uint32_t rleBlockStart; + uint32_t rrBlockStart; uint32_t cmpBlockStart; - uint32_t rawBlockBytesStart; - uint32_t rleBlockBytesStart; + uint32_t rrBlockBytesStart; }; diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h index 2f3f724..04d88c9 100644 --- a/zstd/zstdgpu/zstdgpu_structs.h +++ b/zstd/zstdgpu/zstdgpu_structs.h @@ -313,11 +313,9 @@ typedef struct zstdgpu_Counters uint32_t HUF_Streams; uint32_t RAW_Streams; uint32_t RLE_Streams; - uint32_t Blocks_RAW; - uint32_t Blocks_RLE; + uint32_t Blocks_RR; uint32_t Blocks_CMP; - uint32_t BlocksBytes_RAW; - uint32_t BlocksBytes_RLE; + uint32_t BlocksBytes_RR; uint32_t Frames; uint32_t Frames_UncompressedByteSize; uint32_t Frames_ExecuteSequences; @@ -337,14 +335,13 @@ static const uint32_t kzstdgpu_DispatchSlot_FinaliseSequenceOffsets = 10; static const uint32_t 
kzstdgpu_DispatchSlot_PrefixSequenceOffsets = 11; static const uint32_t kzstdgpu_DispatchSlot_ComputePrefixSum = 12; static const uint32_t kzstdgpu_DispatchSlot_PrefixBlockSizes = 13; -static const uint32_t kzstdgpu_DispatchSlot_MemcpyRAW = 14; -static const uint32_t kzstdgpu_DispatchSlot_MemsetRLE = 15; -static const uint32_t kzstdgpu_DispatchSlot_ParseCompressedBlocks = 16; -static const uint32_t kzstdgpu_DispatchSlot_Memset_CmpBlockLookback = 17; -static const uint32_t kzstdgpu_DispatchSlot_Memset_TableIndexLookback = 18; -static const uint32_t kzstdgpu_DispatchSlot_Memset_LitStreamEnd = 19; -static const uint32_t kzstdgpu_DispatchSlot_Memset_AllBlockLookback = 20; -static const uint32_t kzstdgpu_DispatchSlot_Count = 21; +static const uint32_t kzstdgpu_DispatchSlot_MemsetMemcpyRR = 14; +static const uint32_t kzstdgpu_DispatchSlot_ParseCompressedBlocks = 15; +static const uint32_t kzstdgpu_DispatchSlot_Memset_CmpBlockLookback = 16; +static const uint32_t kzstdgpu_DispatchSlot_Memset_TableIndexLookback = 17; +static const uint32_t kzstdgpu_DispatchSlot_Memset_LitStreamEnd = 18; +static const uint32_t kzstdgpu_DispatchSlot_Memset_AllBlockLookback = 19; +static const uint32_t kzstdgpu_DispatchSlot_Count = 20; #if defined(_GAMING_XBOX) || defined(__XBOX_SCARLETT) || defined(__XBOX_ONE) static const uint32_t kzstdgpu_DispatchSlot_CmdsPerSlot = 1; @@ -663,6 +660,9 @@ static inline uint32_t zstdgpu_MaxI32(int32_t a, int32_t b) #endif } +// Our max total compressed input or decompressed output bytes is <= INT_MAX. 
+static const uint32_t kzstdgpu_RLEBlock_OffsetFlag = 1u << 31; + static inline uint32_t zstdgpu_Encode30BitLookbackSelf(uint32_t x) { ZSTDGPU_ASSERT(x <= ~0xc0000000u); @@ -1553,22 +1553,17 @@ static inline uint32_t zstdgpu_InitResources_GetDispatchSizeX(uint32_t initResou \ ZSTDGPU_RW_BUFFER_DECL(zstdgpu_Counters , Counters , 0) \ ZSTDGPU_RW_BUFFER_DECL(zstdgpu_FrameInfo , Frames , 1) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountRAW , 2) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountRLE , 3) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountCMP , 4) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountAll , 5) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockSizesRAW , 6) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockSizesRLE , 7) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , RawBlockSizePrefix , 8) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , RleBlockSizePrefix , 9) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountRR , 2) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountCMP , 3) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockCountAll , 4) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , PerFrameBlockSizesRR , 5) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , RRBlockSizePrefix , 6) \ \ - ZSTDGPU_RW_BUFFER_DECL(zstdgpu_OffsetAndSize , BlocksRAWRefs ,10) \ - ZSTDGPU_RW_BUFFER_DECL(zstdgpu_OffsetAndSize , BlocksRLERefs ,11) \ - ZSTDGPU_RW_BUFFER_DECL(zstdgpu_OffsetAndSize , BlocksCMPRefs ,12) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , BlockSizePrefix ,13) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , GlobalBlockIndexPerRawBlock ,14) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , GlobalBlockIndexPerRleBlock ,15) \ - ZSTDGPU_RW_BUFFER_DECL(uint32_t , GlobalBlockIndexPerCmpBlock ,16) + ZSTDGPU_RW_BUFFER_DECL(zstdgpu_OffsetAndSize , BlocksRRRefs , 7) \ + ZSTDGPU_RW_BUFFER_DECL(zstdgpu_OffsetAndSize , BlocksCMPRefs , 8) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , BlockSizePrefix , 9) \ + ZSTDGPU_RW_BUFFER_DECL(uint32_t , GlobalBlockIndexPerRRBlock ,10) \ + 
ZSTDGPU_RW_BUFFER_DECL(uint32_t , GlobalBlockIndexPerCmpBlock ,11) #define ZSTDGPU_INIT_RESOURCES_SRT() \ ZSTDGPU_RO_TYPED_BUFFER_DECL(int32_t, int16_t , FseProbsDefault , 0) \ diff --git a/zstd/zstdgpu_demo/main.cpp b/zstd/zstdgpu_demo/main.cpp index d1c4c52..ce78ee3 100644 --- a/zstd/zstdgpu_demo/main.cpp +++ b/zstd/zstdgpu_demo/main.cpp @@ -420,15 +420,10 @@ static void zstdgpu_Test_DecompressSequences(zstdgpu_ResourceDataCpu & cpuRes, z const uint32_t dstBlockIndex = cpuRes.GlobalBlockIndexPerCmpBlock[i]; cpuRes.BlockSizePrefix[dstBlockIndex] = literalSize; } - for (uint32_t i = 0; i < cpuRes.Counters->Blocks_RAW; ++i) + for (uint32_t i = 0; i < cpuRes.Counters->Blocks_RR; ++i) { - const uint32_t dstBlockIndex = cpuRes.GlobalBlockIndexPerRawBlock[i]; - cpuRes.BlockSizePrefix[dstBlockIndex] = cpuRes.BlocksRAWRefs[i].size;; - } - for (uint32_t i = 0; i < cpuRes.Counters->Blocks_RLE; ++i) - { - const uint32_t dstBlockIndex = cpuRes.GlobalBlockIndexPerRleBlock[i]; - cpuRes.BlockSizePrefix[dstBlockIndex] = cpuRes.BlocksRLERefs[i].size;; + const uint32_t dstBlockIndex = cpuRes.GlobalBlockIndexPerRRBlock[i]; + cpuRes.BlockSizePrefix[dstBlockIndex] = cpuRes.BlocksRRRefs[i].size;; } for (uint32_t i = 0; i < gpuReadbackRes.Counters->Seq_Streams; ++i) @@ -437,8 +432,7 @@ static void zstdgpu_Test_DecompressSequences(zstdgpu_ResourceDataCpu & cpuRes, z } // Compute prefix sum of block sizes const uint32_t allBlockCount = cpuRes.Counters->Blocks_CMP - + cpuRes.Counters->Blocks_RAW - + cpuRes.Counters->Blocks_RLE; + + cpuRes.Counters->Blocks_RR; // FIXUP(pamartis): because after `DecompreSequences` execution 'BlockSizePrefix' contain actual size of the block, // not the prefix (it's computed after `DecompreSequences` on GPU) we update the prefix manually @@ -491,15 +485,12 @@ static void zstdgpu_Test_DecompressSequences(zstdgpu_ResourceDataCpu & cpuRes, z static void zstdgpu_Test_BlockPrefix(zstdgpu_ResourceDataCpu & cpuRes, zstdgpu_ResourceDataCpu & gpuReadbackRes) { /** 
these buffers could be zero if some block types don't exist */ - const uint32_t refRleBlockCount = cpuRes.Counters->Blocks_RLE; - const uint32_t refRawBlockCount = cpuRes.Counters->Blocks_RAW; + const uint32_t refRRBlockCount = cpuRes.Counters->Blocks_RR; const uint32_t refCmpBlockCount = cpuRes.Counters->Blocks_CMP; - const uint32_t refAllBlockCount = refRleBlockCount - + refRawBlockCount + const uint32_t refAllBlockCount = refRRBlockCount + refCmpBlockCount; - VALIDATE_CND(refRleBlockCount == gpuReadbackRes.Counters->Blocks_RLE); - VALIDATE_CND(refRawBlockCount == gpuReadbackRes.Counters->Blocks_RAW); + VALIDATE_CND(refRRBlockCount == gpuReadbackRes.Counters->Blocks_RR); VALIDATE_CND(refCmpBlockCount == gpuReadbackRes.Counters->Blocks_CMP); if (NULL != cpuRes.GlobalBlockIndexPerCmpBlock) @@ -507,15 +498,10 @@ static void zstdgpu_Test_BlockPrefix(zstdgpu_ResourceDataCpu & cpuRes, zstdgpu_R else VALIDATE_CND(NULL == gpuReadbackRes.GlobalBlockIndexPerCmpBlock); - if (NULL != cpuRes.GlobalBlockIndexPerRawBlock) - VALIDATE_CND(0 == memcmp(cpuRes.GlobalBlockIndexPerRawBlock, gpuReadbackRes.GlobalBlockIndexPerRawBlock, sizeof(cpuRes.GlobalBlockIndexPerRawBlock[0]) * refRawBlockCount)); + if (NULL != cpuRes.GlobalBlockIndexPerRRBlock) + VALIDATE_CND(0 == memcmp(cpuRes.GlobalBlockIndexPerRRBlock, gpuReadbackRes.GlobalBlockIndexPerRRBlock, sizeof(cpuRes.GlobalBlockIndexPerRRBlock[0]) * refRRBlockCount)); else - VALIDATE_CND(NULL == gpuReadbackRes.GlobalBlockIndexPerRawBlock); - - if (NULL != cpuRes.GlobalBlockIndexPerRleBlock) - VALIDATE_CND(0 == memcmp(cpuRes.GlobalBlockIndexPerRleBlock, gpuReadbackRes.GlobalBlockIndexPerRleBlock, sizeof(cpuRes.GlobalBlockIndexPerRleBlock[0]) * refRleBlockCount)); - else - VALIDATE_CND(NULL == gpuReadbackRes.GlobalBlockIndexPerRleBlock); + VALIDATE_CND(NULL == gpuReadbackRes.GlobalBlockIndexPerRRBlock); VALIDATE_CND(0 == memcmp(cpuRes.BlockSizePrefix, gpuReadbackRes.BlockSizePrefix, sizeof(cpuRes.BlockSizePrefix[0]) * refAllBlockCount)); 
} @@ -605,15 +591,13 @@ static void zstdgpu_Validate_GpuDecompressOnCpu(zstdgpu_ResourceDataCpu & zstdCp ZSTDGPU_ASSERT(zstdFrameCount == CNTRS(Frames)); ZSTDGPU_ASSERT(zstdUncompressedFramesByteCount == CNTRS(Frames_UncompressedByteSize)); - const uint32_t zstdRawBlockCount = CNTRS(Blocks_RAW); - const uint32_t zstdRleBlockCount = CNTRS(Blocks_RLE); + const uint32_t zstdRRBlockCount = CNTRS(Blocks_RR); const uint32_t zstdCmpBlockCount = CNTRS(Blocks_CMP); - const uint32_t zstdAllBlockCount = zstdRawBlockCount - + zstdRleBlockCount + const uint32_t zstdAllBlockCount = zstdRRBlockCount + zstdCmpBlockCount; - zstdgpu_ResourceInfo_Stage_1_Init(&zstdInfo, zstdRawBlockCount, zstdRleBlockCount, zstdCmpBlockCount); + zstdgpu_ResourceInfo_Stage_1_Init(&zstdInfo, zstdRRBlockCount, zstdCmpBlockCount); zstdgpu_ResourceDataCpu_InitFromHeap(&zstdCpu, &zstdInfo); // NOTE(pamartis):On CPU, lookback regions for PerFrameBlockCount{RAW,RLE,CMP,All} and @@ -622,16 +606,8 @@ static void zstdgpu_Validate_GpuDecompressOnCpu(zstdgpu_ResourceDataCpu & zstdCp uint32_t prefix = 0; for (uint32_t i = 0; i < zstdFrameCount; ++i) { - uint32_t count = zstdCpu.PerFrameBlockCountRAW[i]; - zstdCpu.PerFrameBlockCountRAW[i] = prefix; - prefix += count; - } - - prefix = 0; - for (uint32_t i = 0; i < zstdFrameCount; ++i) - { - uint32_t count = zstdCpu.PerFrameBlockCountRLE[i]; - zstdCpu.PerFrameBlockCountRLE[i] = prefix; + uint32_t count = zstdCpu.PerFrameBlockCountRR[i]; + zstdCpu.PerFrameBlockCountRR[i] = prefix; prefix += count; } @@ -668,7 +644,7 @@ static void zstdgpu_Validate_GpuDecompressOnCpu(zstdgpu_ResourceDataCpu & zstdCp { zstdgpu_InitResources_SRT srt = {}; zstdgpu_Init_InitResources_SRT(srt, zstdCpu); - srt.allBlockCount = zstdRawBlockCount + zstdRleBlockCount + zstdCmpBlockCount; + srt.allBlockCount = zstdRRBlockCount + zstdCmpBlockCount; srt.cmpBlockCount = zstdCmpBlockCount; srt.frameCount = zstdFrameCount; srt.initResourcesStage = 1; // 1 means -- right before "parse compressed 
blocks" @@ -1258,7 +1234,7 @@ int WINAPI wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR lp } #endif - zstdgpu_ResourceDataCpu zstdCpu; + zstdgpu_ResourceDataCpu zstdCpu = {}; if (chkCpu) { debugPrint(L"[VALIDATION] Running GPU Decompression code on CPU ('--chk-cpu' option was set).\n"); @@ -1331,7 +1307,7 @@ int WINAPI wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR lp } if (blkCnt) { - zstdgpu_SetupFrameInfoConstants(perRequestContext, fbInfo.rawBlockCount, fbInfo.rleBlockCount, fbInfo.cmpBlockCount); + zstdgpu_SetupFrameInfoConstants(perRequestContext, fbInfo.rrBlockCount, fbInfo.cmpBlockCount); if (seqCnt) { zstdgpu_CountLiteralAndSequenceInfo blkInfo; @@ -1540,20 +1516,9 @@ int WINAPI wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR lp debugPrint(L"[FAIL] %u/%u frames failed validation.\n", failedFrameCount, fbInfo.frameCount); - const uint32_t failedRawBlockCount = zstdgpu_Test_DecompressedDataPerBlockType( - gpuData.GlobalBlockIndexPerRawBlock, - fbInfo.rawBlockCount, - gpuData.PerFrameBlockCountAll, - zstdOutFrameRefs, - fbInfo.frameCount, - gpuData.BlockSizePrefix, - ref, - tst - ); - - const uint32_t failedRleBlockCount = zstdgpu_Test_DecompressedDataPerBlockType( - gpuData.GlobalBlockIndexPerRleBlock, - fbInfo.rleBlockCount, + const uint32_t failedRRBlockCount = zstdgpu_Test_DecompressedDataPerBlockType( + gpuData.GlobalBlockIndexPerRRBlock, + fbInfo.rrBlockCount, gpuData.PerFrameBlockCountAll, zstdOutFrameRefs, fbInfo.frameCount, @@ -1573,8 +1538,8 @@ int WINAPI wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR lp tst ); - if (failedRawBlockCount > 0 || failedRleBlockCount > 0) - debugPrint(L"[FAIL] %u/%u RAW blocks and %u/%u RLE blocks failed validation. 
Likely MemCpy/MemSet pass is broken, unless ExecuteSequence stomps the memory written by MemCpu/MemSet.\n", failedRawBlockCount, fbInfo.rawBlockCount, failedRleBlockCount, fbInfo.rleBlockCount); + if (failedRRBlockCount > 0) + debugPrint(L"[FAIL] %u/%u Raw+RLE blocks failed validation. Likely MemCpy/MemSet pass is broken, unless ExecuteSequence stomps the memory written by MemCpy/MemSet.\n", failedRRBlockCount, fbInfo.rrBlockCount); if (failedCmpBlockCount > 0) debugPrint(L"[FAIL] %u/%u CMP blocks failed validation. ExecuteSequences is likely broken unless an issue happens earlier in the pipeline or unless TDR is hit.\n", failedCmpBlockCount, fbInfo.cmpBlockCount);