From 4969227114e9fa0775d65ca6ddc960d381da92a3 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Sun, 16 Nov 2025 00:23:29 -0300 Subject: [PATCH 1/4] Work on cooperative binary search --- 72_CooperativeBinarySearch/CMakeLists.txt | 24 ++ .../app_resources/binarySearch.comp.hlsl | 20 ++ .../app_resources/common.h | 19 ++ .../app_resources/present.frag.hlsl | 19 ++ .../config.json.template | 28 +++ .../include/nbl/this_example/common.hpp | 11 + 72_CooperativeBinarySearch/main.cpp | 232 ++++++++++++++++++ 72_CooperativeBinarySearch/pipeline.groovy | 50 ++++ CMakeLists.txt | 1 + 9 files changed, 404 insertions(+) create mode 100644 72_CooperativeBinarySearch/CMakeLists.txt create mode 100644 72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl create mode 100644 72_CooperativeBinarySearch/app_resources/common.h create mode 100644 72_CooperativeBinarySearch/app_resources/present.frag.hlsl create mode 100644 72_CooperativeBinarySearch/config.json.template create mode 100644 72_CooperativeBinarySearch/include/nbl/this_example/common.hpp create mode 100644 72_CooperativeBinarySearch/main.cpp create mode 100644 72_CooperativeBinarySearch/pipeline.groovy diff --git a/72_CooperativeBinarySearch/CMakeLists.txt b/72_CooperativeBinarySearch/CMakeLists.txt new file mode 100644 index 000000000..b7e52875d --- /dev/null +++ b/72_CooperativeBinarySearch/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl new file mode 100644 index 000000000..f44a35b21 --- /dev/null +++ b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl @@ -0,0 +1,20 @@ +// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#pragma wave shader_stage(compute) + +#include "common.h" +using namespace nbl::hlsl; + +[[vk::push_constant]] ConstantBuffer Constants; +[[vk::binding(0)]] StructuredBuffer Histogram; +[[vk::binding(1)]] RWStructuredBuffer Output; + +static const uint32_t GroupsharedSize = 256; + +[numthreads(256, 1, 1)] +void main(const uint3 thread : SV_DispatchThreadID, const uint3 groupThread : SV_GroupThreadID, const uint3 group : SV_GroupID) +{ + +} \ No newline at end of file diff --git a/72_CooperativeBinarySearch/app_resources/common.h b/72_CooperativeBinarySearch/app_resources/common.h new file mode 100644 index 000000000..4a3cacaa4 --- /dev/null +++ b/72_CooperativeBinarySearch/app_resources/common.h @@ -0,0 +1,19 @@ +#ifndef _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_ +#define _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_ + +#include +#include + +using namespace nbl::hlsl; +namespace nbl { +namespace hlsl { + +struct PushConstants +{ + uint32_t EntityCount; +}; + +}; +}; + +#endif // _COOPERATIVE_BINARY_SEARCH_HLSL_INCLUDED_ diff --git a/72_CooperativeBinarySearch/app_resources/present.frag.hlsl b/72_CooperativeBinarySearch/app_resources/present.frag.hlsl new file mode 100644 index 000000000..22695657c --- /dev/null +++ b/72_CooperativeBinarySearch/app_resources/present.frag.hlsl @@ -0,0 +1,19 @@ +// Copyright (C) 2024-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#pragma wave shader_stage(fragment) + +// vertex shader is provided by the fullScreenTriangle extension +#include +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +// binding 0 set 0 +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] Texture2D texture; +[[vk::combinedImageSampler]] [[vk::binding(0, 0)]] SamplerState samplerState; + +[[vk::location(0)]] float32_t4 main(SVertexAttributes vxAttr) : SV_Target0 +{ + return float32_t4(texture.Sample(samplerState, vxAttr.uv).rgb, 1.0f); +} \ No newline at end of file diff --git a/72_CooperativeBinarySearch/config.json.template b/72_CooperativeBinarySearch/config.json.template new file mode 100644 index 000000000..24adf54fb --- /dev/null +++ b/72_CooperativeBinarySearch/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp b/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp new file mode 100644 index 000000000..3745ca512 --- /dev/null +++ b/72_CooperativeBinarySearch/include/nbl/this_example/common.hpp @@ -0,0 +1,11 @@ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ + +#include "nbl/examples/examples.hpp" + +// example's own headers +#include "nbl/ui/ICursorControl.h" // TODO: why not in nabla.h ? 
+#include "nbl/ext/ImGui/ImGui.h" +#include "imgui/imgui_internal.h" + +#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ \ No newline at end of file diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp new file mode 100644 index 000000000..fda1a63c1 --- /dev/null +++ b/72_CooperativeBinarySearch/main.cpp @@ -0,0 +1,232 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/examples/examples.hpp" +#include "nbl/system/IApplicationFramework.h" +#include "app_resources/common.h" + +#include +#include +#include + + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +//using namespace glm; + +void cpu_tests(); + +class CooperativeBinarySearch final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; +public: + CooperativeBinarySearch(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + m_queue = m_device->getQueue(0, 0); + m_commandPool = m_device->createCommandPool(m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + m_commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &m_cmdbuf,1 }, smart_refctd_ptr(m_logger)); + + smart_refctd_ptr shader; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + auto assetBundle = m_assetMgr->getAsset("app_resources/binarySearch.comp.hlsl", lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load shader!"); + + auto source = IAsset::castDown(assets[0]); + // The down-cast should not fail! + assert(source); + + // this time we skip the use of the asset converter since the ICPUShader->IGPUShader path is quick and simple + shader = m_device->compileShader({ source.get() }); + if (!shader) + return logFail("Creation of a GPU Shader to from CPU Shader source failed!"); + } + + const uint32_t bindingCount = 2u; + IGPUDescriptorSetLayout::SBinding bindings[bindingCount] = {}; + bindings[0].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(0)]] StructuredBuffer Histogram; + bindings[1].type = IDescriptor::E_TYPE::ET_STORAGE_BUFFER; // [[vk::binding(1)]] RWStructuredBuffer Output; + + for(int i = 0; i < bindingCount; ++i) + { + bindings[i].stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE; + bindings[i].count = 1; + bindings[i].binding = i; + } + m_descriptorSetLayout = m_device->createDescriptorSetLayout(bindings); + { + SPushConstantRange pcRange = {}; + pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE; + pcRange.offset = 0u; + pcRange.size = 2 * sizeof(uint32_t); + auto layout = m_device->createPipelineLayout({ &pcRange,1 }, 
smart_refctd_ptr(m_descriptorSetLayout)); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + for (uint32_t i = 0; i < bindingCount; i++) + { + m_buffers[i] = m_device->createBuffer(IGPUBuffer::SCreationParams { + {.size = 500000, .usage = + IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | + IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT, + } + }); + + auto reqs = m_buffers[i]->getMemoryReqs(); + reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits(); + m_device->allocate(reqs, m_buffers[i].get()); + } + + smart_refctd_ptr descriptorPool = nullptr; + { + IDescriptorPool::SCreateInfo createInfo = {}; + createInfo.maxSets = 1; + createInfo.maxDescriptorCount[static_cast(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1; + descriptorPool = m_device->createDescriptorPool(std::move(createInfo)); + } + + m_descriptorSet = descriptorPool->createDescriptorSet(smart_refctd_ptr(m_descriptorSetLayout)); + + IGPUDescriptorSet::SDescriptorInfo descriptorInfos[bindingCount] = {}; + IGPUDescriptorSet::SWriteDescriptorSet writeDescriptorSets[bindingCount] = {}; + + for(int i = 0; i < bindingCount; ++i) + { + writeDescriptorSets[i].info = &descriptorInfos[i]; + writeDescriptorSets[i].dstSet = m_descriptorSet.get(); + writeDescriptorSets[i].binding = i; + writeDescriptorSets[i].count = bindings[i].count; + + descriptorInfos[i].desc = m_buffers[i]; + descriptorInfos[i].info.buffer.size = ~0ull; + } + + m_device->updateDescriptorSets(bindingCount, writeDescriptorSets, 0u, nullptr); + + // In contrast to fences, we just need one semaphore to rule all dispatches + return true; + } + + void onAppTerminated_impl() override + { + m_device->waitIdle(); + } + + void 
workLoopBody() override + { + cpu_tests(); + + constexpr auto StartedValue = 0; + + smart_refctd_ptr progress = m_device->createSemaphore(StartedValue); + + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t layoutBufferBarrier[1] = { { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }, + // whole buffer because we transferred the contents into it + .range = {.offset = 0,.size = m_buffers[1]->getCreationParams().size,.buffer = m_buffers[1]} + } }; + + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo depInfo = { .bufBarriers = layoutBufferBarrier }; + m_cmdbuf->pipelineBarrier(EDF_NONE, depInfo); + + + const uint32_t pushConstants[2] = { 1920, 1080 }; + const IGPUDescriptorSet* set = m_descriptorSet.get(); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set); + m_cmdbuf->dispatch(240, 135, 1u); + + layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT); + m_cmdbuf->pipelineBarrier(EDF_NONE,depInfo); + + m_cmdbuf->end(); + + { + constexpr auto FinishedValue = 69; + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + submitInfos[0].commandBuffers = cmdbufs; + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = progress.get(),.value = FinishedValue,.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; + submitInfos[0].signalSemaphores = signals; + m_api->startCapture(); + m_queue->submit(submitInfos); + m_api->endCapture(); + const 
ISemaphore::SWaitInfo waitInfos[] = { { + .semaphore = progress.get(), + .value = FinishedValue + } }; + m_device->blockForSemaphores(waitInfos); + } + + auto mem = m_buffers[1]->getBoundMemory(); + assert(mem.memory->isMappable()); + auto* ptr = mem.memory->map({ .offset = 0, .length = mem.memory->getAllocationSize() }); + printf("readback ptr %p\n", ptr); + + m_keepRunning = false; + } + + bool keepRunning() override + { + return m_keepRunning; + } + + +private: + smart_refctd_ptr m_pipeline = nullptr; + smart_refctd_ptr m_descriptorSetLayout; + smart_refctd_ptr m_descriptorSet; + + smart_refctd_ptr m_buffers[2]; + smart_refctd_ptr m_cmdbuf = nullptr; + IQueue* m_queue; + smart_refctd_ptr m_commandPool; + uint64_t m_iteration = 0; + constexpr static inline uint64_t MaxIterations = 200; + + bool m_keepRunning = true; +}; + +NBL_MAIN_FUNC(CooperativeBinarySearch) + +void cpu_tests() +{ +} diff --git a/72_CooperativeBinarySearch/pipeline.groovy b/72_CooperativeBinarySearch/pipeline.groovy new file mode 100644 index 000000000..eb20d0c5a --- /dev/null +++ b/72_CooperativeBinarySearch/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CComputeShaderPathTracerBuilder extends IBuilder +{ + public CComputeShaderPathTracerBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return 
true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CComputeShaderPathTracerBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index f8ce94f93..39f3275ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) + add_subdirectory(72_CooperativeBinarySearch) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From e54642803cd47e47adfe9a20318ca8c634c86643 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 3 Dec 2025 18:32:45 -0300 Subject: [PATCH 2/4] Patch things for cooperative binary search test --- .../app_resources/binarySearch.comp.hlsl | 103 +- 72_CooperativeBinarySearch/main.cpp | 28 +- 72_CooperativeBinarySearch/testCaseData.h | 1192 +++++++++++++++++ 3 files changed, 1316 insertions(+), 7 deletions(-) create mode 100644 72_CooperativeBinarySearch/testCaseData.h diff --git a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl index f44a35b21..05c0d8464 100644 --- a/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl +++ b/72_CooperativeBinarySearch/app_resources/binarySearch.comp.hlsl @@ -5,16 +5,115 @@ #pragma wave shader_stage(compute) #include "common.h" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl" using namespace nbl::hlsl; -[[vk::push_constant]] ConstantBuffer Constants; +[[vk::push_constant]] PushConstants Constants; [[vk::binding(0)]] StructuredBuffer Histogram; [[vk::binding(1)]] RWStructuredBuffer Output; static const uint32_t GroupsharedSize = 256; +uint getNextPowerOfTwo(uint number) { + return 2 << firstbithigh(number - 1); +} + +uint 
getLaneWithFirstBitSet(bool condition) { + uint4 ballot = WaveActiveBallot(condition); + if (all(ballot == 0)) { + return WaveGetLaneCount(); + } + return nbl::hlsl::glsl::subgroupBallotFindLSB(ballot); +} + +// findValue must be the same across the entire wave +// Could use something like WaveReadFirstLane to be fully sure +uint binarySearchLowerBoundFindValue(uint findValue, StructuredBuffer searchBuffer, uint searchBufferSize) { + uint lane = WaveGetLaneIndex(); + + uint left = 0; + uint right = searchBufferSize - 1; + + uint32_t range = getNextPowerOfTwo(right - left); + // do pivots as long as we can't coalesced load + while (range > WaveGetLaneCount()) + { + // there must be at least 1 gap between subsequent pivots + const uint32_t step = range / WaveGetLaneCount(); + const uint32_t halfStep = step >> 1; + const uint32_t pivotOffset = lane * step+halfStep; + const uint32_t pivotIndex = left + pivotOffset; + + uint4 notGreaterPivots = WaveActiveBallot(pivotIndex < right && !(findValue < searchBuffer[pivotIndex])); + uint partition = nbl::hlsl::glsl::subgroupBallotBitCount(notGreaterPivots); + // only move left if needed + if (partition != 0) + left += partition * step - halfStep; + // if we go into final half partition, the range becomes less too + range = partition != WaveGetLaneCount() ? step : halfStep; + } + + uint threadSearchIndex = left + lane; + bool laneValid = threadSearchIndex < searchBufferSize; + uint histAtIndex = laneValid ? 
searchBuffer[threadSearchIndex] : -1; + uint firstLaneGreaterThan = getLaneWithFirstBitSet(histAtIndex > findValue); + + return left + firstLaneGreaterThan - 1; +} + +groupshared uint shared_groupSearchBufferMinIndex; +groupshared uint shared_groupSearchBufferMaxIndex; +groupshared uint shared_groupSearchValues[GroupsharedSize]; + +// Binary search using the entire workgroup, making it log32 or log64 (every iteration, the possible set of +// values is divided by the number of lanes in a wave) +uint binarySearchLowerBoundCooperative(uint groupIndex, uint groupThread, StructuredBuffer searchBuffer, uint searchBufferSize) { + uint minSearchValue = groupIndex.x * GroupsharedSize; + uint maxSearchValue = ((groupIndex.x + 1) * GroupsharedSize) - 1; + + // On each workgroup, two subgroups do the search + // - One searches for the minimum, the other searches for the maximum + // - Store the minimum and maximum on groupshared memory, then do a barrier + uint wave = groupThread / WaveGetLaneCount(); + if (wave < 2) { + uint search = wave == 0 ? minSearchValue : maxSearchValue; + uint searchResult = binarySearchLowerBoundFindValue(search, searchBuffer, searchBufferSize); + if (WaveIsFirstLane()) { + if (wave == 0) shared_groupSearchBufferMinIndex = searchResult; + else shared_groupSearchBufferMaxIndex = searchResult; + } + } + GroupMemoryBarrierWithGroupSync(); + + // Since every instance has at least one triangle, we know that having workgroup values + // for each value in the range of minimum to maximum will suffice. + + // Write every value in the range to groupshared memory and barrier. 
+ uint idx = shared_groupSearchBufferMinIndex + groupThread.x; + if (idx <= shared_groupSearchBufferMaxIndex) { + shared_groupSearchValues[groupThread.x] = searchBuffer[idx]; + } + GroupMemoryBarrierWithGroupSync(); + + uint maxValueIndex = shared_groupSearchBufferMaxIndex - shared_groupSearchBufferMinIndex; + + uint searchValue = minSearchValue + groupThread; + uint currentSearchValueIndex = 0; + uint laneValue = shared_groupSearchBufferMaxIndex; + while (currentSearchValueIndex <= maxValueIndex) { + uint curValue = shared_groupSearchValues[currentSearchValueIndex]; + if (curValue > searchValue) { + laneValue = shared_groupSearchBufferMinIndex + currentSearchValueIndex - 1; + break; + } + currentSearchValueIndex ++; + } + + return laneValue; +} + [numthreads(256, 1, 1)] void main(const uint3 thread : SV_DispatchThreadID, const uint3 groupThread : SV_GroupThreadID, const uint3 group : SV_GroupID) { - + Output[thread.x] = binarySearchLowerBoundCooperative(group.x, groupThread.x, Histogram, Constants.EntityCount); } \ No newline at end of file diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp index fda1a63c1..e2611dea7 100644 --- a/72_CooperativeBinarySearch/main.cpp +++ b/72_CooperativeBinarySearch/main.cpp @@ -22,6 +22,11 @@ using namespace nbl::examples; //using namespace glm; +static constexpr uint32_t TestCaseIndices[] = { +#include "testCaseData.h" +}; + + void cpu_tests(); class CooperativeBinarySearch final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication @@ -101,14 +106,19 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp auto reqs = m_buffers[i]->getMemoryReqs(); reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits(); - m_device->allocate(reqs, m_buffers[i].get()); + + m_allocations[i] = m_device->allocate(reqs, m_buffers[i].get()); + + auto allocationType = i == 0 ? 
IDeviceMemoryAllocation::EMCAF_WRITE : IDeviceMemoryAllocation::EMCAF_READ; + auto mapResult = m_allocations[i].memory->map({ 0ull,m_allocations[i].memory->getAllocationSize() }, allocationType); + assert(mapResult); } smart_refctd_ptr descriptorPool = nullptr; { IDescriptorPool::SCreateInfo createInfo = {}; createInfo.maxSets = 1; - createInfo.maxDescriptorCount[static_cast(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 1; + createInfo.maxDescriptorCount[static_cast(IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = bindingCount; descriptorPool = m_device->createDescriptorPool(std::move(createInfo)); } @@ -130,6 +140,14 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp m_device->updateDescriptorSets(bindingCount, writeDescriptorSets, 0u, nullptr); + // Write test data to the m_buffers[0] + auto outPtr = m_allocations[0].memory->getMappedPointer(); + assert(outPtr); + memcpy( + reinterpret_cast(outPtr), + reinterpret_cast(&TestCaseIndices[0]), + sizeof(TestCaseIndices)); + // In contrast to fences, we just need one semaphore to rule all dispatches return true; } @@ -196,9 +214,8 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp m_device->blockForSemaphores(waitInfos); } - auto mem = m_buffers[1]->getBoundMemory(); - assert(mem.memory->isMappable()); - auto* ptr = mem.memory->map({ .offset = 0, .length = mem.memory->getAllocationSize() }); + auto ptr = m_allocations[1].memory->getMappedPointer(); + assert(ptr); printf("readback ptr %p\n", ptr); m_keepRunning = false; @@ -216,6 +233,7 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp smart_refctd_ptr m_descriptorSet; smart_refctd_ptr m_buffers[2]; + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocations[2] = {}; smart_refctd_ptr m_cmdbuf = nullptr; IQueue* m_queue; smart_refctd_ptr m_commandPool; diff --git a/72_CooperativeBinarySearch/testCaseData.h b/72_CooperativeBinarySearch/testCaseData.h new file mode 100644 index 
000000000..16153780e --- /dev/null +++ b/72_CooperativeBinarySearch/testCaseData.h @@ -0,0 +1,1192 @@ +0, +298, +554, +582, +912, +1074, +1076, +1078, +1170, +1188, +2140, +2414, +2736, +2738, +3980, +4800, +5898, +5900, +6936, +8106, +8152, +8650, +8844, +8930, +9504, +10244, +10826, +10828, +11126, +11430, +12206, +13764, +14010, +15302, +15624, +15656, +16414, +16494, +17368, +17432, +18312, +18948, +19376, +19818, +20146, +20604, +21240, +22446, +23482, +24914, +25042, +25538, +26764, +27564, +27566, +28472, +29450, +30202, +31474, +32160, +32676, +33792, +33794, +34704, +36540, +37456, +37950, +38364, +39274, +40442, +40518, +41412, +41590, +41950, +42022, +42714, +43464, +43790, +43792, +44876, +44878, +46188, +46572, +47352, +47650, +48242, +49856, +49858, +50506, +50968, +50970, +51152, +51154, +52870, +52884, +53332, +53334, +53904, +53964, +53966, +53968, +53970, +53972, +53974, +53976, +53978, +53980, +54514, +54516, +54518, +54520, +54762, +55866, +56462, +56478, +56480, +56482, +57510, +57568, +57570, +57572, +57846, +57848, +58760, +59408, +59438, +60198, +60200, +60202, +60204, +60284, +60938, +61274, +61720, +62296, +63116, +63378, +63380, +63382, +63384, +63386, +63388, +63904, +64572, +65142, +65144, +65146, +65554, +65738, +66052, +67016, +67424, +67566, +68270, +68272, +68610, +69240, +69870, +70988, +72622, +73258, +73260, +73580, +74524, +74880, +74958, +74960, +74962, +75114, +75116, +75622, +77144, +77798, +77800, +78314, +79566, +79568, +79570, +79572, +79850, +79852, +81576, +81684, +81686, +82492, +82494, +82496, +82498, +83990, +84860, +84988, +84990, +85138, +85772, +86120, +86122, +86564, +87402, +87404, +87602, +88676, +88714, +88780, +89560, +89732, +90786, +91128, +91130, +91272, +91522, +91804, +92588, +92590, +92834, +93268, +93736, +94448, +94704, +94706, +95074, +95076, +96706, +97040, +97770, +98000, +98676, +99968, +100074, +100318, +100602, +100914, +101020, +101872, +101878, +103078, +104246, +104266, +105436, +106332, 
+106954, +107856, +108954, +110320, +110780, +111588, +111882, +112502, +112676, +113496, +114070, +115204, +115422, +115424, +115858, +116420, +117426, +118504, +118870, +119296, +119618, +119650, +120408, +120488, +121362, +121426, +122306, +122942, +123370, +123812, +124140, +124598, +125234, +126440, +127476, +128908, +129036, +129532, +130758, +131558, +131560, +132466, +133444, +134196, +135468, +136154, +136670, +137786, +137788, +138698, +140534, +140832, +141608, +142422, +143220, +143468, +143714, +144504, +145078, +145670, +146224, +146874, +147726, +148692, +149536, +151032, +151126, +153382, +154128, +155190, +155212, +156324, +156484, +156526, +157026, +158242, +158446, +158448, +158594, +159256, +160350, +160444, +161040, +161624, +162418, +162524, +162768, +163052, +163364, +163470, +164322, +164328, +165528, +166696, +166716, +167886, +168782, +169404, +170306, +171404, +172770, +173230, +174038, +174332, +174952, +175126, +175946, +176520, +177654, +177872, +177874, +178308, +178870, +179876, +180954, +181320, +181746, +182160, +183070, +184238, +184314, +185208, +185386, +185746, +185818, +186510, +187260, +187586, +187588, +188672, +188674, +189984, +190368, +191148, +191446, +192038, +193652, +193654, +194302, +194764, +194766, +194948, +194950, +196666, +196680, +197128, +197130, +197700, +198048, +198824, +199638, +200436, +200684, +200930, +201720, +202294, +202886, +203440, +204090, +204942, +205908, +206752, +208248, +208342, +210598, +211344, +212406, +212428, +213540, +213700, +213742, +214242, +215458, +215662, +215664, +215810, +216472, +217566, +217660, +218256, +218316, +218318, +218320, +218322, +218324, +218326, +218328, +218330, +218332, +218866, +218868, +218870, +218872, +219114, +220218, +220814, +220830, +220832, +220834, +221862, +221920, +221922, +221924, +222198, +222200, +223112, +223760, +223790, +224550, +224552, +224554, +224556, +225140, +225794, +226130, +226576, +227152, +227972, +228234, +228236, +228238, +228240, 
+228242, +228244, +228760, +229428, +229998, +230000, +230002, +230410, +230594, +230908, +231872, +232280, +232422, +233126, +233128, +233466, +234096, +234726, +235844, +237478, +238114, +238116, +238512, +239256, +239812, +240660, +241950, +243244, +243366, +244346, +244412, +244710, +245202, +246504, +246728, +246988, +247592, +248630, +249562, +250962, +251964, +252562, +253140, +253412, +254672, +255276, +256084, +256160, +256378, +257104, +257602, +257776, +258240, +258556, +258614, +259208, +260496, +261202, +261398, +262284, +262610, +262976, +263578, +264622, +265558, +266692, +266756, +268110, +268994, +269158, +269718, +270388, +270768, +271098, +271786, +272398, +272996, +273140, +273612, +274226, +274660, +275070, +275416, +275634, +275680, +276088, +276408, +276410, +276852, +277690, +277692, +277890, +278964, +279002, +279068, +279848, +280020, +281074, +281416, +281418, +281560, +281810, +282092, +282876, +282878, +283122, +283556, +284024, +284736, +284992, +284994, +285362, +285364, +286994, +287328, +288058, +288288, +288964, +289708, +289746, +290266, +291136, +292152, +292740, +292834, +293708, +293768, +293936, +294846, +295028, +295040, +295130, +295372, +296154, +296736, +297250, +297606, +298068, +298310, +299420, +300362, +301176, +301502, +301878, +302702, +303576, +303896, +305170, +305928, +306070, +306150, +307094, +307450, +307528, +307530, +307532, +307684, +307686, +308192, +309714, +310368, +310370, +310884, +312136, +312138, +312140, +312142, +312420, +312422, +314146, +314254, +314256, +315062, +315064, +315066, +315068, +316560, +317430, +317558, +317560, +317708, +318342, +319182, +319992, +320612, +320956, +321068, +321076, +322784, +322914, +323106, +324036, +324708, +326092, +326994, +327332, +328080, +328444, +329022, +329256, +330454, +331304, +331610, +332432, +332440, +333298, +334300, +334478, +334622, +335370, +335818, +336456, +336618, +337930, +338932, +339158, +339258, +339746, +340226, +340254, +340256, +340988, 
+341638, +342674, +343168, +343440, +344024, +344026, +344106, +345118, +346124, +347350, +348560, +348878, +349066, +350192, +350840, +351388, +353610, +354562, +355208, +356084, +356966, +358222, +359304, +359470, +360054, +360710, +360920, +361896, +362930, +362962, +363128, +363234, +363272, +363284, +363456, +363732, +364418, +364926, +365096, +365170, +365920, +366796, +367838, +368232, +368940, +369508, +369530, +370886, +371156, +371348, +372384, +372680, +372690, +373252, +373676, +374168, +374424, +374452, +374782, +374944, +374946, +374948, +375040, +375058, +376010, +376284, +376606, +376608, +377850, +378670, +379768, +379770, +380806, +381976, +382022, +382520, +382714, +382800, +383374, +384114, +384696, +384698, +384996, +385300, +386076, +387634, +387880, +388796, +389290, +389302, +389314, +389338, +389406, +389434, +389470, +389840, +389952, +390908, +391076, +391188, +392118, +392458, +392472, +392622, +392766, +393448, +394586, +394816, +394824, +395486, +396218, +396880, +396910, +397066, +397076, +397124, +397678, +398050, +399160, +400080, +401696, +401762, +402400, +402500, +402512, +403152, +404038, +404444, +404648, +404740, +405322, +406252, +407076, +408252, +408634, +409354, +410112, +411138, +411672, +411880, +412232, +412926, +412956, +413864, +414624, +415770, +415978, +417234, +417256, +417264, +418562, +418812, +418824, +418836, +418860, +418928, +418956, +418992, +419362, +419474, +420430, +420598, +420710, +421640, +421980, +421994, +422144, +422288, +422970, +424108, +424338, +424346, +425008, +425740, +426402, +426432, +426588, +426598, +426646, +427200, +427572, +428682, +429602, +430346, +430412, +431050, +431150, +431162, +431802, +432688, +433094, +433298, +433390, +433972, +434902, +435726, +436902, +437284, +438004, +438762, +439788, +440322, +440530, +440882, +441576, +441606, +442514, +443274, +444420, +444628, +445884, +445906, +445914, +447212, +447462, +448464, +448690, +448790, +449278, +449758, +449786, +449788, 
+450520, +451170, +452206, +452700, +452972, +453556, +453558, +453638, +454650, +455656, +456882, +458092, +458410, +458598, +459724, +460372, +460920, +463142, +464094, +464740, +465616, +466498, +467754, +468836, +469002, +469586, +470180, +471468, +472174, +472370, +473256, +473582, +473948, +474550, +475594, +476530, +477664, +477728, +479082, +479966, +480130, +480690, +481360, +481740, +482070, +482758, +483370, +483968, +484112, +484584, +485198, +485632, +486042, +486388, +486606, +486652, +487060, +488676, +489420, +489976, +490824, +492114, +493408, +493530, +494510, +494576, +494874, +495366, +496668, +496892, +497152, +497756, +498794, +499726, +501126, +502128, +502726, +503304, +503576, +504836, +505440, +506248, +506324, +506542, +507268, +507766, +507940, +508404, +508720, +509514, +510170, +510380, +511356, +512390, +512422, +512588, +512694, +512732, +512744, +512916, +513192, +513878, +514386, +514556, +514630, +515380, +516256, +517298, +517692, +518400, +518968, +518990, +520346, +520616, +520808, +521844, +522140, +522150, +522712, +523136, +523628, +524468, +525278, +525898, +526242, +526354, +526362, +528070, +528200, +528392, +529322, +529994, +531378, +532280, +532618, +533366, +533730, +534308, +534542, +535740, +536590, +536896, +537718, +537726, +538584, +539586, +539764, +539908, +540656, +541104, +541742, +541904, +543216, +543612, +543650, +544170, +545040, +546056, +546644, +546738, +547612, +547672, +547840, +548750, +548932, +548944, +549034, +549276, +550058, +550640, +551154, +551510, +551972, +552214, +553324, +554266, +555080, +555406, +555782, +556606, +557480, +557800, +559074, +559832, +559974, +550468, +551276, +552568, +552866, +553798, +554120, +554294, +555554, +556448, +556874, +557328, +557680, +558532, +559844, +560774, +561050, +561458, +562684, +563910, +564026, +564542, +565294, +565434, +566278, +567580, +568006, +568328, +569626, +570350, +570998, +572812, +573008, +573500, +573828, +573840, +573842, +574798, 
+576066, +576774, +577182, +577184, +577522, +577524, +578734, +579854, +579856, +581128, +581278, +582296, +583496, +583944, +584160, +584844, +584954, +584968, +585486, +586592, +586594, +587158, +587320, +588006, +589012, +590302, +590366, +590444, +590944, +581786, +582234, +582920, +582922, +564780, +565486, +565684, +566570, +566896, +567262, +567864, +568958, +570268, +570844, +572014, +573368, +574252, +574416, +574976, +575646, +576026, +576356, +577044, +577046, +577644, +577788, +578260, +578874, +579308, +579718, +580288, +580942, +581534, +581536, +576350, +576352 \ No newline at end of file From 5886b3024d761b087232da0e52aef4877481ef36 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 3 Dec 2025 20:51:18 -0300 Subject: [PATCH 3/4] Fix test --- 72_CooperativeBinarySearch/main.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/72_CooperativeBinarySearch/main.cpp b/72_CooperativeBinarySearch/main.cpp index e2611dea7..828adf34f 100644 --- a/72_CooperativeBinarySearch/main.cpp +++ b/72_CooperativeBinarySearch/main.cpp @@ -85,7 +85,7 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp SPushConstantRange pcRange = {}; pcRange.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE; pcRange.offset = 0u; - pcRange.size = 2 * sizeof(uint32_t); + pcRange.size = sizeof(nbl::hlsl::PushConstants); auto layout = m_device->createPipelineLayout({ &pcRange,1 }, smart_refctd_ptr(m_descriptorSetLayout)); IGPUComputePipeline::SCreationParams params = {}; params.layout = layout.get(); @@ -186,11 +186,18 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp m_cmdbuf->pipelineBarrier(EDF_NONE, depInfo); - const uint32_t pushConstants[2] = { 1920, 1080 }; const IGPUDescriptorSet* set = m_descriptorSet.get(); + const uint32_t numIndices = sizeof(TestCaseIndices) / sizeof(TestCaseIndices[0]); + const uint32_t lastValue = TestCaseIndices[numIndices - 1]; + const uint32_t 
totalValues = lastValue + 100; + nbl::hlsl::PushConstants coopBinarySearchPC = { + .EntityCount = numIndices, + }; + m_cmdbuf->bindComputePipeline(m_pipeline.get()); m_cmdbuf->bindDescriptorSets(EPBP_COMPUTE, m_pipeline->getLayout(), 0u, 1u, &set); - m_cmdbuf->dispatch(240, 135, 1u); + m_cmdbuf->pushConstants(m_pipeline->getLayout(), nbl::hlsl::ShaderStage::ESS_COMPUTE, 0u, sizeof(nbl::hlsl::PushConstants), &coopBinarySearchPC); + m_cmdbuf->dispatch((totalValues + 255u) / 256u, 1u, 1u); layoutBufferBarrier[0].barrier.dep = layoutBufferBarrier[0].barrier.dep.nextBarrier(PIPELINE_STAGE_FLAGS::COPY_BIT,ACCESS_FLAGS::TRANSFER_READ_BIT); m_cmdbuf->pipelineBarrier(EDF_NONE,depInfo); @@ -216,7 +223,14 @@ class CooperativeBinarySearch final : public application_templates::MonoDeviceAp auto ptr = m_allocations[1].memory->getMappedPointer(); assert(ptr); - printf("readback ptr %p\n", ptr); + + uint32_t* valuesPtr = reinterpret_cast(ptr); + for (uint32_t i = 0; i < totalValues; i++) { + uint32_t value = valuesPtr[i]; + const uint32_t* binarySearchResult = std::upper_bound(TestCaseIndices, TestCaseIndices + numIndices, i); + uint32_t lowerBoundIndex = std::distance(TestCaseIndices, binarySearchResult) - 1; + assert(value == lowerBoundIndex); + } m_keepRunning = false; } From 795066393d9b7918991800b4dda5b482cc9085b3 Mon Sep 17 00:00:00 2001 From: deprilula28 Date: Wed, 3 Dec 2025 20:58:08 -0300 Subject: [PATCH 4/4] Remove unecessary leftover file --- 72_CooperativeBinarySearch/pipeline.groovy | 50 ---------------------- 1 file changed, 50 deletions(-) delete mode 100644 72_CooperativeBinarySearch/pipeline.groovy diff --git a/72_CooperativeBinarySearch/pipeline.groovy b/72_CooperativeBinarySearch/pipeline.groovy deleted file mode 100644 index eb20d0c5a..000000000 --- a/72_CooperativeBinarySearch/pipeline.groovy +++ /dev/null @@ -1,50 +0,0 @@ -import org.DevshGraphicsProgramming.Agent -import org.DevshGraphicsProgramming.BuilderInfo -import org.DevshGraphicsProgramming.IBuilder - 
-class CComputeShaderPathTracerBuilder extends IBuilder -{ - public CComputeShaderPathTracerBuilder(Agent _agent, _info) - { - super(_agent, _info) - } - - @Override - public boolean prepare(Map axisMapping) - { - return true - } - - @Override - public boolean build(Map axisMapping) - { - IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") - IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") - - def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) - def nameOfConfig = getNameOfConfig(config) - - agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") - - return true - } - - @Override - public boolean test(Map axisMapping) - { - return true - } - - @Override - public boolean install(Map axisMapping) - { - return true - } -} - -def create(Agent _agent, _info) -{ - return new CComputeShaderPathTracerBuilder(_agent, _info) -} - -return this