From 2ecf4c28f495248fe1ec09526caed5e5b757045d Mon Sep 17 00:00:00 2001 From: Kepontry Date: Fri, 7 Nov 2025 15:50:51 +0800 Subject: [PATCH 01/13] Enable prefetching of SW kernel instructions after the first SW task --- .../add_sw_kernel_instruction_prefetch.cpp | 247 ++++++++++++++++-- 1 file changed, 230 insertions(+), 17 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index b432ab7ab0..6eed851c77 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,8 @@ using namespace vpux; namespace { -static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"}; +// static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"}; +static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; // // AddSwKernelInstructionPrefetch @@ -66,12 +67,12 @@ class AddSwKernelInstructionPrefetch final : size_t clusterIdx, std::string& kernelName, mlir::SymbolRefAttr functionSymbol); - VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier, + VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName); mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier); std::pair getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp); - using SwKernelPrefetchVec = std::vector>; + using SwKernelPrefetchVec = std::vector>; std::pair getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp, VPURT::TaskConfigVec& allTasks); std::tuple getFirstSwTaskInIRAndBestUpdateBarrier( @@ -79,6 +80,8 @@ class AddSwKernelInstructionPrefetch final : std::vector insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier); + std::vector insertPrefetchTasksDuringExec(mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks); bool hasVPUSWModule(mlir::Operation* funcOp); size_t getOffsetReservedMem(const mlir::ModuleOp module); @@ -94,6 +97,7 @@ class AddSwKernelInstructionPrefetch final : bool _minFreeCyclesHasValue = false; size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; + size_t _dynamicPrefetchTileCounter = 0; }; bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) { @@ -187,20 +191,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, - mlir::Value updateBarrier, + mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) { mlir::OpBuilder builder(firstSwTask); - auto moduleOp = firstSwTask->getParentOfType(); + auto kernelOp = kernelNameToOps[kernelName]; + auto moduleOp = kernelOp->getParentOfType(); auto reservedMemOffset = getOffsetReservedMem(moduleOp); auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset); - auto kernelOp = kernelNameToOps[kernelName]; + auto tileIndexAttr = kernelOp.getTileIndexAttr(); + VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc()); + const int64_t tileIndex = static_cast(clusterIdx); auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector& buffers) { if (auto bufOp = io.getDefiningOp()) { - auto newType = mlir::cast(io.getType()).changeShape({1, 1, 1, 1}); + auto origType = mlir::cast(io.getType()); + auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex); + auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex}); + auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr); auto newBuff = builder.create(appendLoc(bufOp->getLoc(), suffix), newType, - bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(), + bufOp.getSectionAttr(), newSectionIndexAttr, offsetAttr, bufOp.getSwizzlingKeyAttr()); buffers.push_back(newBuff); return true; @@ -230,14 +240,17 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp( builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers), - mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(), + mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex), kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr()); // The dummy kernels here are generated after ActShaveProfilingPass, // so we need to add skipProfiling as attribute to avoid capturing their metadata cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext())); auto args = - (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; + (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" + || kernelName == "activation_sin" || kernelName == "eltwise_equal" + || kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; + vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args, _log.nest(), /*swKernelRunOp=*/nullptr); @@ -316,7 +329,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat } if (!cache.isLoaded(kernelName)) { - kernelsToPrefetch.push_back(std::move(kernelNameAndSize)); + kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex)); } cache.loadKernel(kernelName, kernelSize); @@ -394,7 +407,7 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size()); shaveIdx++) { auto clusterIdx = shaveIdx / noOfShavesPerCluster; - auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx]; + auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx]; _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx); auto newPrefetchKernel = _useDummyKernelForInstructionPrefetch @@ -410,6 +423,200 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas return prefetchedKernels; } +uint64_t findNextSaturationStart(size_t startIndex, + vpux::VPURT::TaskConfigVec& allTasks, + size_t numClusters, + std::map& swKernelCountsCache) { + + // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) + const size_t saturationThreshold = numClusters * 2; + + // Iterate through tasks strictly AFTER the startIndex + for (size_t i = startIndex + 1; i < allTasks.size(); ++i) { + uint64_t currentStartTime = static_cast(allTasks[i].cycleStart); + + if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) { + size_t swKernelCount = 0; + // Count all SW Kernels that start at this specific time + for (auto& task : allTasks) { + if (static_cast(task.cycleStart) == currentStartTime) { + if (mlir::isa(task.taskOp.getInnerTaskOp())) { + swKernelCount++; + } + } + if (static_cast(task.cycleStart) > currentStartTime) { + break; + } + } + swKernelCountsCache[currentStartTime] = swKernelCount; + } + + if (swKernelCountsCache[currentStartTime] >= saturationThreshold) { + return currentStartTime; + } + } + + return std::numeric_limits::max(); +} + +struct GapCandidate { + uint64_t lookaheadGap = 0; + int64_t insertionPointTaskIndex = -1; + + // used for sort + bool operator>(const GapCandidate& other) const { + return lookaheadGap > other.lookaheadGap; + } +}; + +size_t getSwKernelCountAtTime(uint64_t startTime, + VPURT::TaskConfigVec& allTasks) { + size_t count = 0; + for (auto& taskConfig : allTasks) { + if (static_cast(taskConfig.cycleStart) == startTime) { + if (mlir::isa(taskConfig.taskOp.getInnerTaskOp())) { + count++; + } + } + if (static_cast(taskConfig.cycleStart) > startTime) { + break; + } + } + return count; +} + +std::optional findBestInsertionGap( + const std::string& kernelName, + uint64_t targetKernelGroupStartTime, + VPURT::TaskConfigVec& allTasks, + size_t numClusters, + Logger& log) { + + const int64_t targetInsertTile = 3; + const uint64_t GAP_THRESHOLD = 50000; + const size_t saturationThreshold = numClusters * 2; + + // + std::map> validGaps; + std::map swKernelCountsCache; // local cache + + int64_t previousT3TaskIndex = -1; + uint64_t previousT3TaskEndTime = 0; + + // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched + for (size_t i = 0; i < allTasks.size(); ++i) { + auto& currentTaskConfig = allTasks[i]; + uint64_t currentTaskStartTime = static_cast(currentTaskConfig.cycleStart); + if (currentTaskStartTime > targetKernelGroupStartTime) { + break; + } + + bool isT3Task = false; + if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp())) { + isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); + } + + if (previousT3TaskIndex != -1 && isT3Task) { + + auto& insertionPointTask = allTasks[previousT3TaskIndex]; + uint64_t insertionPointStartTime = static_cast(insertionPointTask.cycleStart); + + size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); + + if (simultaneousSwKernels < saturationThreshold) { + uint64_t nextSaturationStart = findNextSaturationStart(previousT3TaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); + uint64_t lookaheadGap = 0; + if (gapEnd > previousT3TaskEndTime) { + lookaheadGap = gapEnd - previousT3TaskEndTime; + } + + if (lookaheadGap >= GAP_THRESHOLD) { + GapCandidate gap; + gap.lookaheadGap = lookaheadGap; + gap.insertionPointTaskIndex = previousT3TaskIndex; + validGaps[lookaheadGap] = gap; + } + } + } + + if (isT3Task) { + previousT3TaskIndex = static_cast(i); + previousT3TaskEndTime = currentTaskStartTime + static_cast(allTasks[i].cycleCost); + } + } + + if (validGaps.empty()) { + log.trace("Kernel '{0}': No suitable insertion point found.", kernelName); + return std::nullopt; + } + + return validGaps.begin()->second; +} + +std::vector AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec( + mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks) { + + auto moduleOp = funcOp->getParentOfType(); + const auto numClusters = getNumTiles(moduleOp); + VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero."); + + std::vector prefetchedKernels{}; + + for (auto& kernelInfo : kernelsToPrefetch) { + std::string kernelName = std::get<0>(kernelInfo); + size_t firstAppearanceIndex = std::get<2>(kernelInfo); + + if (firstAppearanceIndex >= allTasks.size()) { + _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); + continue; + } + if (kernelNameToOps.count(kernelName) == 0) { + _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); + continue; + } + + uint64_t targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); + + auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, + allTasks, numClusters, _log); + + if (!bestGapOpt.has_value()) { + _log.trace("Kernel '{0}': No valid gap found.", kernelName); + continue; + } + + GapCandidate bestGap = bestGapOpt.value(); + _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", + kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); + std::cout << "[Prefetch DEBUG] Kernel: " << kernelName + << " Found best gap of " << bestGap.lookaheadGap + << " cycles. Inserting relative to task " << bestGap.insertionPointTaskIndex << std::endl; + + if (bestGap.insertionPointTaskIndex < 0 || static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { + _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", + kernelName, bestGap.insertionPointTaskIndex); + continue; + } + + auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp; + size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters; + _dynamicPrefetchTileCounter++; + + auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask( + insertBeforeOp, + mlir::ValueRange(), + dynamicExecTile, + kernelName + ); + + prefetchedKernels.push_back(newPrefetchKernel); + } + + return prefetchedKernels; +} + void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto funcOp = getOperation(); if (!hasVPUSWModule(funcOp)) { @@ -444,10 +651,6 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks); auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] = getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex); - if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) { - return; - } - _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); if (_useDummyKernelForInstructionPrefetch) { auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN)); @@ -455,7 +658,17 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { VPUX_THROW_WHEN(dummyKernelResMem == nullptr, "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!"); } - auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + if (kernelsToPrefetch.empty()) { + return; + } + _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); + + std::vector newPrefetchKernels; + if (firstShaveTaskInIR == nullptr){ + newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks); + } else { + newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + } // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); From 681035a779ce7a4438f6dd993b612c44598c273f Mon Sep 17 00:00:00 2001 From: Kepontry Date: Sat, 8 Nov 2025 01:04:17 +0800 Subject: [PATCH 02/13] style: Code cleanup and formatting --- .../add_sw_kernel_instruction_prefetch.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 6eed851c77..6ed5289fec 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,6 @@ using namespace vpux; namespace { -// static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"}; static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; // @@ -512,14 +511,14 @@ std::optional findBestInsertionGap( } bool isT3Task = false; - if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp())) { + if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); } if (previousT3TaskIndex != -1 && isT3Task) { auto& insertionPointTask = allTasks[previousT3TaskIndex]; - uint64_t insertionPointStartTime = static_cast(insertionPointTask.cycleStart); + auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); @@ -577,7 +576,7 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas continue; } - uint64_t targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); + auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log); @@ -590,9 +589,6 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas GapCandidate bestGap = bestGapOpt.value(); _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); - std::cout << "[Prefetch DEBUG] Kernel: " << kernelName - << " Found best gap of " << bestGap.lookaheadGap - << " cycles. Inserting relative to task " << bestGap.insertionPointTaskIndex << std::endl; if (bestGap.insertionPointTaskIndex < 0 || static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", @@ -663,12 +659,9 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { } _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); - std::vector newPrefetchKernels; - if (firstShaveTaskInIR == nullptr){ - newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks); - } else { - newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); - } + auto newPrefetchKernels = (firstShaveTaskInIR == nullptr) + ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) + : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); From 8f39a2789324bd32a765055f3b67c0435da8a363 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Thu, 20 Nov 2025 00:08:36 +0800 Subject: [PATCH 03/13] func: Add test case, change t3 to t1 --- .../add_sw_kernel_instruction_prefetch.cpp | 30 +-- ...struction_prefetch_mid_execution_40XX.mlir | 180 ++++++++++++++++++ 2 files changed, 195 insertions(+), 15 deletions(-) create mode 100644 tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 6ed5289fec..4953a690f5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,7 @@ using namespace vpux; namespace { -static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; +static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; // // AddSwKernelInstructionPrefetch @@ -491,7 +491,7 @@ std::optional findBestInsertionGap( size_t numClusters, Logger& log) { - const int64_t targetInsertTile = 3; + const int64_t targetInsertTile = 1; const uint64_t GAP_THRESHOLD = 50000; const size_t saturationThreshold = numClusters * 2; @@ -499,8 +499,8 @@ std::optional findBestInsertionGap( std::map> validGaps; std::map swKernelCountsCache; // local cache - int64_t previousT3TaskIndex = -1; - uint64_t previousT3TaskEndTime = 0; + int64_t previousT1TaskIndex = -1; + uint64_t previousT1TaskStartTime = 0; // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched for (size_t i = 0; i < allTasks.size(); ++i) { @@ -510,38 +510,38 @@ std::optional findBestInsertionGap( break; } - bool isT3Task = false; + bool isT1Task = false; if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { - isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); + isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); } - if (previousT3TaskIndex != -1 && isT3Task) { + if (previousT1TaskIndex != -1 && isT1Task) { - auto& insertionPointTask = allTasks[previousT3TaskIndex]; + auto& insertionPointTask = allTasks[previousT1TaskIndex]; auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); if (simultaneousSwKernels < saturationThreshold) { - uint64_t nextSaturationStart = findNextSaturationStart(previousT3TaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t nextSaturationStart = findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); uint64_t lookaheadGap = 0; - if (gapEnd > previousT3TaskEndTime) { - lookaheadGap = gapEnd - previousT3TaskEndTime; + if (gapEnd > previousT1TaskStartTime) { + lookaheadGap = gapEnd - previousT1TaskStartTime; } if (lookaheadGap >= GAP_THRESHOLD) { GapCandidate gap; gap.lookaheadGap = lookaheadGap; - gap.insertionPointTaskIndex = previousT3TaskIndex; + gap.insertionPointTaskIndex = previousT1TaskIndex; validGaps[lookaheadGap] = gap; } } } - if (isT3Task) { - previousT3TaskIndex = static_cast(i); - previousT3TaskEndTime = currentTaskStartTime + static_cast(allTasks[i].cycleCost); + if (isT1Task) { + previousT1TaskIndex = static_cast(i); + previousT1TaskStartTime = currentTaskStartTime; } } diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir new file mode 100644 index 0000000000..2e85f9a246 --- /dev/null +++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir @@ -0,0 +1,180 @@ +// +// Copyright (C) 2024-2025 Intel Corporation. +// SPDX-License-Identifier: Apache-2.0 +// + +// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true enable-sw-kernel-fifo-per-shave-engine=false" --add-sw-kernel-instruction-prefetch %s | FileCheck %s +// REQUIRES: arch-NPU40XX + +!DummyDDRT = memref<32000x1x1x1xf16, @DDR> +!DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]> +!DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]> +!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]> +!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]> + +// This test checks following schedule +// Barriers : 0 1 2 3 4 5 +// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax] | [ TopK ] | [ DMA ] | [ Softmax ] +// Cluster 1: | [ DMA ] | [ Softmax] | [ TopK ] +// Other : [ SyncDMA ] | +// + +module @subgraph attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096] + module @VPU.SW { + func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} + func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE} + func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} + } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + config.ExecutorResource 1 of @M2I + config.ExecutorResource 1 of @DMA_NN + config.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64} + net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : { + DataInfo "data" : tensor<1x3x62x62xui8> + } outputsInfo : { + DataInfo "out" : tensor<1x3x62x62xui8> + } + func.func @main(%arg0: memref<1x3x62x62xui8, @DDR>) -> memref<1x3x62x62xui8, @DDR> { + %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + // CHECK: [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + // CHECK: [[BARRIER_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier + + %28 = VPURT.DeclareBuffer <0> -> memref<0x0x0x0xi32, @DDR> + %ddr_buf = VPURT.DeclareBuffer <0> -> !DummyDDRT + %cmx_0 = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0T + %cmx_1 = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1T + + VPURT.Task updates(%0 : !VPURT.Barrier) { + %241 = VPUIP.SyncDMA {port = 0 : i64} inputs(%28 : memref<0x0x0x0xi32, @DDR>) outputs(%28 : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR> + } + + VPURT.Task waits(%0: !VPURT.Barrier) updates(%1 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%1: !VPURT.Barrier) updates(%2 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 1 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_1 : !DummyCMX1T) -> !DummyCMX1T + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T) on tile 1 -> !DummyCMX1T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1T + } + } + + %cmx0_top_k = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, !DummyCMX0TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK + } + } + + %cmx1_top_k = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1TopK + VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { + %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) { + VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK + } + } + + VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) { + %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T + } + + VPURT.Task waits(%7: !VPURT.Barrier) { + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{ + VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T + } + } + + // CHECK: VPURT.Task updates([[BARRIER_0]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.SyncDMA + + // CHECK: VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_2]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_2]] : !VPURT.Barrier) updates([[BARRIER_3]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + // CHECK: VPURT.Task { + // CHECK-NEXT: VPUIP.SW.Kernel + // CHECK-SAME: skipProfiling + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_TopK + + // CHECK: VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK-NEXT: VPUIP.NNDMA + + // CHECK: VPURT.Task waits([[BARRIER_7]] : !VPURT.Barrier) { + // CHECK: VPUIP.SW.Kernel + // CHECK-SAME: @VPU.SW::@builtin_SoftMax + + return %arg0 : memref<1x3x62x62xui8, @DDR> + } +} From bf5cab7c911f6e8e32cc6ef210c009a332a172f5 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Tue, 9 Dec 2025 15:03:51 +0800 Subject: [PATCH 04/13] Add instpf memory to the config of the 4 failed tests --- .../VPUIP/pipelines/default_hw_mode_40XX.mlir | 24 +++++++++++++ .../default_hw_mode_repeating_blocks.mlir | 36 +++++++++++++++++++ ...t_hw_mode_schedule_trace_enabled_40XX.mlir | 12 +++++++ ..._mode_vertical_fusion_outlining_40XX+.mlir | 11 ++++++ 4 files changed, 83 insertions(+) diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir index a2ae982802..8884adf385 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir @@ -20,6 +20,18 @@ // CHECK-LABEL: @SoftMax module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} @@ -166,6 +178,18 @@ module @SoftMax attributes {config.arch = #config.arch_kind, config.com // CHECK-LABEL: @TwoFunctions module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + // CHECK-DAG: {{ }}config.Resources VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir index 6dd21b5f43..ca02746827 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir @@ -9,6 +9,18 @@ !MemRef = memref<1x3x62x62xf16> module @ChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x3x62x62xf16> } outputsInfo : { @@ -61,6 +73,18 @@ module @ChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsChainCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { @@ -146,6 +170,18 @@ module @SwKernelsChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsIndependentCalls { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x1x2x64xf16> } outputsInfo : { diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir index 5406c523ac..b3bf4c898b 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir @@ -9,6 +9,18 @@ // CHECK-LABEL: @Gather module @Gather attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } + VPURT.SW.Runtime entryPoint: @VPU.SW::@runtime stack_configuration: [4096, 4096, 4096, 4096] diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir index 02e4a016c5..61918ed50d 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir @@ -14,6 +14,17 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} } + config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + builtin.module @ReservedMemory { + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + } + } + config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware + config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} + config.ExecutorResource 2 of @SHAVE_ACT + config.ExecutorResource 1 of @DPU + } net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}> } outputsInfo : { From ff578129b46e7a30bf8759b3e16ece240f3912f4 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Tue, 9 Dec 2025 15:06:26 +0800 Subject: [PATCH 05/13] Fix clang format check --- .../add_sw_kernel_instruction_prefetch.cpp | 109 ++++++++---------- 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 4953a690f5..fea72079e5 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,7 +31,9 @@ using namespace vpux; namespace { -static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; +static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { + "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", + "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; // // AddSwKernelInstructionPrefetch @@ -66,8 +68,9 @@ class AddSwKernelInstructionPrefetch final : size_t clusterIdx, std::string& kernelName, mlir::SymbolRefAttr functionSymbol); - VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, - size_t clusterIdx, std::string& kernelName); + VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, + mlir::ValueRange updateBarrier, size_t clusterIdx, + std::string& kernelName); mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier); std::pair getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp); @@ -79,8 +82,9 @@ class AddSwKernelInstructionPrefetch final : std::vector insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier); - std::vector insertPrefetchTasksDuringExec(mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, - VPURT::TaskConfigVec& allTasks); + std::vector insertPrefetchTasksDuringExec( + mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks); bool hasVPUSWModule(mlir::Operation* funcOp); size_t getOffsetReservedMem(const mlir::ModuleOp module); @@ -189,10 +193,8 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer } // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel -VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, - mlir::ValueRange updateBarrier, - size_t clusterIdx, - std::string& kernelName) { +VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask( + mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) { mlir::OpBuilder builder(firstSwTask); auto kernelOp = kernelNameToOps[kernelName]; auto moduleOp = kernelOp->getParentOfType(); @@ -205,7 +207,8 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector& buffers) { if (auto bufOp = io.getDefiningOp()) { auto origType = mlir::cast(io.getType()); - auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex); + auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), + stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex); auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex}); auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr); auto newBuff = builder.create(appendLoc(bufOp->getLoc(), suffix), newType, @@ -245,10 +248,11 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst // so we need to add skipProfiling as attribute to avoid capturing their metadata cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext())); - auto args = - (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" - || kernelName == "activation_sin" || kernelName == "eltwise_equal" - || kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; + auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" || + kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" || + kernelName == "rms_norm") + ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) + : kernelNameToArgs[kernelName]; vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args, _log.nest(), /*swKernelRunOp=*/nullptr); @@ -422,11 +426,8 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas return prefetchedKernels; } -uint64_t findNextSaturationStart(size_t startIndex, - vpux::VPURT::TaskConfigVec& allTasks, - size_t numClusters, +uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters, std::map& swKernelCountsCache) { - // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) const size_t saturationThreshold = numClusters * 2; @@ -468,8 +469,7 @@ struct GapCandidate { } }; -size_t getSwKernelCountAtTime(uint64_t startTime, - VPURT::TaskConfigVec& allTasks) { +size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) { size_t count = 0; for (auto& taskConfig : allTasks) { if (static_cast(taskConfig.cycleStart) == startTime) { @@ -478,26 +478,21 @@ size_t getSwKernelCountAtTime(uint64_t startTime, } } if (static_cast(taskConfig.cycleStart) > startTime) { - break; + break; } } return count; } -std::optional findBestInsertionGap( - const std::string& kernelName, - uint64_t targetKernelGroupStartTime, - VPURT::TaskConfigVec& allTasks, - size_t numClusters, - Logger& log) { - +std::optional findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime, + VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) { const int64_t targetInsertTile = 1; const uint64_t GAP_THRESHOLD = 50000; const size_t saturationThreshold = numClusters * 2; // std::map> validGaps; - std::map swKernelCountsCache; // local cache + std::map swKernelCountsCache; // local cache int64_t previousT1TaskIndex = -1; uint64_t previousT1TaskStartTime = 0; @@ -516,14 +511,14 @@ std::optional findBestInsertionGap( } if (previousT1TaskIndex != -1 && isT1Task) { - auto& insertionPointTask = allTasks[previousT1TaskIndex]; auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); - + if (simultaneousSwKernels < saturationThreshold) { - uint64_t nextSaturationStart = findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t nextSaturationStart = + findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); uint64_t lookaheadGap = 0; if (gapEnd > previousT1TaskStartTime) { @@ -554,58 +549,53 @@ std::optional findBestInsertionGap( } std::vector AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec( - mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, - VPURT::TaskConfigVec& allTasks) { - + mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, + VPURT::TaskConfigVec& allTasks) { auto moduleOp = funcOp->getParentOfType(); const auto numClusters = getNumTiles(moduleOp); VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero."); std::vector prefetchedKernels{}; - + for (auto& kernelInfo : kernelsToPrefetch) { std::string kernelName = std::get<0>(kernelInfo); size_t firstAppearanceIndex = std::get<2>(kernelInfo); if (firstAppearanceIndex >= allTasks.size()) { - _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); - continue; + _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); + continue; } if (kernelNameToOps.count(kernelName) == 0) { - _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); - continue; + _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); + continue; } auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); - auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, - allTasks, numClusters, _log); + auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log); if (!bestGapOpt.has_value()) { _log.trace("Kernel '{0}': No valid gap found.", kernelName); continue; } - + GapCandidate bestGap = bestGapOpt.value(); - _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", - kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); + _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName, + bestGap.lookaheadGap, bestGap.insertionPointTaskIndex); - if (bestGap.insertionPointTaskIndex < 0 || static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { - _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", - kernelName, bestGap.insertionPointTaskIndex); - continue; + if (bestGap.insertionPointTaskIndex < 0 || + static_cast(bestGap.insertionPointTaskIndex) >= allTasks.size()) { + _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName, + bestGap.insertionPointTaskIndex); + continue; } - + auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp; size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters; _dynamicPrefetchTileCounter++; - auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask( - insertBeforeOp, - mlir::ValueRange(), - dynamicExecTile, - kernelName - ); + auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(), + dynamicExecTile, kernelName); prefetchedKernels.push_back(newPrefetchKernel); } @@ -659,9 +649,10 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { } _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); - auto newPrefetchKernels = (firstShaveTaskInIR == nullptr) - ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) - : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + auto newPrefetchKernels = + (firstShaveTaskInIR == nullptr) + ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) + : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); From 38a8e24baa99a030294a11c1d10573e0e890c6d4 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Mon, 15 Dec 2025 21:46:35 +0800 Subject: [PATCH 06/13] Fix memory allocation assertion --- .../default_hw_mode_repeating_blocks.mlir | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir index ca02746827..2dc31cfdf9 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir @@ -9,16 +9,12 @@ !MemRef = memref<1x3x62x62xf16> module @ChainCalls { - config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + config.Resources 2 of @NCE at 1.300000e+03 MHz { builtin.module @ReservedMemory { module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + config.MemoryResource 8 bytes of @CMX_NN offset 0 } } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } net.NetworkInfo entryPoint : @main inputsInfo : { @@ -73,16 +69,12 @@ module @ChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsChainCalls { - config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + config.Resources 2 of @NCE at 1.300000e+03 MHz { builtin.module @ReservedMemory { module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + config.MemoryResource 8 bytes of @CMX_NN offset 0 } } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } net.NetworkInfo entryPoint : @main inputsInfo : { @@ -170,16 +162,12 @@ module @SwKernelsChainCalls { !MemRef = memref<1x1x2x64xf16> module @SwKernelsIndependentCalls { - config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { + config.Resources 2 of @NCE at 1.300000e+03 MHz { builtin.module @ReservedMemory { module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + config.MemoryResource 8 bytes of @CMX_NN offset 0 } } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } net.NetworkInfo entryPoint : @main inputsInfo : { From 48fdf840046752be223b419947f1f3a75ba37b74 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Mon, 15 Dec 2025 23:10:20 +0800 Subject: [PATCH 07/13] Fix memory allocation assertion in NPU40XX tests --- .../VPUIP/pipelines/default_hw_mode_40XX.mlir | 20 ++++++------------- ...t_hw_mode_schedule_trace_enabled_40XX.mlir | 10 +++------- ..._mode_vertical_fusion_outlining_40XX+.mlir | 10 +++------- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir index 8884adf385..63fcf82282 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir @@ -22,14 +22,10 @@ module @SoftMax attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { - module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 - } + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1473528 + } } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] @@ -180,14 +176,10 @@ module @SoftMax attributes {config.arch = #config.arch_kind, config.com module @TwoFunctions attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { - module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 - } + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1473528 + } } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } // CHECK-DAG: {{ }}config.Resources diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir index b3bf4c898b..e4e470ff78 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir @@ -11,14 +11,10 @@ module @Gather attributes {config.arch = #config.arch_kind, config.compilationMode = #config.compilation_mode} { config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { - module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1473528 + } } - } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } VPURT.SW.Runtime diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir index 61918ed50d..65a149905e 100644 --- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir +++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir @@ -16,14 +16,10 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { builtin.module @ReservedMemory { - module @DummySWKernelsForInstructionPrefetchReservedMemory { - config.MemoryResource 8 bytes of @CMX_NN offset 1474552 + module @DummySWKernelsForInstructionPrefetchReservedMemory { + config.MemoryResource 8 bytes of @CMX_NN offset 1473528 + } } - } - config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware - config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64} - config.ExecutorResource 2 of @SHAVE_ACT - config.ExecutorResource 1 of @DPU } net.NetworkInfo entryPoint : @main inputsInfo : { DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}> From 8110a44ac5f24fe962d0885fe75933ea91af797c Mon Sep 17 00:00:00 2001 From: Kepontry Date: Wed, 17 Dec 2025 17:01:40 +0800 Subject: [PATCH 08/13] Fix CLIP tests in CI --- .../add_sw_kernel_instruction_prefetch.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 3512e47006..75cbab41f1 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -33,7 +33,7 @@ namespace { static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", - "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"}; + "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; // // AddSwKernelInstructionPrefetch @@ -371,7 +371,7 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In _log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart, bestReleaseCycle); if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) { - _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, skipping prefetching", + _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during execution", bestReleaseCycle, _minimumFreeCyclesForPrefetch); return std::make_tuple(nullptr, nullptr, 0); } @@ -647,12 +647,14 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { if (kernelsToPrefetch.empty()) { return; } - _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); - auto newPrefetchKernels = - (firstShaveTaskInIR == nullptr) - ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks) - : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + std::vector newPrefetchKernels; + if (firstShaveTaskInIR) { + _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); + newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); + } else { + newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks); + } // Update dependencies for cache handling operations to meet requirements of control graph split. auto& barrierInfo = getAnalysis(); From 51c2ac38963e73a944e1f7be38f355adf5db2518 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Wed, 17 Dec 2025 17:04:28 +0800 Subject: [PATCH 09/13] Fix clang format check --- .../passes/add_sw_kernel_instruction_prefetch.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 75cbab41f1..b68f26ce30 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -32,8 +32,8 @@ using namespace vpux; namespace { static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { - "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", - "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; + "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", + "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; // // AddSwKernelInstructionPrefetch @@ -371,7 +371,8 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In _log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart, bestReleaseCycle); if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) { - _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during execution", + _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during " + "execution", bestReleaseCycle, _minimumFreeCyclesForPrefetch); return std::make_tuple(nullptr, nullptr, 0); } From e96f0f8f44b63753f11e3fb17117816fc9872dd6 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Wed, 17 Dec 2025 23:56:07 +0800 Subject: [PATCH 10/13] Fix mid execution mlir test by replacing topk with convert --- ...struction_prefetch_mid_execution_40XX.mlir | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir index 2e85f9a246..3eb61f2652 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir @@ -9,13 +9,13 @@ !DummyDDRT = memref<32000x1x1x1xf16, @DDR> !DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]> !DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]> -!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]> -!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]> +!DummyCMX0Convert = memref<32000x1x1x1xf32, [@CMX_NN, 0]> +!DummyCMX1Convert = memref<32000x1x1x1xf32, [@CMX_NN, 1]> // This test checks following schedule -// Barriers : 0 1 2 3 4 5 -// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax] | [ TopK ] | [ DMA ] | [ Softmax ] -// Cluster 1: | [ DMA ] | [ Softmax] | [ TopK ] +// Barriers : 0 1 2 3 4 5 +// Cluster 0: | [ DMA ] | [ DMA ] | [ Softmax] | [ Convert ] | [ DMA ] | [ Softmax ] +// Cluster 1: | [ DMA ] | [ Softmax] | [ Convert ] // Other : [ SyncDMA ] | // @@ -23,7 +23,7 @@ module @subgraph attributes {config.arch = #config.arch_kind, config.co VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096] module @VPU.SW { func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE} - func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE} + func.func private @builtin_Convert(memref<*xf16, @CMX_NN>, memref<*xf32, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert", VPU.kernel_name = "convert", VPU.task_type = @COMPUTE} func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"} } config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz { @@ -105,18 +105,18 @@ module @subgraph attributes {config.arch = #config.arch_kind, config.co } } - %cmx0_top_k = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0TopK + %cmx0_convert = VPURT.DeclareBuffer [0] <0> -> !DummyCMX0Convert VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { - %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, !DummyCMX0TopK) { - VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK - } + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Convert inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx0_convert as %arg4: !DummyCMX0Convert) on tile 0 -> (!DummyCMX0Convert) { + VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0Convert + } } - %cmx1_top_k = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1TopK + %cmx1_convert = VPURT.DeclareBuffer [1] <0> -> !DummyCMX1Convert VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) { - %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) { - VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK - } + %results = VPUIP.SW.Kernel {resultSegmentSizes = array} @VPU.SW::@builtin_Convert inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx1_convert as %arg4: !DummyCMX1Convert) on tile 1 -> (!DummyCMX1Convert) { + VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1Convert + } } VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) { @@ -158,15 +158,15 @@ module @subgraph attributes {config.arch = #config.arch_kind, config.co // CHECK: VPURT.Task { // CHECK-NEXT: VPUIP.SW.Kernel // CHECK-SAME: skipProfiling - // CHECK-SAME: @VPU.SW::@builtin_TopK + // CHECK-SAME: @VPU.SW::@builtin_Convert // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { // CHECK: VPUIP.SW.Kernel - // CHECK-SAME: @VPU.SW::@builtin_TopK + // CHECK-SAME: @VPU.SW::@builtin_Convert // CHECK: VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) { // CHECK: VPUIP.SW.Kernel - // CHECK-SAME: @VPU.SW::@builtin_TopK + // CHECK-SAME: @VPU.SW::@builtin_Convert // CHECK: VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) { // CHECK-NEXT: VPUIP.NNDMA From a7cc88d4462bdeabd69066f126b2924c50288a56 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Fri, 19 Dec 2025 00:14:04 +0800 Subject: [PATCH 11/13] Refactor code, rename variables and add comments --- .../add_sw_kernel_instruction_prefetch.cpp | 128 +++++++++--------- ...struction_prefetch_mid_execution_40XX.mlir | 2 +- 2 files changed, 68 insertions(+), 62 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index b68f26ce30..47a83f654c 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -31,10 +31,23 @@ using namespace vpux; namespace { +struct GapCandidate { + uint64_t lookaheadGap = 0; + int64_t insertionPointTaskIndex = -1; + + // used for sort + bool operator>(const GapCandidate& other) const { + return lookaheadGap > other.lookaheadGap; + } +}; + static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; +static const SmallVector SW_DUMMY_KERNELS_WITHOUT_ARGS = { + "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"}; + // // AddSwKernelInstructionPrefetch // @@ -82,6 +95,9 @@ class AddSwKernelInstructionPrefetch final : std::vector insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch, mlir::Operation* firstShaveTaskInIR, mlir::Value bestUpdateBarrier); + std::optional findBestInsertionGapDuringExec(const std::string& kernelName, + uint64_t targetKernelGroupStartTime, + VPURT::TaskConfigVec& allTasks, size_t numClusters); std::vector insertPrefetchTasksDuringExec( mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, VPURT::TaskConfigVec& allTasks); @@ -101,6 +117,12 @@ class AddSwKernelInstructionPrefetch final : size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; size_t _dynamicPrefetchTileCounter = 0; + // Using Tile 1 as the target for insertion to enable prefetching only when the available tile count is larger + // than 1. + int64_t _targetInsertTileDuringExec = 1; + // The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap + // to perform instruction prefetching without causing stalls. + uint64_t _prefetchGapThresholdDuringExec = 50000; }; bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) { @@ -248,9 +270,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst // so we need to add skipProfiling as attribute to avoid capturing their metadata cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext())); - auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" || - kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" || - kernelName == "rms_norm") + auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName) ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName]; @@ -427,6 +447,21 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas return prefetchedKernels; } +size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) { + size_t count = 0; + for (auto& taskConfig : allTasks) { + if (static_cast(taskConfig.cycleStart) == startTime) { + if (mlir::isa(taskConfig.taskOp.getInnerTaskOp())) { + count++; + } + } + if (static_cast(taskConfig.cycleStart) > startTime) { + break; + } + } + return count; +} + uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters, std::map& swKernelCountsCache) { // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) @@ -437,19 +472,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& uint64_t currentStartTime = static_cast(allTasks[i].cycleStart); if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) { - size_t swKernelCount = 0; - // Count all SW Kernels that start at this specific time - for (auto& task : allTasks) { - if (static_cast(task.cycleStart) == currentStartTime) { - if (mlir::isa(task.taskOp.getInnerTaskOp())) { - swKernelCount++; - } - } - if (static_cast(task.cycleStart) > currentStartTime) { - break; - } - } - swKernelCountsCache[currentStartTime] = swKernelCount; + swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks); } if (swKernelCountsCache[currentStartTime] >= saturationThreshold) { @@ -460,43 +483,17 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& return std::numeric_limits::max(); } -struct GapCandidate { - uint64_t lookaheadGap = 0; - int64_t insertionPointTaskIndex = -1; - - // used for sort - bool operator>(const GapCandidate& other) const { - return lookaheadGap > other.lookaheadGap; - } -}; - -size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) { - size_t count = 0; - for (auto& taskConfig : allTasks) { - if (static_cast(taskConfig.cycleStart) == startTime) { - if (mlir::isa(taskConfig.taskOp.getInnerTaskOp())) { - count++; - } - } - if (static_cast(taskConfig.cycleStart) > startTime) { - break; - } - } - return count; -} - -std::optional findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime, - VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) { - const int64_t targetInsertTile = 1; - const uint64_t GAP_THRESHOLD = 50000; +std::optional AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec( + const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks, + size_t numClusters) { const size_t saturationThreshold = numClusters * 2; // std::map> validGaps; std::map swKernelCountsCache; // local cache - int64_t previousT1TaskIndex = -1; - uint64_t previousT1TaskStartTime = 0; + int64_t prevTargetTileTaskIndex = -1; + uint64_t prevTargetTileTaskStartTime = 0; // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched for (size_t i = 0; i < allTasks.size(); ++i) { @@ -506,43 +503,43 @@ std::optional findBestInsertionGap(const std::string& kernelName, break; } - bool isT1Task = false; + bool isTargetTileTask = false; if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { - isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile); + isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec); } - if (previousT1TaskIndex != -1 && isT1Task) { - auto& insertionPointTask = allTasks[previousT1TaskIndex]; + if (prevTargetTileTaskIndex != -1 && isTargetTileTask) { + auto& insertionPointTask = allTasks[prevTargetTileTaskIndex]; auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); if (simultaneousSwKernels < saturationThreshold) { uint64_t nextSaturationStart = - findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache); + findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache); uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); uint64_t lookaheadGap = 0; - if (gapEnd > previousT1TaskStartTime) { - lookaheadGap = gapEnd - previousT1TaskStartTime; + if (gapEnd > prevTargetTileTaskStartTime) { + lookaheadGap = gapEnd - prevTargetTileTaskStartTime; } - if (lookaheadGap >= GAP_THRESHOLD) { + if (lookaheadGap >= _prefetchGapThresholdDuringExec) { GapCandidate gap; gap.lookaheadGap = lookaheadGap; - gap.insertionPointTaskIndex = previousT1TaskIndex; + gap.insertionPointTaskIndex = prevTargetTileTaskIndex; validGaps[lookaheadGap] = gap; } } } - if (isT1Task) { - previousT1TaskIndex = static_cast(i); - previousT1TaskStartTime = currentTaskStartTime; + if (isTargetTileTask) { + prevTargetTileTaskIndex = static_cast(i); + prevTargetTileTaskStartTime = currentTaskStartTime; } } if (validGaps.empty()) { - log.trace("Kernel '{0}': No suitable insertion point found.", kernelName); + _log.trace("Kernel '{0}': No suitable insertion point found.", kernelName); return std::nullopt; } @@ -573,7 +570,16 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); - auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log); + // Finds the best insertion point for prefetch by identifying non-saturated execution windows. + // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the + // duration from an anchor task to the next saturation event or the target kernel start. + // + // Logic: + // 1. Find a candidate task on the target tile. + // 2. Ensure NPU is not saturated at that time. + // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time. + // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold. + auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters); if (!bestGapOpt.has_value()) { _log.trace("Kernel '{0}': No valid gap found.", kernelName); diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir index 3eb61f2652..7c75b60da1 100644 --- a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir +++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir @@ -1,5 +1,5 @@ // -// Copyright (C) 2024-2025 Intel Corporation. +// Copyright (C) 2025 Intel Corporation. // SPDX-License-Identifier: Apache-2.0 // From 0e46bcd0c231f7eefb8d7ba485dc03f4c3dc271a Mon Sep 17 00:00:00 2001 From: Kepontry Date: Wed, 14 Jan 2026 00:30:22 +0800 Subject: [PATCH 12/13] Address code review comments --- .../add_sw_kernel_instruction_prefetch.cpp | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 47a83f654c..2d8c77b7ff 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -42,8 +42,8 @@ struct GapCandidate { }; static const SmallVector SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = { - "activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", - "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; + "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", + "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"}; static const SmallVector SW_DUMMY_KERNELS_WITHOUT_ARGS = { "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"}; @@ -462,12 +462,9 @@ size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks return count; } -uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters, +uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t saturationThreshold, std::map& swKernelCountsCache) { - // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels) - const size_t saturationThreshold = numClusters * 2; - - // Iterate through tasks strictly AFTER the startIndex + // Iterate through tasks strictly after the startIndex for (size_t i = startIndex + 1; i < allTasks.size(); ++i) { uint64_t currentStartTime = static_cast(allTasks[i].cycleStart); @@ -485,9 +482,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& std::optional AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec( const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks, - size_t numClusters) { - const size_t saturationThreshold = numClusters * 2; - + size_t saturationThreshold) { // std::map> validGaps; std::map swKernelCountsCache; // local cache @@ -504,7 +499,7 @@ std::optional AddSwKernelInstructionPrefetch::findBestInsertionGap } bool isTargetTileTask = false; - if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) { + if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp())) { isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec); } @@ -515,8 +510,8 @@ std::optional AddSwKernelInstructionPrefetch::findBestInsertionGap size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); if (simultaneousSwKernels < saturationThreshold) { - uint64_t nextSaturationStart = - findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache); + uint64_t nextSaturationStart = findNextSaturationStart(prevTargetTileTaskIndex, allTasks, + saturationThreshold, swKernelCountsCache); uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime); uint64_t lookaheadGap = 0; if (gapEnd > prevTargetTileTaskStartTime) { @@ -551,24 +546,18 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas VPURT::TaskConfigVec& allTasks) { auto moduleOp = funcOp->getParentOfType(); const auto numClusters = getNumTiles(moduleOp); - VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero."); + const auto noOfShavesPerCluster = + config::getTileExecutor(moduleOp).getSubExecutor(VPU::ExecutorKind::SHAVE_ACT).getCount(); + _log.info("numClusters {0}, noOfShavesPerCluster: {1}", numClusters, noOfShavesPerCluster); std::vector prefetchedKernels{}; for (auto& kernelInfo : kernelsToPrefetch) { std::string kernelName = std::get<0>(kernelInfo); size_t firstAppearanceIndex = std::get<2>(kernelInfo); - - if (firstAppearanceIndex >= allTasks.size()) { - _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex); - continue; - } - if (kernelNameToOps.count(kernelName) == 0) { - _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName); - continue; - } - auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); + // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster) + const size_t saturationThreshold = numClusters * noOfShavesPerCluster; // Finds the best insertion point for prefetch by identifying non-saturated execution windows. // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the @@ -579,7 +568,8 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas // 2. Ensure NPU is not saturated at that time. // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time. // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold. - auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters); + auto bestGapOpt = + findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, saturationThreshold); if (!bestGapOpt.has_value()) { _log.trace("Kernel '{0}': No valid gap found.", kernelName); @@ -659,7 +649,7 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() { if (firstShaveTaskInIR) { _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle); newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier); - } else { + } else if (_useDummyKernelForInstructionPrefetch) { newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks); } From 768056428be3d624e4eb85511520a46d1de27c5e Mon Sep 17 00:00:00 2001 From: Kepontry Date: Thu, 15 Jan 2026 11:56:43 +0800 Subject: [PATCH 13/13] Address code review comments --- .../add_sw_kernel_instruction_prefetch.cpp | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp index 2d8c77b7ff..b17a6442cc 100644 --- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp +++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp @@ -97,7 +97,8 @@ class AddSwKernelInstructionPrefetch final : mlir::Value bestUpdateBarrier); std::optional findBestInsertionGapDuringExec(const std::string& kernelName, uint64_t targetKernelGroupStartTime, - VPURT::TaskConfigVec& allTasks, size_t numClusters); + VPURT::TaskConfigVec& allTasks, size_t numClusters, + size_t noOfShavesPerCluster); std::vector insertPrefetchTasksDuringExec( mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch, VPURT::TaskConfigVec& allTasks); @@ -117,9 +118,9 @@ class AddSwKernelInstructionPrefetch final : size_t _minimumFreeCyclesForPrefetch = 250000; bool _useDummyKernelForInstructionPrefetch = false; size_t _dynamicPrefetchTileCounter = 0; - // Using Tile 1 as the target for insertion to enable prefetching only when the available tile count is larger - // than 1. - int64_t _targetInsertTileDuringExec = 1; + // Used as the reference tile for analyzing schedule availability (gaps). + // Index 1 ensures prefetching is only enabled for multi-tile (>=2) kernels. + int64_t _referenceTileForGapFindingDuringExec = 1; // The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap // to perform instruction prefetching without causing stalls. uint64_t _prefetchGapThresholdDuringExec = 50000; @@ -482,11 +483,13 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& std::optional AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec( const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks, - size_t saturationThreshold) { + size_t numClusters, size_t noOfShavesPerCluster) { // std::map> validGaps; std::map swKernelCountsCache; // local cache + // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster) + const size_t saturationThreshold = numClusters * noOfShavesPerCluster; int64_t prevTargetTileTaskIndex = -1; uint64_t prevTargetTileTaskStartTime = 0; @@ -500,14 +503,14 @@ std::optional AddSwKernelInstructionPrefetch::findBestInsertionGap bool isTargetTileTask = false; if (auto swOp = mlir::dyn_cast(currentTaskConfig.taskOp.getInnerTaskOp())) { - isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec); + isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _referenceTileForGapFindingDuringExec); + } + if (!isTargetTileTask) { + continue; } - if (prevTargetTileTaskIndex != -1 && isTargetTileTask) { - auto& insertionPointTask = allTasks[prevTargetTileTaskIndex]; - auto insertionPointStartTime = static_cast(insertionPointTask.cycleStart); - - size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks); + if (prevTargetTileTaskIndex != -1) { + size_t simultaneousSwKernels = getSwKernelCountAtTime(prevTargetTileTaskStartTime, allTasks); if (simultaneousSwKernels < saturationThreshold) { uint64_t nextSaturationStart = findNextSaturationStart(prevTargetTileTaskIndex, allTasks, @@ -526,11 +529,8 @@ std::optional AddSwKernelInstructionPrefetch::findBestInsertionGap } } } - - if (isTargetTileTask) { - prevTargetTileTaskIndex = static_cast(i); - prevTargetTileTaskStartTime = currentTaskStartTime; - } + prevTargetTileTaskIndex = static_cast(i); + prevTargetTileTaskStartTime = currentTaskStartTime; } if (validGaps.empty()) { @@ -556,8 +556,6 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas std::string kernelName = std::get<0>(kernelInfo); size_t firstAppearanceIndex = std::get<2>(kernelInfo); auto targetKernelGroupStartTime = static_cast(allTasks[firstAppearanceIndex].cycleStart); - // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster) - const size_t saturationThreshold = numClusters * noOfShavesPerCluster; // Finds the best insertion point for prefetch by identifying non-saturated execution windows. // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the @@ -568,8 +566,8 @@ std::vector AddSwKernelInstructionPrefetch::insertPrefetchTas // 2. Ensure NPU is not saturated at that time. // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time. // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold. - auto bestGapOpt = - findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, saturationThreshold); + auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters, + noOfShavesPerCluster); if (!bestGapOpt.has_value()) { _log.trace("Kernel '{0}': No valid gap found.", kernelName);