From 2ecf4c28f495248fe1ec09526caed5e5b757045d Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Fri, 7 Nov 2025 15:50:51 +0800
Subject: [PATCH 01/13] Enable prefetching of SW kernel instructions after the
 first SW task

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 247 ++++++++++++++++--
 1 file changed, 230 insertions(+), 17 deletions(-)
diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index b432ab7ab0..6eed851c77 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,7 +31,8 @@ using namespace vpux;
 
 namespace {
 
-static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
+// static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
+static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -66,12 +67,12 @@ class AddSwKernelInstructionPrefetch final :
                                                             size_t clusterIdx, std::string& kernelName,
                                                             mlir::SymbolRefAttr functionSymbol);
 
-    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
+    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier,
                                                                size_t clusterIdx, std::string& kernelName);
     mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
     std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);
 
-    using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
+    using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
     std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
                                                                                VPURT::TaskConfigVec& allTasks);
     std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
@@ -79,6 +80,8 @@ class AddSwKernelInstructionPrefetch final :
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
+    std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+                                                                 VPURT::TaskConfigVec& allTasks);
 
     bool hasVPUSWModule(mlir::Operation* funcOp);
     size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -94,6 +97,7 @@ class AddSwKernelInstructionPrefetch final :
     bool _minFreeCyclesHasValue = false;
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
+    size_t _dynamicPrefetchTileCounter = 0;
 };
 
 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -187,20 +191,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer
 
 // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel
 VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
-                                                                                           mlir::Value updateBarrier,
+                                                                                           mlir::ValueRange updateBarrier,
                                                                                            size_t clusterIdx,
                                                                                            std::string& kernelName) {
     mlir::OpBuilder builder(firstSwTask);
-    auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
+    auto kernelOp = kernelNameToOps[kernelName];
+    auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
     auto reservedMemOffset = getOffsetReservedMem(moduleOp);
     auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
-    auto kernelOp = kernelNameToOps[kernelName];
+    auto tileIndexAttr = kernelOp.getTileIndexAttr();
+    VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
+    const int64_t tileIndex = static_cast<int64_t>(clusterIdx);
 
     auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
         if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
-            auto newType = mlir::cast<NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
+            auto origType = mlir::cast<NDTypeInterface>(io.getType());
+            auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
+            auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
+            auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
             auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
-                                                                  bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
+                                                                  bufOp.getSectionAttr(), newSectionIndexAttr,
                                                                   offsetAttr, bufOp.getSwizzlingKeyAttr());
             buffers.push_back(newBuff);
             return true;
@@ -230,14 +240,17 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
 
     auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
             builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
-            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
+            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
             kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
     // The dummy kernels here are generated after ActShaveProfilingPass,
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
     auto args =
-            (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
+            (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos"
+                || kernelName == "activation_sin" || kernelName == "eltwise_equal"
+                || kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
+
     vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
                               _log.nest(), /*swKernelRunOp=*/nullptr);
 
@@ -316,7 +329,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
             }
 
             if (!cache.isLoaded(kernelName)) {
-                kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
+                kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
             }
             cache.loadKernel(kernelName, kernelSize);
 
@@ -394,7 +407,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
          shaveIdx++) {
         auto clusterIdx = shaveIdx / noOfShavesPerCluster;
-        auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
+        auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
         _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
         auto newPrefetchKernel =
                 _useDummyKernelForInstructionPrefetch
@@ -410,6 +423,200 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
+uint64_t findNextSaturationStart(size_t startIndex,
+                                 vpux::VPURT::TaskConfigVec& allTasks,
+                                 size_t numClusters,
+                                 std::map<uint64_t, size_t>& swKernelCountsCache) {
+    
+    // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
+    const size_t saturationThreshold = numClusters * 2;
+
+    // Iterate through tasks strictly AFTER the startIndex
+    for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
+        uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
+
+        if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
+            size_t swKernelCount = 0;
+            // Count all SW Kernels that start at this specific time
+            for (auto& task : allTasks) {
+                if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
+                    if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
+                        swKernelCount++;
+                    }
+                }
+                if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
+                    break;
+                }
+            }
+            swKernelCountsCache[currentStartTime] = swKernelCount;
+        }
+
+        if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
+            return currentStartTime;
+        }
+    }
+
+    return std::numeric_limits<uint64_t>::max();
+}
+
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // used for sort
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
+size_t getSwKernelCountAtTime(uint64_t startTime,
+                              VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break; 
+        }
+    }
+    return count;
+}
+
+std::optional<GapCandidate> findBestInsertionGap(
+        const std::string& kernelName,
+        uint64_t targetKernelGroupStartTime,
+        VPURT::TaskConfigVec& allTasks,
+        size_t numClusters,
+        Logger& log) {
+
+    const int64_t targetInsertTile = 3;
+    const uint64_t GAP_THRESHOLD = 50000;
+    const size_t saturationThreshold = numClusters * 2;
+
+    // <LookaheadGapSize, GapCandidate>
+    std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
+    std::map<uint64_t, size_t> swKernelCountsCache; // local cache
+
+    int64_t previousT3TaskIndex = -1;
+    uint64_t previousT3TaskEndTime = 0;
+
+    // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
+    for (size_t i = 0; i < allTasks.size(); ++i) {
+        auto& currentTaskConfig = allTasks[i];
+        uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
+        if (currentTaskStartTime > targetKernelGroupStartTime) {
+            break;
+        }
+
+        bool isT3Task = false;
+        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
+            isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+        }
+
+        if (previousT3TaskIndex != -1 && isT3Task) {
+            
+            auto& insertionPointTask = allTasks[previousT3TaskIndex];
+            uint64_t insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
+
+            size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
+            
+            if (simultaneousSwKernels < saturationThreshold) {
+                uint64_t nextSaturationStart = findNextSaturationStart(previousT3TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
+                uint64_t lookaheadGap = 0;
+                if (gapEnd > previousT3TaskEndTime) {
+                    lookaheadGap = gapEnd - previousT3TaskEndTime;
+                }
+
+                if (lookaheadGap >= GAP_THRESHOLD) {
+                    GapCandidate gap;
+                    gap.lookaheadGap = lookaheadGap;
+                    gap.insertionPointTaskIndex = previousT3TaskIndex;
+                    validGaps[lookaheadGap] = gap;
+                }
+            }
+        }
+
+        if (isT3Task) {
+            previousT3TaskIndex = static_cast<int64_t>(i);
+            previousT3TaskEndTime = currentTaskStartTime + static_cast<uint64_t>(allTasks[i].cycleCost);
+        }
+    }
+
+    if (validGaps.empty()) {
+        log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        return std::nullopt;
+    }
+
+    return validGaps.begin()->second;
+}
+
+std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
+    mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+    VPURT::TaskConfigVec& allTasks) {
+    
+    auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
+    const auto numClusters = getNumTiles(moduleOp);
+    VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");
+
+    std::vector<VPUIP::SwKernelOp> prefetchedKernels{};
+    
+    for (auto& kernelInfo : kernelsToPrefetch) {
+        std::string kernelName = std::get<0>(kernelInfo);
+        size_t firstAppearanceIndex = std::get<2>(kernelInfo);
+
+        if (firstAppearanceIndex >= allTasks.size()) {
+             _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
+             continue;
+        }
+        if (kernelNameToOps.count(kernelName) == 0) {
+             _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
+             continue;
+        }
+
+        uint64_t targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
+
+        auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime,
+                                               allTasks, numClusters, _log);
+
+        if (!bestGapOpt.has_value()) {
+            _log.trace("Kernel '{0}': No valid gap found.", kernelName);
+            continue;
+        }
+        
+        GapCandidate bestGap = bestGapOpt.value();
+        _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.",
+                   kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
+        std::cout << "[Prefetch DEBUG] Kernel: " << kernelName
+              << " Found best gap of  " << bestGap.lookaheadGap 
+              << " cycles. Inserting relative to task  " << bestGap.insertionPointTaskIndex << std::endl;
+
+        if (bestGap.insertionPointTaskIndex < 0 || static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
+             _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", 
+                        kernelName, bestGap.insertionPointTaskIndex);
+             continue;
+        }
+        
+        auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
+        size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
+        _dynamicPrefetchTileCounter++;
+
+        auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(
+            insertBeforeOp,
+            mlir::ValueRange(),
+            dynamicExecTile,
+            kernelName
+        );
+
+        prefetchedKernels.push_back(newPrefetchKernel);
+    }
+
+    return prefetchedKernels;
+}
+
 void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto funcOp = getOperation();
     if (!hasVPUSWModule(funcOp)) {
@@ -444,10 +651,6 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
     auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
             getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
-    if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
-        return;
-    }
-    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
 
     if (_useDummyKernelForInstructionPrefetch) {
         auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
@@ -455,7 +658,17 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
         VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
                         "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
     }
-    auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    if (kernelsToPrefetch.empty()) {
+        return;
+    }
+    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+
+    std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
+    if (firstShaveTaskInIR == nullptr){
+        newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
+    } else {
+        newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    }
 
     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();

From 681035a779ce7a4438f6dd993b612c44598c273f Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Sat, 8 Nov 2025 01:04:17 +0800
Subject: [PATCH 02/13] style: Code cleanup and formatting

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 6eed851c77..6ed5289fec 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,7 +31,6 @@ using namespace vpux;
 
 namespace {
 
-// static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
 //
@@ -512,14 +511,14 @@ std::optional<GapCandidate> findBestInsertionGap(
         }
 
         bool isT3Task = false;
-        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
+        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
             isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
         }
 
         if (previousT3TaskIndex != -1 && isT3Task) {
             
             auto& insertionPointTask = allTasks[previousT3TaskIndex];
-            uint64_t insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
+            auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
             
@@ -577,7 +576,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
              continue;
         }
 
-        uint64_t targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
+        auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
 
         auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime,
                                                allTasks, numClusters, _log);
@@ -590,9 +589,6 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
         GapCandidate bestGap = bestGapOpt.value();
         _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.",
                    kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
-        std::cout << "[Prefetch DEBUG] Kernel: " << kernelName
-              << " Found best gap of  " << bestGap.lookaheadGap 
-              << " cycles. Inserting relative to task  " << bestGap.insertionPointTaskIndex << std::endl;
 
         if (bestGap.insertionPointTaskIndex < 0 || static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
              _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", 
@@ -663,12 +659,12 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     }
-    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+    // firstShaveTaskInIR is null on the mid-execution prefetch path below, so only dereference it when set.
+    if (firstShaveTaskInIR != nullptr) {
+        _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+    }
 
-    std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
-    if (firstShaveTaskInIR == nullptr){
-        newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
-    } else {
-        newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
-    }
+    auto newPrefetchKernels = (firstShaveTaskInIR == nullptr)
+        ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks)
+        : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
 
     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();

From 8f39a2789324bd32a765055f3b67c0435da8a363 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Thu, 20 Nov 2025 00:08:36 +0800
Subject: [PATCH 03/13] func: Add test case, change t3 to t1

---
 .../add_sw_kernel_instruction_prefetch.cpp    |  30 +--
 ...struction_prefetch_mid_execution_40XX.mlir | 180 ++++++++++++++++++
 2 files changed, 195 insertions(+), 15 deletions(-)
 create mode 100644 tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 6ed5289fec..4953a690f5 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,7 +31,7 @@ using namespace vpux;
 
 namespace {
 
-static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
+static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -491,7 +491,7 @@ std::optional<GapCandidate> findBestInsertionGap(
         size_t numClusters,
         Logger& log) {
 
-    const int64_t targetInsertTile = 3;
+    const int64_t targetInsertTile = 1;
     const uint64_t GAP_THRESHOLD = 50000;
     const size_t saturationThreshold = numClusters * 2;
 
@@ -499,8 +499,8 @@ std::optional<GapCandidate> findBestInsertionGap(
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache; // local cache
 
-    int64_t previousT3TaskIndex = -1;
-    uint64_t previousT3TaskEndTime = 0;
+    int64_t previousT1TaskIndex = -1;
+    uint64_t previousT1TaskStartTime = 0;
 
     // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
     for (size_t i = 0; i < allTasks.size(); ++i) {
@@ -510,38 +510,38 @@ std::optional<GapCandidate> findBestInsertionGap(
             break;
         }
 
-        bool isT3Task = false;
+        bool isT1Task = false;
         if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
-            isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+            isT1Task = swOp.getTileIndexAttr() && (swOp.getTileIndexAttr().getInt() == targetInsertTile);  // tileIndex may be unset
         }
 
-        if (previousT3TaskIndex != -1 && isT3Task) {
+        if (previousT1TaskIndex != -1 && isT1Task) {
             
-            auto& insertionPointTask = allTasks[previousT3TaskIndex];
+            auto& insertionPointTask = allTasks[previousT1TaskIndex];
             auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
             
             if (simultaneousSwKernels < saturationThreshold) {
-                uint64_t nextSaturationStart = findNextSaturationStart(previousT3TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t nextSaturationStart = findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
-                if (gapEnd > previousT3TaskEndTime) {
-                    lookaheadGap = gapEnd - previousT3TaskEndTime;
+                if (gapEnd > previousT1TaskStartTime) {
+                    lookaheadGap = gapEnd - previousT1TaskStartTime;
                 }
 
                 if (lookaheadGap >= GAP_THRESHOLD) {
                     GapCandidate gap;
                     gap.lookaheadGap = lookaheadGap;
-                    gap.insertionPointTaskIndex = previousT3TaskIndex;
+                    gap.insertionPointTaskIndex = previousT1TaskIndex;
                     validGaps[lookaheadGap] = gap;
                 }
             }
         }
 
-        if (isT3Task) {
-            previousT3TaskIndex = static_cast<int64_t>(i);
-            previousT3TaskEndTime = currentTaskStartTime + static_cast<uint64_t>(allTasks[i].cycleCost);
+        if (isT1Task) {
+            previousT1TaskIndex = static_cast<int64_t>(i);
+            previousT1TaskStartTime = currentTaskStartTime;
         }
     }
 
diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
new file mode 100644
index 0000000000..2e85f9a246
--- /dev/null
+++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
@@ -0,0 +1,180 @@
+//
+// Copyright (C) 2024-2025 Intel Corporation.
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// RUN: vpux-opt --split-input-file --init-compiler="vpu-arch=%arch% allow-custom-values=true enable-sw-kernel-fifo-per-shave-engine=false" --add-sw-kernel-instruction-prefetch %s | FileCheck %s
+// REQUIRES: arch-NPU40XX
+
+!DummyDDRT = memref<32000x1x1x1xf16, @DDR>
+!DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]>
+!DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]>
+!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]>
+!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]>
+
+// This test checks the following schedule
+//  Barriers :             0         1         2         3         4             5          6         7
+//  Cluster 0:             | [ DMA ] | [ DMA ] | [ DMA ] | [ DMA ] | [ Softmax ] | [ TopK ] | [ DMA ] | [ Softmax ]
+//  Cluster 1:             |                             | [ DMA ] | [ Softmax ] | [ TopK ] |
+//  Other    : [ SyncDMA ] |
+//
+
+module @subgraph attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+  VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096]
+  module @VPU.SW {
+    func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE}
+    func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE}
+    func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"}
+  }
+  config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+    builtin.module @ReservedMemory {
+      module @DummySWKernelsForInstructionPrefetchReservedMemory {
+        config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+      }
+    }
+    config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+    config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+    config.ExecutorResource 2 of @SHAVE_ACT
+    config.ExecutorResource 1 of @DPU
+  }
+  config.ExecutorResource 1 of @M2I
+  config.ExecutorResource 1 of @DMA_NN
+  config.MemoryResource 2306867200 bytes of @DDR {config.bandwidth = 64 : i64, config.derateFactor = 6.000000e-01 : f64}
+  net.NetworkInfo {inferenceTiming = 369464 : i64} entryPoint : @main inputsInfo : {
+    DataInfo "data" : tensor<1x3x62x62xui8>
+  } outputsInfo : {
+    DataInfo "out" : tensor<1x3x62x62xui8>
+  }
+  func.func @main(%arg0: memref<1x3x62x62xui8, @DDR>) -> memref<1x3x62x62xui8, @DDR> {
+    %0 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %1 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %2 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %3 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %4 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %5 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %6 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    %7 = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+
+    // CHECK:       [[BARRIER_0:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_1:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_2:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_3:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_4:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_5:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_6:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+    // CHECK:       [[BARRIER_7:%.+]] = VPURT.DeclareVirtualBarrier -> !VPURT.Barrier
+
+    %28 = VPURT.DeclareBuffer <DDR> <0> -> memref<0x0x0x0xi32, @DDR>
+    %ddr_buf = VPURT.DeclareBuffer <DDR> <0> -> !DummyDDRT
+    %cmx_0 = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0T
+    %cmx_1 = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1T
+
+    VPURT.Task updates(%0 : !VPURT.Barrier) {
+        %241 = VPUIP.SyncDMA {port = 0 : i64} inputs(%28 : memref<0x0x0x0xi32, @DDR>) outputs(%28 : memref<0x0x0x0xi32, @DDR>) -> memref<0x0x0x0xi32, @DDR>
+    }
+
+    VPURT.Task waits(%0: !VPURT.Barrier) updates(%1 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+    }
+
+    VPURT.Task waits(%1: !VPURT.Barrier) updates(%2 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+    }
+
+    VPURT.Task waits(%2: !VPURT.Barrier) updates(%3 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+    }
+
+    VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+    }
+
+    VPURT.Task waits(%3: !VPURT.Barrier) updates(%4 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 1 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_1 : !DummyCMX1T) -> !DummyCMX1T
+    }
+
+    VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) {
+        %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T
+    }
+    }
+
+    VPURT.Task waits(%4: !VPURT.Barrier) updates(%5 : !VPURT.Barrier) {
+        %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T) on tile 1 -> !DummyCMX1T{
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1T
+    }
+    }
+
+    %cmx0_top_k = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0TopK
+    VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
+        %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 2, 0, 0>} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, !DummyCMX0TopK) {
+                VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK
+    }
+    }
+
+    %cmx1_top_k = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1TopK
+    VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
+        %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 2, 0, 0>} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) {
+                VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK
+    }
+    }
+
+    VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) {
+        %241 = VPUIP.NNDMA {port = 0 : i64} inputs(%ddr_buf :!DummyDDRT) outputs(%cmx_0 : !DummyCMX0T) -> !DummyCMX0T
+    }
+
+    VPURT.Task waits(%7: !VPURT.Barrier) {
+        %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_SoftMax inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T) on tile 0 -> !DummyCMX0T{
+                VPUIP.SW.Kernel.run {attrs = [0, 0]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0T
+    }
+    }
+
+    // CHECK:       VPURT.Task updates([[BARRIER_0]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.SyncDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_0]] : !VPURT.Barrier) updates([[BARRIER_1]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_1]] : !VPURT.Barrier) updates([[BARRIER_2]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_2]] : !VPURT.Barrier) updates([[BARRIER_3]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_3]] : !VPURT.Barrier) updates([[BARRIER_4]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) {
+    // CHECK:             VPUIP.SW.Kernel
+    // CHECK-SAME:        @VPU.SW::@builtin_SoftMax
+
+    // CHECK:       VPURT.Task waits([[BARRIER_4]] : !VPURT.Barrier) updates([[BARRIER_5]] : !VPURT.Barrier) {
+    // CHECK:             VPUIP.SW.Kernel
+    // CHECK-SAME:        @VPU.SW::@builtin_SoftMax
+
+    // CHECK:       VPURT.Task {
+    // CHECK-NEXT:        VPUIP.SW.Kernel
+    // CHECK-SAME:        skipProfiling
+    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+
+    // CHECK:       VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
+    // CHECK:             VPUIP.SW.Kernel
+    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+
+    // CHECK:       VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
+    // CHECK:             VPUIP.SW.Kernel
+    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+
+    // CHECK:       VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) {
+    // CHECK-NEXT:        VPUIP.NNDMA
+
+    // CHECK:       VPURT.Task waits([[BARRIER_7]] : !VPURT.Barrier) {
+    // CHECK:             VPUIP.SW.Kernel
+    // CHECK-SAME:        @VPU.SW::@builtin_SoftMax
+
+    return %arg0 : memref<1x3x62x62xui8, @DDR>
+  }
+}

From bf5cab7c911f6e8e32cc6ef210c009a332a172f5 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Tue, 9 Dec 2025 15:03:51 +0800
Subject: [PATCH 04/13] Add instpf memory to the config of the 4 failed tests

---
 .../VPUIP/pipelines/default_hw_mode_40XX.mlir | 24 +++++++++++++
 .../default_hw_mode_repeating_blocks.mlir     | 36 +++++++++++++++++++
 ...t_hw_mode_schedule_trace_enabled_40XX.mlir | 12 +++++++
 ..._mode_vertical_fusion_outlining_40XX+.mlir | 11 ++++++
 4 files changed, 83 insertions(+)

diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
index a2ae982802..8884adf385 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
@@ -20,6 +20,18 @@
 
 // CHECK-LABEL: @SoftMax
 module @SoftMax attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
     module @VPU.SW {
         func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE}
@@ -166,6 +178,18 @@ module @SoftMax attributes {config.arch = #config.arch_kind<NPU40XX>, config.com
 
 // CHECK-LABEL: @TwoFunctions
 module @TwoFunctions attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     // CHECK-DAG: {{  }}config.Resources
 
     VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096]
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
index 6dd21b5f43..ca02746827 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
@@ -9,6 +9,18 @@
 !MemRef = memref<1x3x62x62xf16>
 
 module @ChainCalls {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x3x62x62xf16>
     } outputsInfo : {
@@ -61,6 +73,18 @@ module @ChainCalls {
 
 !MemRef = memref<1x1x2x64xf16>
 module @SwKernelsChainCalls {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x1x2x64xf16>
     } outputsInfo : {
@@ -146,6 +170,18 @@ module @SwKernelsChainCalls {
 
 !MemRef = memref<1x1x2x64xf16>
 module @SwKernelsIndependentCalls {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     net.NetworkInfo entryPoint : @main inputsInfo : {
         DataInfo "input" : tensor<1x1x2x64xf16>
     } outputsInfo : {
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
index 5406c523ac..b3bf4c898b 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
@@ -9,6 +9,18 @@
 
 // CHECK-LABEL: @Gather
 module @Gather attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
+    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+        builtin.module @ReservedMemory {
+        module @DummySWKernelsForInstructionPrefetchReservedMemory {
+            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+        }
+        }
+        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+        config.ExecutorResource 2 of @SHAVE_ACT
+        config.ExecutorResource 1 of @DPU
+    }
+
     VPURT.SW.Runtime
       entryPoint: @VPU.SW::@runtime
       stack_configuration: [4096, 4096, 4096, 4096]
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
index 02e4a016c5..61918ed50d 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
@@ -14,6 +14,17 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com
     func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"}
   }
 
+  config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+    builtin.module @ReservedMemory {
+    module @DummySWKernelsForInstructionPrefetchReservedMemory {
+        config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+    }
+    }
+    config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
+    config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
+    config.ExecutorResource 2 of @SHAVE_ACT
+    config.ExecutorResource 1 of @DPU
+  }
   net.NetworkInfo entryPoint : @main inputsInfo : {
     DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}>
   } outputsInfo : {

From ff578129b46e7a30bf8759b3e16ece240f3912f4 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Tue, 9 Dec 2025 15:06:26 +0800
Subject: [PATCH 05/13] Fix clang format check

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 109 ++++++++----------
 1 file changed, 50 insertions(+), 59 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 4953a690f5..fea72079e5 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,7 +31,9 @@ using namespace vpux;
 
 namespace {
 
-static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select", "topk"};
+static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
+        "activation_swish", "eltwise_mul",   "softmax",        "convert",        "rms_norm", "activation_swish",
+        "activation_sin",   "eltwise_equal", "activation_cos", "eltwise_select", "topk"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -66,8 +68,9 @@ class AddSwKernelInstructionPrefetch final :
                                                             size_t clusterIdx, std::string& kernelName,
                                                             mlir::SymbolRefAttr functionSymbol);
 
-    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier,
-                                                               size_t clusterIdx, std::string& kernelName);
+    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
+                                                               mlir::ValueRange updateBarrier, size_t clusterIdx,
+                                                               std::string& kernelName);
     mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
     std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);
 
@@ -79,8 +82,9 @@ class AddSwKernelInstructionPrefetch final :
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
-    std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
-                                                                 VPURT::TaskConfigVec& allTasks);
+    std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
+            mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+            VPURT::TaskConfigVec& allTasks);
 
     bool hasVPUSWModule(mlir::Operation* funcOp);
     size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -189,10 +193,8 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer
 }
 
 // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel
-VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
-                                                                                           mlir::ValueRange updateBarrier,
-                                                                                           size_t clusterIdx,
-                                                                                           std::string& kernelName) {
+VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(
+        mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) {
     mlir::OpBuilder builder(firstSwTask);
     auto kernelOp = kernelNameToOps[kernelName];
     auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
@@ -205,7 +207,8 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
         if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
             auto origType = mlir::cast<NDTypeInterface>(io.getType());
-            auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
+            auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(),
+                                                                stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
             auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
             auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
             auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
@@ -245,10 +248,11 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
-    auto args =
-            (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos"
-                || kernelName == "activation_sin" || kernelName == "eltwise_equal"
-                || kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
+    auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" ||
+                 kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" ||
+                 kernelName == "rms_norm")
+                        ? mlir::ArrayAttr::get(moduleOp->getContext(), {})
+                        : kernelNameToArgs[kernelName];
 
     vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
                               _log.nest(), /*swKernelRunOp=*/nullptr);
@@ -422,11 +426,8 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
-uint64_t findNextSaturationStart(size_t startIndex,
-                                 vpux::VPURT::TaskConfigVec& allTasks,
-                                 size_t numClusters,
+uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
                                  std::map<uint64_t, size_t>& swKernelCountsCache) {
-    
     // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
     const size_t saturationThreshold = numClusters * 2;
 
@@ -468,8 +469,7 @@ struct GapCandidate {
     }
 };
 
-size_t getSwKernelCountAtTime(uint64_t startTime,
-                              VPURT::TaskConfigVec& allTasks) {
+size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
     size_t count = 0;
     for (auto& taskConfig : allTasks) {
         if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
@@ -478,26 +478,21 @@ size_t getSwKernelCountAtTime(uint64_t startTime,
             }
         }
         if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
-            break; 
+            break;
         }
     }
     return count;
 }
 
-std::optional<GapCandidate> findBestInsertionGap(
-        const std::string& kernelName,
-        uint64_t targetKernelGroupStartTime,
-        VPURT::TaskConfigVec& allTasks,
-        size_t numClusters,
-        Logger& log) {
-
+std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime,
+                                                 VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) {
     const int64_t targetInsertTile = 1;
     const uint64_t GAP_THRESHOLD = 50000;
     const size_t saturationThreshold = numClusters * 2;
 
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
-    std::map<uint64_t, size_t> swKernelCountsCache; // local cache
+    std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
 
     int64_t previousT1TaskIndex = -1;
     uint64_t previousT1TaskStartTime = 0;
@@ -516,14 +511,14 @@ std::optional<GapCandidate> findBestInsertionGap(
         }
 
         if (previousT1TaskIndex != -1 && isT1Task) {
-            
             auto& insertionPointTask = allTasks[previousT1TaskIndex];
             auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
-            
+
             if (simultaneousSwKernels < saturationThreshold) {
-                uint64_t nextSaturationStart = findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t nextSaturationStart =
+                        findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
                 if (gapEnd > previousT1TaskStartTime) {
@@ -554,58 +549,53 @@ std::optional<GapCandidate> findBestInsertionGap(
 }
 
 std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
-    mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
-    VPURT::TaskConfigVec& allTasks) {
-    
+        mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+        VPURT::TaskConfigVec& allTasks) {
     auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
     const auto numClusters = getNumTiles(moduleOp);
     VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");
 
     std::vector<VPUIP::SwKernelOp> prefetchedKernels{};
-    
+
     for (auto& kernelInfo : kernelsToPrefetch) {
         std::string kernelName = std::get<0>(kernelInfo);
         size_t firstAppearanceIndex = std::get<2>(kernelInfo);
 
         if (firstAppearanceIndex >= allTasks.size()) {
-             _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
-             continue;
+            _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
+            continue;
         }
         if (kernelNameToOps.count(kernelName) == 0) {
-             _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
-             continue;
+            _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
+            continue;
         }
 
         auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
 
-        auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime,
-                                               allTasks, numClusters, _log);
+        auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log);
 
         if (!bestGapOpt.has_value()) {
             _log.trace("Kernel '{0}': No valid gap found.", kernelName);
             continue;
         }
-        
+
         GapCandidate bestGap = bestGapOpt.value();
-        _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.",
-                   kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
+        _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName,
+                   bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
 
-        if (bestGap.insertionPointTaskIndex < 0 || static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
-             _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", 
-                        kernelName, bestGap.insertionPointTaskIndex);
-             continue;
+        if (bestGap.insertionPointTaskIndex < 0 ||
+            static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
+            _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName,
+                       bestGap.insertionPointTaskIndex);
+            continue;
         }
-        
+
         auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
         size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
         _dynamicPrefetchTileCounter++;
 
-        auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(
-            insertBeforeOp,
-            mlir::ValueRange(),
-            dynamicExecTile,
-            kernelName
-        );
+        auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(),
+                                                                          dynamicExecTile, kernelName);
 
         prefetchedKernels.push_back(newPrefetchKernel);
     }
@@ -659,9 +649,10 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     }
     _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
 
-    auto newPrefetchKernels = (firstShaveTaskInIR == nullptr)
-        ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks)
-        : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    auto newPrefetchKernels =
+            (firstShaveTaskInIR == nullptr)
+                    ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks)
+                    : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
 
     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();

From 38a8e24baa99a030294a11c1d10573e0e890c6d4 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Mon, 15 Dec 2025 21:46:35 +0800
Subject: [PATCH 06/13] Fix memory allocation assertion

---
 .../default_hw_mode_repeating_blocks.mlir     | 24 +++++--------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
index ca02746827..2dc31cfdf9 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_repeating_blocks.mlir
@@ -9,16 +9,12 @@
 !MemRef = memref<1x3x62x62xf16>
 
 module @ChainCalls {
-    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
         builtin.module @ReservedMemory {
         module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+            config.MemoryResource 8 bytes of @CMX_NN offset 0
         }
         }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     net.NetworkInfo entryPoint : @main inputsInfo : {
@@ -73,16 +69,12 @@ module @ChainCalls {
 
 !MemRef = memref<1x1x2x64xf16>
 module @SwKernelsChainCalls {
-    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
         builtin.module @ReservedMemory {
         module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+            config.MemoryResource 8 bytes of @CMX_NN offset 0
         }
         }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     net.NetworkInfo entryPoint : @main inputsInfo : {
@@ -170,16 +162,12 @@ module @SwKernelsChainCalls {
 
 !MemRef = memref<1x1x2x64xf16>
 module @SwKernelsIndependentCalls {
-    config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
+    config.Resources 2 of @NCE at 1.300000e+03 MHz {
         builtin.module @ReservedMemory {
         module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+            config.MemoryResource 8 bytes of @CMX_NN offset 0
         }
         }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     net.NetworkInfo entryPoint : @main inputsInfo : {

From 48fdf840046752be223b419947f1f3a75ba37b74 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Mon, 15 Dec 2025 23:10:20 +0800
Subject: [PATCH 07/13] Fix memory allocation assertion in NPU40XX tests

---
 .../VPUIP/pipelines/default_hw_mode_40XX.mlir | 20 ++++++-------------
 ...t_hw_mode_schedule_trace_enabled_40XX.mlir | 10 +++-------
 ..._mode_vertical_fusion_outlining_40XX+.mlir | 10 +++-------
 3 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
index 8884adf385..63fcf82282 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_40XX.mlir
@@ -22,14 +22,10 @@
 module @SoftMax attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
     config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
         builtin.module @ReservedMemory {
-        module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
-        }
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
         }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
@@ -180,14 +176,10 @@ module @SoftMax attributes {config.arch = #config.arch_kind<NPU40XX>, config.com
 module @TwoFunctions attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
     config.Resources {activity_factor = 0.078934384661980161 : f64} 6 of @NCE at 1.700000e+03 MHz {
         builtin.module @ReservedMemory {
-        module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
-        }
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
         }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     // CHECK-DAG: {{  }}config.Resources
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
index b3bf4c898b..e4e470ff78 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_schedule_trace_enabled_40XX.mlir
@@ -11,14 +11,10 @@
 module @Gather attributes {config.arch = #config.arch_kind<NPU40XX>, config.compilationMode = #config.compilation_mode<DefaultHW>} {
     config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
         builtin.module @ReservedMemory {
-        module @DummySWKernelsForInstructionPrefetchReservedMemory {
-            config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+            module @DummySWKernelsForInstructionPrefetchReservedMemory {
+                config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+            }
         }
-        }
-        config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-        config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-        config.ExecutorResource 2 of @SHAVE_ACT
-        config.ExecutorResource 1 of @DPU
     }
 
     VPURT.SW.Runtime
diff --git a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
index 61918ed50d..65a149905e 100644
--- a/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/pipelines/default_hw_mode_vertical_fusion_outlining_40XX+.mlir
@@ -16,14 +16,10 @@ module @VerticalFusionOutlining attributes {config.compilationMode = #config.com
 
   config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
     builtin.module @ReservedMemory {
-    module @DummySWKernelsForInstructionPrefetchReservedMemory {
-        config.MemoryResource 8 bytes of @CMX_NN offset 1474552
+      module @DummySWKernelsForInstructionPrefetchReservedMemory {
+        config.MemoryResource 8 bytes of @CMX_NN offset 1473528
+      }
     }
-    }
-    config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-    config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-    config.ExecutorResource 2 of @SHAVE_ACT
-    config.ExecutorResource 1 of @DPU
   }
   net.NetworkInfo entryPoint : @main inputsInfo : {
     DataInfo "input" : tensor<1x16x128x128xf16, {order = #NHWC}>

From 8110a44ac5f24fe962d0885fe75933ea91af797c Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Wed, 17 Dec 2025 17:01:40 +0800
Subject: [PATCH 08/13] Fix CLIP tests in CI

---
 .../add_sw_kernel_instruction_prefetch.cpp       | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 3512e47006..75cbab41f1 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -33,7 +33,7 @@ namespace {
 
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
         "activation_swish", "eltwise_mul",   "softmax",        "convert",        "rms_norm", "activation_swish",
-        "activation_sin",   "eltwise_equal", "activation_cos", "eltwise_select", "topk"};
+        "activation_sin",   "eltwise_equal", "activation_cos", "eltwise_select"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -371,7 +371,7 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In
     _log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart,
                bestReleaseCycle);
     if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) {
-        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, skipping prefetching",
+        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during execution",
                   bestReleaseCycle, _minimumFreeCyclesForPrefetch);
         return std::make_tuple(nullptr, nullptr, 0);
     }
@@ -647,12 +647,14 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     if (kernelsToPrefetch.empty()) {
         return;
     }
-    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
 
-    auto newPrefetchKernels =
-            (firstShaveTaskInIR == nullptr)
-                    ? insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks)
-                    : insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
+    if (firstShaveTaskInIR) {
+        _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+        newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    } else {
+        newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
+    }
 
     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();

From 51c2ac38963e73a944e1f7be38f355adf5db2518 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Wed, 17 Dec 2025 17:04:28 +0800
Subject: [PATCH 09/13] Fix clang format check

---
 .../passes/add_sw_kernel_instruction_prefetch.cpp          | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 75cbab41f1..b68f26ce30 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -32,8 +32,8 @@ using namespace vpux;
 namespace {
 
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
-        "activation_swish", "eltwise_mul",   "softmax",        "convert",        "rms_norm", "activation_swish",
-        "activation_sin",   "eltwise_equal", "activation_cos", "eltwise_select"};
+        "activation_swish", "eltwise_mul",    "softmax",       "convert",        "rms_norm",
+        "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
 //
 // AddSwKernelInstructionPrefetch
@@ -371,7 +371,8 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In
     _log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart,
                bestReleaseCycle);
     if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) {
-        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during execution",
+        _log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during "
+                  "execution",
                   bestReleaseCycle, _minimumFreeCyclesForPrefetch);
         return std::make_tuple(nullptr, nullptr, 0);
     }

From e96f0f8f44b63753f11e3fb17117816fc9872dd6 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Wed, 17 Dec 2025 23:56:07 +0800
Subject: [PATCH 10/13] Fix mid execution mlir test by replacing topk with
 convert

---
 ...struction_prefetch_mid_execution_40XX.mlir | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
index 2e85f9a246..3eb61f2652 100644
--- a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
@@ -9,13 +9,13 @@
 !DummyDDRT = memref<32000x1x1x1xf16, @DDR>
 !DummyCMX0T = memref<32000x1x1x1xf16, [@CMX_NN, 0]>
 !DummyCMX1T = memref<32000x1x1x1xf16, [@CMX_NN, 1]>
-!DummyCMX0TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 0]>
-!DummyCMX1TopK = memref<16000x1x1x1xsi32, [@CMX_NN, 1]>
+!DummyCMX0Convert = memref<32000x1x1x1xf32, [@CMX_NN, 0]>
+!DummyCMX1Convert = memref<32000x1x1x1xf32, [@CMX_NN, 1]>
 
 // This test checks following schedule
-//  Barriers :             0         1         2            3          4         5
-//  Cluster 0:             | [ DMA ] | [ DMA ] | [ Softmax] | [ TopK ] | [ DMA ] | [ Softmax ]
-//  Cluster 1:             | [    DMA    ]     | [ Softmax] | [ TopK ]
+//  Barriers :             0         1         2            3             4         5
+//  Cluster 0:             | [ DMA ] | [ DMA ] | [ Softmax] | [ Convert ] | [ DMA ] | [ Softmax ]
+//  Cluster 1:             | [    DMA    ]     | [ Softmax] | [ Convert ]
 //  Other    : [ SyncDMA ] |
 //
 
@@ -23,7 +23,7 @@ module @subgraph attributes {config.arch = #config.arch_kind<NPU40XX>, config.co
   VPURT.SW.Runtime entryPoint : @VPU.SW::@runtime stack_configuration : [4096, 4096, 4096, 4096, 4096, 4096]
   module @VPU.SW {
     func.func private @builtin_SoftMax(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, i64, i64) attributes {VPU.kernel_code = "softmax.cpp", VPU.kernel_entry = "softmax", VPU.task_type = @COMPUTE}
-    func.func private @builtin_TopK(memref<*xf16, @CMX_NN>, memref<*xf16, @CMX_NN>, memref<*xsi32, @CMX_NN>, i64, i64, i64, i64) attributes {VPU.kernel_code = "topk.cpp", VPU.kernel_entry = "topk", VPU.task_type = @COMPUTE}
+    func.func private @builtin_Convert(memref<*xf16, @CMX_NN>, memref<*xf32, @CMX_NN>) attributes {VPU.kernel_code = "convert.cpp", VPU.kernel_entry = "convert", VPU.kernel_name = "convert", VPU.task_type = @COMPUTE}
     func.func private @runtime() attributes {VPU.kernel_code = "nnActEntry"}
   }
   config.Resources {activity_factor = 0.078934384661980161 : f64} 2 of @NCE at 1.700000e+03 MHz {
@@ -105,18 +105,18 @@ module @subgraph attributes {config.arch = #config.arch_kind<NPU40XX>, config.co
     }
     }
 
-    %cmx0_top_k = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0TopK
+    %cmx0_convert = VPURT.DeclareBuffer <CMX_NN> [0] <0> -> !DummyCMX0Convert
     VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
-        %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 2, 0, 0>} @VPU.SW::@builtin_TopK inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx_0 as %arg4: !DummyCMX0T, %cmx0_top_k as %arg5: !DummyCMX0TopK) on tile 0 -> (!DummyCMX0T, !DummyCMX0TopK) {
-                VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX0T, !DummyCMX0T, !DummyCMX0TopK
-    }
+      %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_Convert inputs(%cmx_0 as %arg3: !DummyCMX0T) outputs(%cmx0_convert as %arg4: !DummyCMX0Convert) on tile 0 -> (!DummyCMX0Convert) {
+        VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX0T, !DummyCMX0Convert
+      }
     }
 
-    %cmx1_top_k = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1TopK
+    %cmx1_convert = VPURT.DeclareBuffer <CMX_NN> [1] <0> -> !DummyCMX1Convert
     VPURT.Task waits(%5: !VPURT.Barrier) updates(%6 : !VPURT.Barrier) {
-        %results:2 = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 2, 0, 0>} @VPU.SW::@builtin_TopK inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx_1 as %arg4: !DummyCMX1T, %cmx1_top_k as %arg5: !DummyCMX1TopK) on tile 1 -> (!DummyCMX1T, !DummyCMX1TopK) {
-                VPUIP.SW.Kernel.run {attrs = [1, 0, 0, 1]}(%arg3, %arg4, %arg5) : !DummyCMX1T, !DummyCMX1T, !DummyCMX1TopK
-    }
+      %results = VPUIP.SW.Kernel {resultSegmentSizes = array<i32: 1, 0, 0>} @VPU.SW::@builtin_Convert inputs(%cmx_1 as %arg3: !DummyCMX1T) outputs(%cmx1_convert as %arg4: !DummyCMX1Convert) on tile 1 -> (!DummyCMX1Convert) {
+        VPUIP.SW.Kernel.run {attrs = [[]]}(%arg3, %arg4) : !DummyCMX1T, !DummyCMX1Convert
+      }
     }
 
     VPURT.Task waits(%6: !VPURT.Barrier) updates(%7 : !VPURT.Barrier) {
@@ -158,15 +158,15 @@ module @subgraph attributes {config.arch = #config.arch_kind<NPU40XX>, config.co
     // CHECK:       VPURT.Task {
     // CHECK-NEXT:        VPUIP.SW.Kernel
     // CHECK-SAME:        skipProfiling
-    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+    // CHECK-SAME:        @VPU.SW::@builtin_Convert
 
     // CHECK:       VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
     // CHECK:             VPUIP.SW.Kernel
-    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+    // CHECK-SAME:        @VPU.SW::@builtin_Convert
 
     // CHECK:       VPURT.Task waits([[BARRIER_5]] : !VPURT.Barrier) updates([[BARRIER_6]] : !VPURT.Barrier) {
     // CHECK:             VPUIP.SW.Kernel
-    // CHECK-SAME:        @VPU.SW::@builtin_TopK
+    // CHECK-SAME:        @VPU.SW::@builtin_Convert
 
     // CHECK:       VPURT.Task waits([[BARRIER_6]] : !VPURT.Barrier) updates([[BARRIER_7]] : !VPURT.Barrier) {
     // CHECK-NEXT:        VPUIP.NNDMA

From a7cc88d4462bdeabd69066f126b2924c50288a56 Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Fri, 19 Dec 2025 00:14:04 +0800
Subject: [PATCH 11/13] Refactor code, rename variables and add comments

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 128 +++++++++---------
 ...struction_prefetch_mid_execution_40XX.mlir |   2 +-
 2 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index b68f26ce30..47a83f654c 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -31,10 +31,23 @@ using namespace vpux;
 
 namespace {
 
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // used for sort
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
         "activation_swish", "eltwise_mul",    "softmax",       "convert",        "rms_norm",
         "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
+static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
+        "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
+
 //
 // AddSwKernelInstructionPrefetch
 //
@@ -82,6 +95,9 @@ class AddSwKernelInstructionPrefetch final :
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
+    std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
+                                                               uint64_t targetKernelGroupStartTime,
+                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters);
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
             mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
             VPURT::TaskConfigVec& allTasks);
@@ -101,6 +117,12 @@ class AddSwKernelInstructionPrefetch final :
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
     size_t _dynamicPrefetchTileCounter = 0;
+    // Only SW tasks on tile 1 are used as insertion anchors for mid-execution prefetch, so nothing is inserted
+    // when no SW task runs on tile 1 (e.g. when only a single tile is available).
+    int64_t _targetInsertTileDuringExec = 1;
+    // The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
+    // to perform instruction prefetching without causing stalls.
+    uint64_t _prefetchGapThresholdDuringExec = 50000;
 };
 
 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -248,9 +270,7 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));
 
-    auto args = (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos" ||
-                 kernelName == "activation_sin" || kernelName == "eltwise_equal" || kernelName == "eltwise_select" ||
-                 kernelName == "rms_norm")
+    auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
                         ? mlir::ArrayAttr::get(moduleOp->getContext(), {})
                         : kernelNameToArgs[kernelName];
 
@@ -427,6 +447,21 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }
 
+size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break;
+        }
+    }
+    return count;
+}
+
 uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
                                  std::map<uint64_t, size_t>& swKernelCountsCache) {
     // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
@@ -437,19 +472,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
         uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
 
         if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
-            size_t swKernelCount = 0;
-            // Count all SW Kernels that start at this specific time
-            for (auto& task : allTasks) {
-                if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
-                    if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
-                        swKernelCount++;
-                    }
-                }
-                if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
-                    break;
-                }
-            }
-            swKernelCountsCache[currentStartTime] = swKernelCount;
+            swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
         }
 
         if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
@@ -460,43 +483,17 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
     return std::numeric_limits<uint64_t>::max();
 }
 
-struct GapCandidate {
-    uint64_t lookaheadGap = 0;
-    int64_t insertionPointTaskIndex = -1;
-
-    // used for sort
-    bool operator>(const GapCandidate& other) const {
-        return lookaheadGap > other.lookaheadGap;
-    }
-};
-
-size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
-    size_t count = 0;
-    for (auto& taskConfig : allTasks) {
-        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
-            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
-                count++;
-            }
-        }
-        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
-            break;
-        }
-    }
-    return count;
-}
-
-std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName, uint64_t targetKernelGroupStartTime,
-                                                 VPURT::TaskConfigVec& allTasks, size_t numClusters, Logger& log) {
-    const int64_t targetInsertTile = 1;
-    const uint64_t GAP_THRESHOLD = 50000;
+std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
+        const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
+        size_t numClusters) {
     const size_t saturationThreshold = numClusters * 2;
 
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
 
-    int64_t previousT1TaskIndex = -1;
-    uint64_t previousT1TaskStartTime = 0;
+    int64_t prevTargetTileTaskIndex = -1;
+    uint64_t prevTargetTileTaskStartTime = 0;
 
     // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
     for (size_t i = 0; i < allTasks.size(); ++i) {
@@ -506,43 +503,43 @@ std::optional<GapCandidate> findBestInsertionGap(const std::string& kernelName,
             break;
         }
 
-        bool isT1Task = false;
+        bool isTargetTileTask = false;
         if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
-            isT1Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
         }
 
-        if (previousT1TaskIndex != -1 && isT1Task) {
-            auto& insertionPointTask = allTasks[previousT1TaskIndex];
+        if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
+            auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
             auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
 
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
 
             if (simultaneousSwKernels < saturationThreshold) {
                 uint64_t nextSaturationStart =
-                        findNextSaturationStart(previousT1TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                        findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
-                if (gapEnd > previousT1TaskStartTime) {
-                    lookaheadGap = gapEnd - previousT1TaskStartTime;
+                if (gapEnd > prevTargetTileTaskStartTime) {
+                    lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
                 }
 
-                if (lookaheadGap >= GAP_THRESHOLD) {
+                if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
                     GapCandidate gap;
                     gap.lookaheadGap = lookaheadGap;
-                    gap.insertionPointTaskIndex = previousT1TaskIndex;
+                    gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
                     validGaps[lookaheadGap] = gap;
                 }
             }
         }
 
-        if (isT1Task) {
-            previousT1TaskIndex = static_cast<int64_t>(i);
-            previousT1TaskStartTime = currentTaskStartTime;
+        if (isTargetTileTask) {
+            prevTargetTileTaskIndex = static_cast<int64_t>(i);
+            prevTargetTileTaskStartTime = currentTaskStartTime;
         }
     }
 
     if (validGaps.empty()) {
-        log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        _log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
         return std::nullopt;
     }
 
@@ -573,7 +570,16 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
 
         auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
 
-        auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime, allTasks, numClusters, _log);
+        // Finds the best insertion point for prefetch by identifying non-saturated execution windows.
+        // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
+        // duration from an anchor task to the next saturation event or the target kernel start.
+        //
+        // Logic:
+        // 1. Find a candidate task on the target tile.
+        // 2. Ensure NPU is not saturated at that time.
+        // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
+        // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold.
+        auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
 
         if (!bestGapOpt.has_value()) {
             _log.trace("Kernel '{0}': No valid gap found.", kernelName);
diff --git a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
index 3eb61f2652..7c75b60da1 100644
--- a/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
+++ b/tests/lit/NPU/dialect/VPUIP/passes/add_sw_kernel_instruction_prefetch_mid_execution_40XX.mlir
@@ -1,5 +1,5 @@
 //
-// Copyright (C) 2024-2025 Intel Corporation.
+// Copyright (C) 2025 Intel Corporation.
 // SPDX-License-Identifier: Apache-2.0
 //
 

From 0e46bcd0c231f7eefb8d7ba485dc03f4c3dc271a Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Wed, 14 Jan 2026 00:30:22 +0800
Subject: [PATCH 12/13] Address code review comments

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 42 +++++++------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 47a83f654c..2d8c77b7ff 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -42,8 +42,8 @@ struct GapCandidate {
 };
 
 static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
-        "activation_swish", "eltwise_mul",    "softmax",       "convert",        "rms_norm",
-        "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
+        "eltwise_mul",    "softmax",       "convert",        "rms_norm",      "activation_swish",
+        "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};
 
 static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
         "convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};
@@ -462,12 +462,9 @@ size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks
     return count;
 }
 
-uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
+uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t saturationThreshold,
                                  std::map<uint64_t, size_t>& swKernelCountsCache) {
-    // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
-    const size_t saturationThreshold = numClusters * 2;
-
-    // Iterate through tasks strictly AFTER the startIndex
+    // Iterate through tasks strictly after the startIndex
     for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
         uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
 
@@ -485,9 +482,7 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
 
 std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
         const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
-        size_t numClusters) {
-    const size_t saturationThreshold = numClusters * 2;
-
+        size_t saturationThreshold) {
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
@@ -504,7 +499,7 @@ std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGap
         }
 
         bool isTargetTileTask = false;
-        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
+        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
             isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
         }
 
@@ -515,8 +510,8 @@ std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGap
             size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
 
             if (simultaneousSwKernels < saturationThreshold) {
-                uint64_t nextSaturationStart =
-                        findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t nextSaturationStart = findNextSaturationStart(prevTargetTileTaskIndex, allTasks,
+                                                                       saturationThreshold, swKernelCountsCache);
                 uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
                 uint64_t lookaheadGap = 0;
                 if (gapEnd > prevTargetTileTaskStartTime) {
@@ -551,24 +546,18 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
         VPURT::TaskConfigVec& allTasks) {
     auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
     const auto numClusters = getNumTiles(moduleOp);
-    VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");
+    const auto noOfShavesPerCluster =
+            config::getTileExecutor(moduleOp).getSubExecutor(VPU::ExecutorKind::SHAVE_ACT).getCount();
+    _log.info("numClusters {0}, noOfShavesPerCluster: {1}", numClusters, noOfShavesPerCluster);
 
     std::vector<VPUIP::SwKernelOp> prefetchedKernels{};
 
     for (auto& kernelInfo : kernelsToPrefetch) {
         std::string kernelName = std::get<0>(kernelInfo);
         size_t firstAppearanceIndex = std::get<2>(kernelInfo);
-
-        if (firstAppearanceIndex >= allTasks.size()) {
-            _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
-            continue;
-        }
-        if (kernelNameToOps.count(kernelName) == 0) {
-            _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
-            continue;
-        }
-
         auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
+        // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster)
+        const size_t saturationThreshold = numClusters * noOfShavesPerCluster;
 
         // Finds the best insertion point for prefetch by identifying non-saturated execution windows.
         // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
@@ -579,7 +568,8 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
         // 2. Ensure NPU is not saturated at that time.
         // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
         // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold.
-        auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);
+        auto bestGapOpt =
+                findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, saturationThreshold);
 
         if (!bestGapOpt.has_value()) {
             _log.trace("Kernel '{0}': No valid gap found.", kernelName);
@@ -659,7 +649,7 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     if (firstShaveTaskInIR) {
         _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
         newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
-    } else {
+    } else if (_useDummyKernelForInstructionPrefetch) {
         newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
     }
 

From 768056428be3d624e4eb85511520a46d1de27c5e Mon Sep 17 00:00:00 2001
From: Kepontry <zjpzhoujiapeng@163.com>
Date: Thu, 15 Jan 2026 11:56:43 +0800
Subject: [PATCH 13/13] Address code review comments

---
 .../add_sw_kernel_instruction_prefetch.cpp    | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
index 2d8c77b7ff..b17a6442cc 100644
--- a/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
+++ b/src/vpux_compiler/src/dialect/VPUIP/transforms/passes/add_sw_kernel_instruction_prefetch.cpp
@@ -97,7 +97,8 @@ class AddSwKernelInstructionPrefetch final :
                                                        mlir::Value bestUpdateBarrier);
     std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
                                                                uint64_t targetKernelGroupStartTime,
-                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters);
+                                                               VPURT::TaskConfigVec& allTasks, size_t numClusters,
+                                                               size_t noOfShavesPerCluster);
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
             mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
             VPURT::TaskConfigVec& allTasks);
@@ -117,9 +118,9 @@ class AddSwKernelInstructionPrefetch final :
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
     size_t _dynamicPrefetchTileCounter = 0;
-    // Using Tile 1 as the target for insertion to enable prefetching only when the available tile count is larger
-    // than 1.
-    int64_t _targetInsertTileDuringExec = 1;
+    // Reference tile for gap finding: only SW kernel tasks on this tile are considered as insertion anchors.
+    // Index 1 limits mid-execution prefetching to schedules with SW kernel tasks on tile 1 (i.e. >= 2 tiles).
+    int64_t _referenceTileForGapFindingDuringExec = 1;
     // The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
     // to perform instruction prefetching without causing stalls.
     uint64_t _prefetchGapThresholdDuringExec = 50000;
@@ -482,11 +483,13 @@ uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec&
 
 std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
         const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
-        size_t saturationThreshold) {
+        size_t numClusters, size_t noOfShavesPerCluster) {
     // <LookaheadGapSize, GapCandidate>
     std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
     std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
 
+    // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster)
+    const size_t saturationThreshold = numClusters * noOfShavesPerCluster;
     int64_t prevTargetTileTaskIndex = -1;
     uint64_t prevTargetTileTaskStartTime = 0;
 
@@ -500,14 +503,14 @@ std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGap
 
         bool isTargetTileTask = false;
         if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
-            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
+            isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _referenceTileForGapFindingDuringExec);
+        }
+        if (!isTargetTileTask) {
+            continue;
         }
 
-        if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
-            auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
-            auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
-
-            size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
+        if (prevTargetTileTaskIndex != -1) {
+            size_t simultaneousSwKernels = getSwKernelCountAtTime(prevTargetTileTaskStartTime, allTasks);
 
             if (simultaneousSwKernels < saturationThreshold) {
                 uint64_t nextSaturationStart = findNextSaturationStart(prevTargetTileTaskIndex, allTasks,
@@ -526,11 +529,8 @@ std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGap
                 }
             }
         }
-
-        if (isTargetTileTask) {
-            prevTargetTileTaskIndex = static_cast<int64_t>(i);
-            prevTargetTileTaskStartTime = currentTaskStartTime;
-        }
+        prevTargetTileTaskIndex = static_cast<int64_t>(i);
+        prevTargetTileTaskStartTime = currentTaskStartTime;
     }
 
     if (validGaps.empty()) {
@@ -556,8 +556,6 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
         std::string kernelName = std::get<0>(kernelInfo);
         size_t firstAppearanceIndex = std::get<2>(kernelInfo);
         auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
-        // Saturation is defined as the total number of SHAVE cores (clusters * shaves per cluster)
-        const size_t saturationThreshold = numClusters * noOfShavesPerCluster;
 
         // Finds the best insertion point for prefetch by identifying non-saturated execution windows.
         // Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
@@ -568,8 +566,8 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
         // 2. Ensure NPU is not saturated at that time.
         // 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
         // 4. Return the candidate with the largest Gap >= _prefetchGapThreshold.
-        auto bestGapOpt =
-                findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, saturationThreshold);
+        auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters,
+                                                         noOfShavesPerCluster);
 
         if (!bestGapOpt.has_value()) {
             _log.trace("Kernel '{0}': No valid gap found.", kernelName);