From 5e18fb14e74601976b976cfe0da4a259e1585248 Mon Sep 17 00:00:00 2001
From: "Doronin, Maksim" <maksim.doronin@intel.com>
Date: Fri, 27 Feb 2026 11:02:05 +0000
Subject: [PATCH 1/3] Revert Adding Compress Convolution as part of VF fusion

---
 .../interfaces/scf/scf_tiling_interfaces.hpp  |   2 -
 .../dialect/VPU/interfaces/scf_tiling_ops.cpp |   1 -
 .../VPU/IR/ops/nce_compress_convolution.cpp   |  27 --
 .../vpux/compiler/dialect/VPU/ops/dpu.td      |   4 +-
 .../VPU/passes/apply_tiling_scf_40XX+.mlir    | 142 ----------
 .../VPU/passes/scf_vertical_fusion_40XX+.mlir | 244 ------------------
 6 files changed, 1 insertion(+), 419 deletions(-)

diff --git a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp
index d27411f807..9400da137b 100644
--- a/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp
+++ b/src/vpux_compiler/include/vpux/compiler/dialect/VPU/interfaces/scf/scf_tiling_interfaces.hpp
@@ -502,8 +502,6 @@ class SCFTilingConvModelOp : public SCFTilingCommonModelOp<ConcreteModel, Concre
 
 class SCFConvOpModel : public SCFTilingConvModelOp<SCFConvOpModel, NCEConvolutionOp> {};
 
-class SCFCompressConvOpModel : public SCFTilingConvModelOp<SCFCompressConvOpModel, NCECompressConvolutionOp> {};
-
 class SCFTilingDepthConvModelOp : public SCFTilingConvModelOp<SCFTilingDepthConvModelOp, NCEDepthConvolutionOp> {
 public:
     SCFTilingInfo backInferSCFTileInfo(mlir::Operation* operation, mlir::OpBuilder& builder,
diff --git a/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp b/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp
index 6eced0936f..26d9d4bdcd 100644
--- a/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp
+++ b/src/vpux_compiler/src/NPU40XX/dialect/VPU/interfaces/scf_tiling_ops.cpp
@@ -14,7 +14,6 @@ void vpux::VPU::arch40xx::registerSCFTilingOpsInterfaces(mlir::DialectRegistry&
         VPU::NCEAveragePoolOp::attachInterface<vpux::VPU::SCFAvgPoolOpModel>(*ctx);
         VPU::NCEMaxPoolOp::attachInterface<vpux::VPU::SCFMaxPoolOpModel>(*ctx);
         VPU::NCEConvolutionOp::attachInterface<vpux::VPU::SCFConvOpModel>(*ctx);
-        VPU::NCECompressConvolutionOp::attachInterface<vpux::VPU::SCFCompressConvOpModel>(*ctx);
         VPU::NCEDepthConvolutionOp::attachInterface<vpux::VPU::SCFTilingDepthConvModelOp>(*ctx);
         VPU::NCEPermuteOp::attachInterface<vpux::VPU::SCFTilingPermuteModelOp>(*ctx);
 
diff --git a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp
index 24ae504b07..b2f69507eb 100644
--- a/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp
+++ b/src/vpux_compiler/src/dialect/VPU/IR/ops/nce_compress_convolution.cpp
@@ -426,30 +426,3 @@ vpux::VPU::SparsitySupport vpux::VPU::NCECompressConvolutionOp::sparsitySupport(
     }
     return VPU::SparsitySupport::SPARSE_OUTPUTS & excludeMode;
 }
-
-mlir::LogicalResult vpux::VPU::NCECompressConvolutionOp::reifyResultShapes(
-        mlir::OpBuilder& builder, mlir::ReifiedRankedShapedTypeDims& reifiedReturnShapes) {
-    // Parse attributes
-    const auto strides = parseIntArrayAttr<int64_t>(getStrides());
-
-    const auto padTop = getPad().getTop().getValue().getSExtValue();
-    const auto padBottom = getPad().getBottom().getValue().getSExtValue();
-    const auto padLeft = getPad().getLeft().getValue().getSExtValue();
-    const auto padRight = getPad().getRight().getValue().getSExtValue();
-
-    const auto dataPaddingAbove = SmallVector<int64_t>({padTop, padLeft});
-    const auto dataPaddingBelow = SmallVector<int64_t>({padBottom, padRight});
-
-    const auto rawFilterShape = Shape(parseIntArrayAttr<int64_t>(getRawFilterShape()));
-    SmallVector<int64_t> kernelSize{rawFilterShape[Dims4D::Filter::KY], rawFilterShape[Dims4D::Filter::KX]};
-
-    // Compute output shape using utility
-    auto outShape = reifyConvPoolTensors(builder, getInput(), getOutput(), getFilter(), kernelSize, strides,
-                                         dataPaddingAbove, dataPaddingBelow, getLoc());
-    if (mlir::failed(outShape)) {
-        return outShape;
-    }
-
-    reifiedReturnShapes.emplace_back(std::move(outShape.value()));
-    return mlir::success();
-}
diff --git a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td
index 0d9299f52c..63d86dccaf 100644
--- a/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td
+++ b/src/vpux_compiler/tblgen/vpux/compiler/dialect/VPU/ops/dpu.td
@@ -425,9 +425,7 @@ def VPU_NCECompressConvolutionOp :
                                                                      "doesLayerFitIntoCMX",
                                                                      "doesLayerChangeOutputAlignmentFitIntoCMX",
                                                                      "getDistributedTypeForOpOperand"]>,
-                DeclareOpInterfaceMethods<VPU_SparseOpInterface>,
-                DeclareOpInterfaceMethods<VPU_VerticalFusionOpInterface>,
-                DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface>
+                DeclareOpInterfaceMethods<VPU_SparseOpInterface>
 ]
         > {
     let summary = "NCE version of Compressed Convolution layer";
diff --git a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir
index 53efde9e36..145f0370f6 100644
--- a/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir
+++ b/tests/lit/NPU/dialect/VPU/passes/apply_tiling_scf_40XX+.mlir
@@ -1166,145 +1166,3 @@ func.func @ApplyTilingD2SPadded(
 
     //CHECK: return [[H_LOOP]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 1080, 1920]> : tensor<4xsi64>, order = #NHWC}>
 }
-
-// -----
-
-//CHECK: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 11)>
-
-#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-
-!dynInputType = tensor<1x4x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-!dynOutputType = tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-// CHECK-LABEL: @NoPaddingCompressCONV_W_DynamicInput
-// CHECK-SAME:      [[INPUT:%arg[0-9]]]: tensor<1x4x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-// CHECK-SAME:      [[WEIGHTS:%arg[0-9]]]: tensor<32x4x1x1xf16, {order = #NHWC}>
-// CHECK-SAME:      [[WEIGHTS_TABLE:%arg[0-9]]]: tensor<32x1x1x4xsi32>
-func.func @NoPaddingCompressCONV_W_DynamicInput(
-         %arg0: !dynInputType,
-         %arg1: tensor<32x4x1x1xf16, {order = #NHWC}>,
-         %arg2: tensor<32x1x1x4xsi32>
- ) -> !dynOutputType {
-     %1 = VPU.NCE.CompressConvolution(%arg0, %arg1, %arg2) {
-         pad = #VPU.Padding<
-             left = 0 : i64,
-             right = 0 : i64,
-             top = 0 : i64,
-             bottom = 0 : i64
-         >,
-         ppe = #VPU.PPEInt<
-             mode = <NOOP>,
-             clamp_low = -2147483648 : i64,
-             clamp_high = 2147483647 : i64,
-             lrelu_mult = 1 : i64,
-             lrelu_shift = 0 : i64,
-             fp_prelu_alpha = 1.000000e+00 : f64
-         >,
-         rawFilterShape = [32, 4, 1, 1],
-         strides = [1, 1],
-         tilingStrategy = [1, 1, 1, 117],
-         cm_sp_pattern = 0
-     } : !dynInputType, tensor<32x4x1x1xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> !dynOutputType
-
-    //CHECK-DAG: [[DIM_VALUE_0:%.+]] = arith.constant 3 : index
-    //CHECK-DAG: [[DIM_0:%.+]] = tensor.dim [[INPUT]], [[DIM_VALUE_0]] : tensor<1x4x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK-DAG: [[C0:%.+]] = arith.constant 0 : index
-    //CHECK-DAG: [[C11:%.+]] = arith.constant 11 : index
-    //CHECK-DAG: [[LOOP_OUTPUT:%.+]] = tensor.empty([[DIM_0]]) : tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK: [[LOOP:%.+]] = scf.for
-    //CHECK-SAME:           [[LOOP_ITER:%arg[0-9]]] = [[C0]] to [[DIM_0]] step [[C11]]
-    //CHECK-SAME:           iter_args([[LOOP_OUT:%arg[0-9]]]  = [[LOOP_OUTPUT]]) -> (tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>) {
-
-    //CHECK:                [[SIZE:%.+]] = affine.min #[[$MAP]]([[LOOP_ITER]])[[[DIM_0]]]
-    //CHECK:                [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, 0, [[LOOP_ITER]]] [1, 4, 800, [[SIZE]]] [1, 1, 1, 1]
-    //CHECK-SAME:           : tensor<1x4x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 11]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                [[COMPRESS_CONV:%.+]] = VPU.NCE.CompressConvolution([[SLICE]], [[WEIGHTS]], [[WEIGHTS_TABLE]])
-    //CHECK-SAME:           {cm_sp_pattern = 0 : i64, pad = #VPU.Padding<left = 0 : i64, right = 0 : i64, top = 0 : i64, bottom = 0 : i64>
-    //CHECK-SAME:           , ppe = #VPU.PPEInt<mode = <NOOP>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 4, 1, 1], strides = [1, 1], tiling_loop_index = 0 : i64} -> tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 11]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                [[INSERT:%.+]] = tensor.insert_slice [[COMPRESS_CONV]] into [[LOOP_OUT]][0, 0, 0, [[LOOP_ITER]]] [1, 32, 800, [[SIZE]]] [1, 1, 1, 1] : tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 11]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                scf.yield [[INSERT]] : tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-     return %1 : !dynOutputType
-
-    //CHECK: return [[LOOP]] : tensor<1x32x800x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-}
-
-// -----
-
-//CHECK: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 400)>
-//CHECK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 11)>
-
-#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-
-!dynInputType = tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-!dynOutputType = tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-// CHECK-LABEL: @NoPaddingCompressCONV_HW_DynamicInput
-// CHECK-SAME:      [[INPUT:%arg[0-9]]]: tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-// CHECK-SAME:      [[WEIGHTS:%arg[0-9]]]: tensor<32x4x1x1xf16, {order = #NHWC}>
-// CHECK-SAME:      [[WEIGHTS_TABLE:%arg[0-9]]]: tensor<32x1x1x4xsi32>
-func.func @NoPaddingCompressCONV_HW_DynamicInput(
-         %arg0: !dynInputType,
-         %arg1: tensor<32x4x1x1xf16, {order = #NHWC}>,
-         %arg2: tensor<32x1x1x4xsi32>
- ) -> !dynOutputType {
-     %1 = VPU.NCE.CompressConvolution(%arg0, %arg1, %arg2) {
-         pad = #VPU.Padding<
-             left = 0 : i64,
-             right = 0 : i64,
-             top = 0 : i64,
-             bottom = 0 : i64
-         >,
-         ppe = #VPU.PPEInt<
-             mode = <NOOP>,
-             clamp_low = -2147483648 : i64,
-             clamp_high = 2147483647 : i64,
-             lrelu_mult = 1 : i64,
-             lrelu_shift = 0 : i64,
-             fp_prelu_alpha = 1.000000e+00 : f64
-         >,
-         rawFilterShape = [32, 4, 1, 1],
-         strides = [1, 1],
-         tilingStrategy = [1, 1, 2, 117],
-         cm_sp_pattern = 0
-     } : !dynInputType, tensor<32x4x1x1xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32> -> !dynOutputType
-
-    //CHECK-DAG: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index
-
-    //CHECK-DAG: [[DIM_VALUE_H_1:%.+]] = arith.constant 2 : index
-    //CHECK-DAG: [[LOOP_END_H:%.+]] = tensor.dim [[INPUT]], [[DIM_VALUE_H_1]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK-DAG: [[DIM_VALUE_W_1:%.+]] = arith.constant 3 : index
-    //CHECK-DAG: [[LOOP_END_W:%.+]] = tensor.dim [[INPUT]], [[DIM_VALUE_W_1]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK-DAG: [[LOOP_STEP_H:%.+]] = arith.constant 400 : index
-
-    //CHECK-DAG: [[LOOP_STEP_W:%.+]] = arith.constant 11 : index
-    
-	//CHECK-DAG: [[LOOP_OUTPUT:%.+]] = tensor.empty([[LOOP_END_H]], [[LOOP_END_W]]) : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-    
-	//CHECK: [[LOOP_H:%.+]] = scf.for
-    //CHECK-SAME:           [[LOOP_ITER_H:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_H]] step [[LOOP_STEP_H]]
-    //CHECK-SAME:           iter_args([[LOOP_OUT_H:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>) {
-
-
-    //CHECK: [[LOOP_W:%.+]] = scf.for
-    //CHECK-SAME:           [[LOOP_ITER_W:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_W]] step [[LOOP_STEP_W]]
-    //CHECK-SAME:           iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUT_H]]) -> (tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>) {
-
-    //CHECK:                [[SIZE_H:%.+]] = affine.min #[[$MAP]]([[LOOP_ITER_H]])[[[LOOP_END_H]]]
-    //CHECK:                [[SIZE_W:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_W]])[[[LOOP_END_W]]]
-
-    //CHECK:                [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 4, [[SIZE_H]], [[SIZE_W]]] [1, 1, 1, 1]
-    //CHECK-SAME:           : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 800, 1280]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 400, 11]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                [[COMPRESS_CONV:%.+]] = VPU.NCE.CompressConvolution([[SLICE]], [[WEIGHTS]], [[WEIGHTS_TABLE]])
-    //CHECK-SAME:           {cm_sp_pattern = 0 : i64, pad = #VPU.Padding<left = 0 : i64, right = 0 : i64, top = 0 : i64, bottom = 0 : i64>
-    //CHECK-SAME:           , ppe = #VPU.PPEInt<mode = <NOOP>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64, lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>, rawFilterShape = [32, 4, 1, 1], strides = [1, 1], tiling_loop_index = 0 : i64} -> tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 400, 11]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                [[INSERT:%.+]] = tensor.insert_slice [[COMPRESS_CONV]] into [[LOOP_OUT]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 32, [[SIZE_H]], [[SIZE_W]]] [1, 1, 1, 1] : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 400, 11]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                scf.yield [[INSERT]] : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 800, 1280]> : tensor<4xsi64>, order = #NHWC}>
-
-     return %1 : !dynOutputType
-}
diff --git a/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir b/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir
index 559198b229..0caff9b889 100644
--- a/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir
+++ b/tests/lit/NPU/dialect/VPU/passes/scf_vertical_fusion_40XX+.mlir
@@ -1470,247 +1470,3 @@ func.func @PermuteEltwiseFusion(%arg0: tensor<1x16x?x?xf16, {bounds = #const.Opa
     // CHECK:  scf.yield [[LOOP_W]]
     // CHECK: return [[LOOP_H]] : tensor<1x16x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 16, 1280, 1280]> : tensor<4xsi64>, order = #NHWC}>
 }
-
-// -----
-
-config.Resources 3 of @NCE at 1.700000e+03 MHz {
-    config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-    config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-    config.ExecutorResource 2 of @SHAVE_ACT
-    config.ExecutorResource 1 of @DPU
-}
-
-#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-
-//CHECK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 45)>
-//CHECK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 320)>
-//CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (0, d0 - 1)>
-//CHECK: #[[$MAP3:.+]] = affine_map<(d0) -> (-d0 + 1, 0)>
-//CHECK: #[[$MAP4:.+]] = affine_map<()[s0] -> (1, s0)>
-//CHECK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (0, d0 + d1 - s0 + 2)>
-//CHECK: #[[$MAP6:.+]] = affine_map<(d0, d1, d2, d3)[s0] -> (0, d0 + d1 - d2 - d3 - s0 + 4)>
-//CHECK: #[[$MAP7:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (-d0 - d1 + d2 - d3 - d4 + 4)>
-
-
-// CHECK-LABEL: @Merge2DVFChainCompressConv
-// CHECK-SAME:  [[INPUT:%arg[0-9]]]: tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>)
-func.func @Merge2DVFChainCompressConv(%arg0: tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
- {
-    %cst = const.Declare tensor<32x4x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x4x3x3xf32>, [#const.CastElemType<f16>, #const.Reorder<#NHWC>]
-    %cst_3 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32>
-    %cst_0 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType<f16>, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>]
-    %cst_1 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType<f16>, #const.Reorder<#NHWC>]
-
-    %0 = VPU.NCE.CompressConvolution(%arg0, %cst, %cst_3) {mpe_engine = #VPU.MPEEngine37XX<mode = <SCL>>,
-                                           multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                           pad = #VPU.Padding<left = 1 : i64, right = 1 : i64, top = 1 : i64, bottom = 1 : i64>,
-                                           ppe = #VPU.PPEInt<mode = <LRELU>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                           lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                           rawFilterShape = [32, 4, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21], cm_sp_pattern = 0}
-        : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>,
-          tensor<32x4x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32>
-        -> tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    %1 = VPU.NCE.DepthConvolution(%0, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                                       pad = #VPU.Padding<left = 0 : i64, right = 0 : i64, top = 0 : i64, bottom = 0 : i64>,
-                                                       ppe = #VPU.PPEInt<mode = <NOOP>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                                       lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                                       rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 20]}
-        -> tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    %2 = VPU.NCE.Convolution(%1, %cst_1) {mpe_engine = #VPU.MPEEngine37XX<mode = <SCL>>,
-                                          multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                          pad = #VPU.Padding<left = 1 : i64, right = 1 : i64, top = 1 : i64, bottom = 1 : i64>,
-                                          ppe = #VPU.PPEInt<mode = <LRELU>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                          lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                          rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]}
-        : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>,
-          tensor<32x32x3x3xf16, {order = #NHWC}>
-          -> tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    return %2: tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK-DAG: [[DIM_INDEX_W:%.+]] = arith.constant 3 : index
-    //CHECK-DAG: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16
-    //CHECK-DAG: [[DIM_INDEX_H:%.+]] = arith.constant 2 : index
-    //CHECK-DAG: [[LOOP_STEP_W:%.+]] = arith.constant 320 : index
-    //CHECK-DAG: [[LOOP_STEP_H:%.+]] = arith.constant 45 : index
-    //CHECK-DAG: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index
-
-    //CHECK: [[DIM_H:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_H]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[DIM_W:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[LOOP_OUTPUT:%.+]] = tensor.empty([[DIM_H]], [[DIM_W]]) : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[LOOP_END_H:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_H]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[LOOP_END_W:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK: [[LOOP_H:%.+]] = scf.for
-    //CHECK-SAME:             [[LOOP_ITER_H:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_H]] step [[LOOP_STEP_H]]
-    //CHECK-SAME:             iter_args([[LOOP_OUT:%arg[0-9]]] = [[LOOP_OUTPUT]]) -> (tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>) {
-
-    //CHECK:                  [[LOOP_W:%.+]] = scf.for
-    //CHECK-SAME:             [[LOOP_ITER_W:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_W]] step [[LOOP_STEP_W]]
-    //CHECK-SAME:             iter_args([[LOOP_OUT_W:%arg[0-9]]] = [[LOOP_OUT]]) -> (tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>) {
-
-    //CHECK:                  [[INSERT_SIZE_H:%.+]] = affine.min #[[$MAP0]]([[LOOP_ITER_H]])[[[LOOP_END_H]]]
-    //CHECK:                  [[INSERT_SIZE_W:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_W]])[[[LOOP_END_W]]]
-
-    //CHECK:                  [[DIM_H_1:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_H]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[DIM_W_1:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[TMP_VALUE7:%.+]] = affine.max #[[$MAP2]]([[LOOP_ITER_H]])
-    //CHECK:                  [[TMP_VALUE6:%.+]] = affine.max #[[$MAP3]]([[LOOP_ITER_H]])
-    //CHECK:                  [[PAD1_LOW_H:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE6]]]
-    //CHECK:                  [[TMP_VALUE9:%.+]] = affine.max #[[$MAP5]]([[INSERT_SIZE_H]], [[TMP_VALUE7]])[[[DIM_H_1]]]
-    //CHECK:                  [[PAD1_HIGH_H:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE9]]]
-    //CHECK:                  [[TMP_VALUE5:%.+]] = affine.max #[[$MAP2]]([[LOOP_ITER_W]])
-    //CHECK:                  [[TMP_VALUE8:%.+]] = affine.max #[[$MAP3]]([[LOOP_ITER_W]])
-    //CHECK:                  [[PAD1_LOW_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE8]]]
-    //CHECK:                  [[TMP_VALUE4:%.+]] = affine.max #[[$MAP5]]([[INSERT_SIZE_W]], [[TMP_VALUE5]])[[[DIM_W_1]]]
-    //CHECK:                  [[PAD1_HIGH_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE4]]]
-
-    //CHECK:                  [[DIM_H_2:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_H]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[DIM_W_2:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[SLICE_OFFSET_H:%.+]] = affine.max #[[$MAP2]]([[TMP_VALUE7]])
-    //CHECK:                  [[TMP_VALUE3:%.+]] = affine.max #[[$MAP3]]([[TMP_VALUE7]])
-    //CHECK:                  [[PAD0_LOW_H:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE3]]]
-    //CHECK:                  [[TMP_VALUE2:%.+]] = affine.max #[[$MAP6]]([[SLICE_OFFSET_H]], [[INSERT_SIZE_H]], [[PAD1_LOW_H]], [[PAD1_HIGH_H]])[[[DIM_H_2]]]
-    //CHECK:                  [[PAD0_HIGH_H:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE2]]]
-    //CHECK:                  [[SLICE_SIZE_H:%.+]] = affine.apply #[[$MAP7]]([[PAD0_LOW_H]], [[PAD0_HIGH_H]], [[INSERT_SIZE_H]], [[PAD1_LOW_H]], [[PAD1_HIGH_H]])
-    //CHECK:                  [[SLICE_OFFSET_W:%.+]] = affine.max #[[$MAP2]]([[TMP_VALUE5]])
-    //CHECK:                  [[TMP_VALUE1:%.+]] = affine.max #[[$MAP3]]([[TMP_VALUE5]])
-    //CHECK:                  [[PAD0_LOW_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE1]]]
-    //CHECK:                  [[TMP_VALUE0:%.+]] = affine.max #[[$MAP6]]([[SLICE_OFFSET_W]], [[INSERT_SIZE_W]], [[PAD1_LOW_W]], [[PAD1_HIGH_W]])[[[DIM_W_2]]]
-    //CHECK:                  [[PAD0_HIGH_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE0]]]
-    //CHECK:                  [[SLICE_SIZE_W:%.+]] = affine.apply #[[$MAP7]]([[PAD0_LOW_W]], [[PAD0_HIGH_W]], [[INSERT_SIZE_W]], [[PAD1_LOW_W]], [[PAD1_HIGH_W]])
-
-    //CHECK:                  [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, [[SLICE_OFFSET_H]], [[SLICE_OFFSET_W]]] [1, 4, [[SLICE_SIZE_H]], [[SLICE_SIZE_W]]] [1, 1, 1, 1]
-    //CHECK-SAME:             tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 45, 320]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[PAD0:%.+]] = tensor.pad [[SLICE]] low[0, 0, [[PAD0_LOW_H]], [[PAD0_LOW_W]]] high[0, 0, [[PAD0_HIGH_H]], [[PAD0_HIGH_W]]] {
-    //CHECK:                  tensor.yield [[PAD_VALUE]] : f16
-    //CHECK:                  tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 45, 320]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 47, 322]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[CONV0:%.+]]  = VPU.NCE.CompressConvolution([[PAD0]]
-    //CHECK:                  [[DWCONV:%.+]] = VPU.NCE.DepthConvolution([[CONV0]]
-    //CHECK:                  [[PAD1:%.+]] = tensor.pad [[DWCONV]] low[0, 0, [[PAD1_LOW_H]], [[PAD1_LOW_W]]] high[0, 0, [[PAD1_HIGH_H]], [[PAD1_HIGH_W]]] {
-    //CHECK:                  tensor.yield [[PAD_VALUE]] : f16
-    //CHECK:                  tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 45, 320]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 47, 322]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[CONV1:%.+]] = VPU.NCE.Convolution([[PAD1]]
-    //CHECK:                  [[INSERT:%.+]] = tensor.insert_slice [[CONV1]] into [[LOOP_OUT_W]][0, 0, [[LOOP_ITER_H]], [[LOOP_ITER_W]]] [1, 32, [[INSERT_SIZE_H]], [[INSERT_SIZE_W]]] [1, 1, 1, 1]
-    //CHECK-SAME:             tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 45, 320]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  scf.yield [[INSERT]]
-
-    //CHECK:  scf.yield [[LOOP_W]]
-
-    //CHECK: return [[LOOP_H]] : tensor<1x32x?x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-}
-
-// -----
-
-config.Resources 3 of @NCE at 1.700000e+03 MHz {
-    config.MemoryResource 1326182 bytes of @CMX_NN_FragmentationAware
-    config.MemoryResource 1473536 bytes of @CMX_NN {config.bandwidth = 64 : i64, config.derateFactor = 1.000000e+00 : f64}
-    config.ExecutorResource 2 of @SHAVE_ACT
-    config.ExecutorResource 1 of @DPU
-}
-
-#NHWC = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3, d1)>
-
-//CHECK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 35)>
-//CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (0, d0 - 1)>
-//CHECK: #[[$MAP3:.+]] = affine_map<(d0) -> (-d0 + 1, 0)>
-//CHECK: #[[$MAP4:.+]] = affine_map<()[s0] -> (1, s0)>
-//CHECK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (0, d0 + d1 - s0 + 2)>
-//CHECK: #[[$MAP6:.+]] = affine_map<(d0, d1, d2, d3)[s0] -> (0, d0 + d1 - d2 - d3 - s0 + 4)>
-//CHECK: #[[$MAP7:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (-d0 - d1 + d2 - d3 - d4 + 4)>
-
-
-// CHECK-LABEL: @Merge1DVFChainCompressConv
-// CHECK-SAME:  [[INPUT:%arg[0-9]]]: tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>)
-func.func @Merge1DVFChainCompressConv(%arg0: tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>) -> tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
- {
-    %cst = const.Declare tensor<32x4x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x4x3x3xf32>, [#const.CastElemType<f16>, #const.Reorder<#NHWC>]
-    %cst_3 = const.Declare tensor<32x1x1x4xsi32> = dense<1> : tensor<32x1x1x4xsi32>
-    %cst_0 = const.Declare tensor<32x16x1x1xf16, {order = #NHWC}> = dense<1.0> : tensor<1x32x1x1xf32>, [#const.Reshape<[32, 1, 1, 1]>, #const.CastElemType<f16>, #const.PadWithZero<[0, 0, 0, 0], [0, 15, 0, 0]>, #const.Reorder<#NHWC>]
-    %cst_1 = const.Declare tensor<32x32x3x3xf16, {order = #NHWC}> = dense<1.0> : tensor<32x32x3x3xf32>, [#const.CastElemType<f16>, #const.Reorder<#NHWC>]
-
-    %0 = VPU.NCE.CompressConvolution(%arg0, %cst, %cst_3) {mpe_engine = #VPU.MPEEngine37XX<mode = <SCL>>,
-                                           multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                           pad = #VPU.Padding<left = 1 : i64, right = 1 : i64, top = 1 : i64, bottom = 1 : i64>,
-                                           ppe = #VPU.PPEInt<mode = <LRELU>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                           lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                           rawFilterShape = [32, 4, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 21], cm_sp_pattern = 0}
-        : tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>,
-          tensor<32x4x3x3xf16, {order = #NHWC}>, tensor<32x1x1x4xsi32>
-        -> tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    %1 = VPU.NCE.DepthConvolution(%0, %cst_0) {multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                                       pad = #VPU.Padding<left = 0 : i64, right = 0 : i64, top = 0 : i64, bottom = 0 : i64>,
-                                                       ppe = #VPU.PPEInt<mode = <NOOP>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                                       lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                                       rawFilterShape = [32, 1, 1, 1], strides = [1, 1], tilingStrategy = [1, 1, 1, 20]}
-        -> tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    %2 = VPU.NCE.Convolution(%1, %cst_1) {mpe_engine = #VPU.MPEEngine37XX<mode = <SCL>>,
-                                          multiClusterStrategy = #VPU.multi_cluster_strategy<SplitOverHeight>,
-                                          pad = #VPU.Padding<left = 1 : i64, right = 1 : i64, top = 1 : i64, bottom = 1 : i64>,
-                                          ppe = #VPU.PPEInt<mode = <LRELU>, clamp_low = -2147483648 : i64, clamp_high = 2147483647 : i64,
-                                          lrelu_mult = 1 : i64, lrelu_shift = 0 : i64, fp_prelu_alpha = 1.000000e+00 : f64>,
-                                          rawFilterShape = [32, 32, 3, 3], strides = [1, 1], tilingStrategy = [1, 1, 1, 22]}
-        : tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>,
-          tensor<32x32x3x3xf16, {order = #NHWC}>
-          -> tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    return %2: tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK-DAG: [[DIM_INDEX_W:%.+]] = arith.constant 3 : index
-    //CHECK-DAG: [[PAD_VALUE:%.+]] = arith.constant 0.000000e+00 : f16
-    //CHECK-DAG: [[LOOP_STEP_W:%.+]] = arith.constant 35 : index
-    //CHECK-DAG: [[LOOP_BEGIN:%.+]] = arith.constant 0 : index
-
-    //CHECK: [[DIM_W:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[LOOP_OUT:%.+]] = tensor.empty([[DIM_W]]) : tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK: [[LOOP_END_W:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[LOOP_W:%.+]] = scf.for
-    //CHECK-SAME:             [[LOOP_ITER_W:%arg[0-9]]] = [[LOOP_BEGIN]] to [[LOOP_END_W]] step [[LOOP_STEP_W]]
-    //CHECK-SAME:             iter_args([[LOOP_OUT_W:%arg[0-9]]] = [[LOOP_OUT]]) -> (tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>) {
-
-    //CHECK:                  [[INSERT_SIZE_W:%.+]] = affine.min #[[$MAP1]]([[LOOP_ITER_W]])[[[LOOP_END_W]]]
-
-    //CHECK:                  [[DIM_W_1:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[TMP_VALUE5:%.+]] = affine.max #[[$MAP2]]([[LOOP_ITER_W]])
-    //CHECK:                  [[TMP_VALUE8:%.+]] = affine.max #[[$MAP3]]([[LOOP_ITER_W]])
-    //CHECK:                  [[PAD1_LOW_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE8]]]
-    //CHECK:                  [[TMP_VALUE4:%.+]] = affine.max #[[$MAP5]]([[INSERT_SIZE_W]], [[TMP_VALUE5]])[[[DIM_W_1]]]
-    //CHECK:                  [[PAD1_HIGH_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE4]]]
-
-    //CHECK:                  [[DIM_W_2:%.+]] = tensor.dim [[INPUT]], [[DIM_INDEX_W]] : tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[SLICE_OFFSET_W:%.+]] = affine.max #[[$MAP2]]([[TMP_VALUE5]])
-    //CHECK:                  [[TMP_VALUE1:%.+]] = affine.max #[[$MAP3]]([[TMP_VALUE5]])
-    //CHECK:                  [[PAD0_LOW_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE1]]]
-    //CHECK:                  [[TMP_VALUE0:%.+]] = affine.max #[[$MAP6]]([[SLICE_OFFSET_W]], [[INSERT_SIZE_W]], [[PAD1_LOW_W]], [[PAD1_HIGH_W]])[[[DIM_W_2]]]
-    //CHECK:                  [[PAD0_HIGH_W:%.+]] = affine.min #[[$MAP4]]()[[[TMP_VALUE0]]]
-    //CHECK:                  [[SLICE_SIZE_W:%.+]] = affine.apply #[[$MAP7]]([[PAD0_LOW_W]], [[PAD0_HIGH_W]], [[INSERT_SIZE_W]], [[PAD1_LOW_W]], [[PAD1_HIGH_W]])
-
-    //CHECK:                  [[SLICE:%.+]] = tensor.extract_slice [[INPUT]][0, 0, 0, [[SLICE_OFFSET_W]]] [1, 4, 540, [[SLICE_SIZE_W]]] [1, 1, 1, 1]
-    //CHECK-SAME:             tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 960]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 35]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  [[PAD0:%.+]] = tensor.pad [[SLICE]] low[0, 0, 1, [[PAD0_LOW_W]]] high[0, 0, 1, [[PAD0_HIGH_W]]] {
-    //CHECK:                  tensor.yield [[PAD_VALUE]] : f16
-    //CHECK:                  tensor<1x4x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 540, 35]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x4x542x?xf16, {bounds = #const.OpaqueI64Elements<[1, 4, 542, 37]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[CONV0:%.+]]  = VPU.NCE.CompressConvolution([[PAD0]]
-    //CHECK:                  [[DWCONV:%.+]] = VPU.NCE.DepthConvolution([[CONV0]]
-    //CHECK:                  [[PAD1:%.+]] = tensor.pad [[DWCONV]] low[0, 0, 1, [[PAD1_LOW_W]]] high[0, 0, 1, [[PAD1_HIGH_W]]] {
-    //CHECK:                  tensor.yield [[PAD_VALUE]] : f16
-    //CHECK:                  tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 35]> : tensor<4xsi64>, order = #NHWC}> to tensor<1x32x542x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 542, 37]> : tensor<4xsi64>, order = #NHWC}>
-    //CHECK:                  [[CONV1:%.+]] = VPU.NCE.Convolution([[PAD1]]
-    //CHECK:                  [[INSERT:%.+]] = tensor.insert_slice [[CONV1]] into [[LOOP_OUT_W]][0, 0, 0, [[LOOP_ITER_W]]] [1, 32, 540, [[INSERT_SIZE_W]]] [1, 1, 1, 1]
-    //CHECK-SAME:             tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 35]> : tensor<4xsi64>, order = #NHWC}> into tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-
-    //CHECK:                  scf.yield [[INSERT]]
-
-    //CHECK: return [[LOOP_W]] : tensor<1x32x540x?xf16, {bounds = #const.OpaqueI64Elements<[1, 32, 540, 960]> : tensor<4xsi64>, order = #NHWC}>
-}

From 27609b095bfdeb16a96918e9d3ffcf3b6335ca9c Mon Sep 17 00:00:00 2001
From: Maksim Doronin <maksim.doronin@intel.com>
Date: Fri, 27 Feb 2026 10:19:44 +0000
Subject: [PATCH 2/3] Generate build manifest for CiD builds (#259)

* Increase timeout for CodeQL job

* Generate build manifest

* Fix postcommit for release branch

* Specify CiD subdir
---
 .github/actions/versions/action.yml          | 87 ++++++++++++++++++++
 .github/workflows/clang-format.yml           |  2 +-
 .github/workflows/codeql.yml                 |  2 +-
 .github/workflows/job_build_cid.yml          | 38 ++++++++-
 .github/workflows/job_build_plugin_linux.yml |  2 +-
 .github/workflows/scorecard.yml              |  2 +-
 .github/workflows/ubuntu_22.yml              |  2 +-
 .github/workflows/ubuntu_24.yml              |  2 +-
 .github/workflows/windows_2022.yml           |  2 +-
 9 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/.github/actions/versions/action.yml b/.github/actions/versions/action.yml
index bdaa7463fd..3fe75ec7db 100644
--- a/.github/actions/versions/action.yml
+++ b/.github/actions/versions/action.yml
@@ -1,6 +1,20 @@
 name: Versions
 description: Parse git versions from config files
 
+inputs:
+  build-manifest-dir:
+    description: 'Directory where build_manifest.json will be created (optional)'
+    required: false
+    default: ''
+  openvino-repo-path:
+    description: 'Path to OpenVINO repository (for TBB version detection)'
+    required: false
+    default: ''
+  npu-compiler-repo-path:
+    description: 'Path to NPU Compiler repository (for LLVM submodule SHA detection)'
+    required: false
+    default: ''
+
 outputs:
   openvino-repository:
     description: 'OpenVINO Repository full name: fork/openvino'
@@ -38,6 +52,15 @@ outputs:
   opencv-sha:
     description: 'OpenCV commit SHA'
     value: ${{ steps.read-ocv-sha.outputs.ocv-sha }}
+  llvm-sha:
+    description: 'LLVM SHA detected in NPU Compiler submodule'
+    value: ${{ steps.get-llvm-sha.outputs.llvm-sha }}
+  tbb-version:
+    description: 'TBB version detected from OpenVINO'
+    value: ${{ steps.get-tbb-version.outputs.tbb-version }}
+  build-manifest-path:
+    description: 'Absolute path to build_manifest.json file'
+    value: ${{ steps.create-build-manifest.outputs.build-manifest-path }}
 
 runs:
   using: 'composite'
@@ -179,3 +202,67 @@ runs:
         echo "OpenCV commit sha = $OCV_SHA"
         echo "ocv-repository=$OCV_ORG/opencv" >> $GITHUB_OUTPUT
         echo "ocv-sha=$OCV_SHA" >> $GITHUB_OUTPUT
+
+    - name: Get LLVM submodule commit sha
+      id: get-llvm-sha
+      shell: bash
+      run: |
+        LLVM_SHA="N/A"
+
+        if [[ -n "${{ inputs.npu-compiler-repo-path }}" ]]; then
+          LLVM_SHA=$(git -C "${{ inputs.npu-compiler-repo-path }}" ls-tree HEAD thirdparty/llvm-project | awk '{print $3}')
+          [[ -z "$LLVM_SHA" ]] && LLVM_SHA="N/A"
+        fi
+
+        echo "LLVM commit sha = $LLVM_SHA"
+        echo "llvm-sha=$LLVM_SHA" >> $GITHUB_OUTPUT
+
+    - name: Get TBB version from OpenVINO
+      id: get-tbb-version
+      shell: bash
+      run: |
+        TBB_VERSION="N/A"
+
+        if [[ -n "${{ inputs.openvino-repo-path }}" ]]; then
+          # Best-effort lookup: this script runs under `bash -eo pipefail`, so `|| true` keeps a missing temp dir or an unmatched grep from failing the step.
+          TBB_VERSION_FILES=$(find "${{ inputs.openvino-repo-path }}/temp" -name "TBBConfigVersion.cmake" 2>/dev/null || true)
+          if [[ -n "$TBB_VERSION_FILES" ]]; then
+            TBB_VERSION=$(grep -h 'set(PACKAGE_VERSION' $TBB_VERSION_FILES | \
+              grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 || true)
+          fi
+          # Empty here means no version file was found or none contained an X.Y.Z version string.
+          [[ -z "$TBB_VERSION" ]] && TBB_VERSION="N/A"
+        fi
+
+        echo "TBB version = $TBB_VERSION"
+        echo "tbb-version=$TBB_VERSION" >> $GITHUB_OUTPUT
+
+    - name: Create build manifest
+      id: create-build-manifest
+      if: inputs.build-manifest-dir != ''
+      shell: bash
+      run: |
+        OUTPUT_DIR="${{ inputs.build-manifest-dir }}"
+
+        mkdir -p "$OUTPUT_DIR"
+
+        jq -n \
+          --arg openvino_sha "${{ steps.read-openvino-sha.outputs.openvino-sha }}" \
+          --arg npu_compiler_sha "${{ steps.get-npu-sha.outputs.npu-compiler-sha }}" \
+          --arg llvm_sha "${{ steps.get-llvm-sha.outputs.llvm-sha }}" \
+          --arg tbb_version "${{ steps.get-tbb-version.outputs.tbb-version }}" \
+          --arg opencv_sha "${{ steps.read-ocv-sha.outputs.ocv-sha }}" \
+          --arg omz_sha "${{ steps.read-omz-sha.outputs.omz-sha }}" \
+          '{
+            openvino_sha: $openvino_sha,
+            npu_compiler_sha: $npu_compiler_sha,
+            llvm_sha: $llvm_sha,
+            tbb_version: $tbb_version,
+            opencv_sha: $opencv_sha,
+            omz_sha: $omz_sha            
+          }' > "$OUTPUT_DIR/build_manifest.json"
+        
+        MANIFEST_PATH=$(cd "$OUTPUT_DIR" && pwd)/build_manifest.json
+        echo "Build manifest created at: $MANIFEST_PATH"
+        echo "build-manifest-path=$MANIFEST_PATH" >> $GITHUB_OUTPUT
+        cat "$MANIFEST_PATH"
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index f14375691b..d920a03cef 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -5,7 +5,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
 
 concurrency:
   group: clang-format-${{ github.event_name }}-${{ github.ref_name }}
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 78a9517de0..a2c1722529 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -5,7 +5,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
   schedule:
     - cron: '24 8 * * 6'
 
diff --git a/.github/workflows/job_build_cid.yml b/.github/workflows/job_build_cid.yml
index 448be4ac9e..9ebd5c4f0e 100644
--- a/.github/workflows/job_build_cid.yml
+++ b/.github/workflows/job_build_cid.yml
@@ -287,6 +287,15 @@ jobs:
             -S ${OPENVINO_REPO} \
             -B ${OPENVINO_BUILD_DIR}
 
+      - name: Get versions with build manifest
+        uses: ./npu_actions/.github/actions/versions
+        id: versions-with-manifest
+        if: ${{ !steps.cache-restore.outputs.cache-hit }}
+        with:
+          build-manifest-dir: ${{ env.CID_PACKAGE_ARTIFACTS_DIR }}
+          openvino-repo-path: ${{ env.OPENVINO_REPO }}
+          npu-compiler-repo-path: ${{ env.NPU_COMPILER_REPO }}
+
       - name: CMake build - CiD targets
         if: ${{ !steps.cache-restore.outputs.cache-hit }}
         run: |
@@ -299,13 +308,16 @@ jobs:
       - name: CMake cpack - CiD target
         if: ${{ !steps.cache-restore.outputs.cache-hit }}
         run: |
+          COMPONENT="CiD"
+          GENERATOR="Ninja"
+
           cpack -V \
             --config "${OPENVINO_BUILD_DIR}/CPackConfig.cmake" \
             -C "${CMAKE_BUILD_TYPE}" \
             -G "${{ steps.package-params.outputs.cpack-generator }}" \
             -B "${CID_PACKAGE_ARTIFACTS_DIR}" \
-            -D CPACK_COMPONENTS_ALL=CiD \
-            -D CPACK_CMAKE_GENERATOR=Ninja \
+            -D CPACK_COMPONENTS_ALL=${COMPONENT} \
+            -D CPACK_CMAKE_GENERATOR=${GENERATOR} \
             -D CPACK_PACKAGE_FILE_NAME="${{ steps.package-name.outputs.cid-package-base-name }}"
 
       - name: CiD Package renaming
@@ -313,6 +325,28 @@ jobs:
           mv "${CID_PACKAGE_ARTIFACTS_DIR}/"*.${{ steps.package-params.outputs.package-extension }} \
             "${CID_PACKAGE_ARTIFACTS_DIR}/${{ steps.package-name.outputs.cid-package-full-name }}"
 
+      - name: Inject build manifest into CiD package
+        if: ${{ !steps.cache-restore.outputs.cache-hit }}
+        run: |
+          PACKAGE_PATH="${CID_PACKAGE_ARTIFACTS_DIR}/${{ steps.package-name.outputs.cid-package-full-name }}"
+          EXT="${{ steps.package-params.outputs.package-extension }}"
+          MANIFEST_PATH="${{ steps.versions-with-manifest.outputs.build-manifest-path }}"
+          MANIFEST_FILENAME=$(basename "${MANIFEST_PATH}")
+          MANIFEST_DIR=$(dirname "${MANIFEST_PATH}")
+          # Quote every path handed to realpath so a directory name with spaces is not word-split.
+          PACKAGE_ABS_PATH="$(realpath "${PACKAGE_PATH}")"
+
+          if [[ "${EXT}" == "zip" ]]; then
+            echo "Updating ZIP archive using 7-Zip..."
+            (cd "${MANIFEST_DIR}" && 7z u "${PACKAGE_ABS_PATH}" "${MANIFEST_FILENAME}")
+          elif [[ "${EXT}" == "tar.gz" ]]; then
+            echo "Updating TAR.GZ archive using tar..."
+            gunzip -S .gz "${PACKAGE_ABS_PATH}"
+            PACKAGE_ABS_PATH="$(realpath "${PACKAGE_PATH%.gz}")"
+            (cd "${MANIFEST_DIR}" && tar -rf "${PACKAGE_ABS_PATH}" "${MANIFEST_FILENAME}")
+            gzip "${PACKAGE_ABS_PATH}"
+          fi
+
       - name: Cache CiD artifacts
         if: ${{ inputs.build-cache && !steps.cache-restore.outputs.cache-hit }}
         uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
diff --git a/.github/workflows/job_build_plugin_linux.yml b/.github/workflows/job_build_plugin_linux.yml
index e67bf00192..392f3675b1 100644
--- a/.github/workflows/job_build_plugin_linux.yml
+++ b/.github/workflows/job_build_plugin_linux.yml
@@ -56,7 +56,7 @@ jobs:
   Build:
     name: Build
     runs-on: ${{ inputs.build-runner }}
-    timeout-minutes: 240
+    timeout-minutes: 360
     permissions:
       actions: read
       contents: read
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 9de0eab66b..330b2ad4a8 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -8,7 +8,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
 
 permissions: read-all
 
diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml
index 89b4551a9c..ed40a06b4c 100644
--- a/.github/workflows/ubuntu_22.yml
+++ b/.github/workflows/ubuntu_22.yml
@@ -5,7 +5,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
 
 concurrency:
   group: linux-${{ github.event_name }}-${{ github.ref_name }}-ubuntu_22_04
diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml
index d195412512..1cfb838ceb 100644
--- a/.github/workflows/ubuntu_24.yml
+++ b/.github/workflows/ubuntu_24.yml
@@ -5,7 +5,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
 
 concurrency:
   group: linux-${{ github.event_name }}-${{ github.ref_name }}-ubuntu_24_04
diff --git a/.github/workflows/windows_2022.yml b/.github/workflows/windows_2022.yml
index 4f0b76cfaa..0c5bca9af4 100644
--- a/.github/workflows/windows_2022.yml
+++ b/.github/workflows/windows_2022.yml
@@ -5,7 +5,7 @@ on:
   push:
     branches:
       - develop
-      - 'releases/*'
+      - releases/**
 
 concurrency:
   group: windows-${{ github.event_name }}-${{ github.ref_name }}-2022

From b2d63e7098d0aa305e54e481ca1350938a121b2e Mon Sep 17 00:00:00 2001
From: "Doronin, Maksim" <maksim.doronin@intel.com>
Date: Fri, 27 Feb 2026 11:05:31 +0000
Subject: [PATCH 3/3] Expose restoreWeightsOffsets

---
 src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp b/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp
index 2ae22753ec..bcc1b1a500 100644
--- a/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp
+++ b/src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp
@@ -634,10 +634,7 @@ vcl_result_t BuildInfo::prepareModel(const uint8_t* modelIR, uint64_t modelIRSiz
             throw std::invalid_argument(error_message.str());
         }
 
-#ifdef VPUX_DEVELOPER_BUILD
-        // E#103359: WS is only available in developer builds
         restoreWeightsOffsets(model, logger);
-#endif  // VPUX_DEVELOPER_BUILD
 
         if (enableProfiling) {
             stopWatch.stop();